import requests

url = "http://www.xiaomishu.com/app/download/"
headers = {
    # 'host': "akesudiqu.xiaomishu.com",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

while True:
    response = requests.request("GET", url, headers=headers)
    print(response.text)

Hello, teacher. The requests code above runs continuously without ever getting blocked, and the URL it requests is a link taken from the Scrapy spider's crawl.
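For later comparison with the spider, it can help to print the headers this working call actually transmits; requests exposes them on the prepared request. A minimal sketch:

import requests

url = "http://www.xiaomishu.com/app/download/"
headers = {
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

response = requests.get(url, headers=headers)
# response.request is the PreparedRequest that actually went over the wire;
# its headers include everything requests added on its own
# (e.g. Accept, Accept-Encoding, Connection).
print(response.status_code)
print(dict(response.request.headers))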
However, the Scrapy spider file below reports a 405 as soon as it runs:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from pybloom_live import BloomFilter
from scrapy.http import Request, FormRequest, HtmlResponse

# Bloom filter used to deduplicate the extracted shop URLs
download_bf = BloomFilter(1024*1024*16, 0.01)


class XiaomishuSpider(CrawlSpider):
    name = 'xiaomishu'
    start_url = 'http://www.xiaomishu.com/citylist/'
    pattern = re.compile(r'xiaomishu\.com/shop/\w{12}/', re.DOTALL)

    rules = (
        Rule(LinkExtractor(allow=r'xiaomishu', deny=(r'links.aspx', )),
             callback='parse_item', follow=True),
    )

    headers = {
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }

    def start_requests(self):
        yield scrapy.Request(self.start_url, headers=self.headers)

    # Copy of CrawlSpider._requests_to_follow, changed only so that the
    # requests generated by the rules also carry the custom headers.
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded, headers=self.headers)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def parse_item(self, response):
        result = self.pattern.findall(response.text)
        if len(result) >= 1:
            print(result)
            for res in result:
                try:
                    if res not in download_bf:
                        download_bf.add(res)
                        print(res)
                    else:
                        print('already seen')
                except ValueError:
                    pass

The links that come back with 405 from the Scrapy code above all work when fed into that requests script, which keeps running without issue. I can't figure out why Scrapy gets a 405.
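Since the very same links succeed through requests, the difference most likely lies in what Scrapy adds to each request on its own (its default User-Agent, the extra robots.txt fetch, cookie handling) rather than in the URLs themselves; this is a guess, not a confirmed diagnosis. A minimal sketch of project settings that make every Scrapy request carry the same headers as the requests script, plus a spider attribute that lets 405 responses reach parse_item so their request headers can be printed and compared:

# settings.py (or custom_settings on the spider) -- a sketch, not a confirmed fix
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
DEFAULT_REQUEST_HEADERS = {
    'upgrade-insecure-requests': '1',
}
ROBOTSTXT_OBEY = False    # skip the extra robots.txt request Scrapy sends by default
COOKIES_ENABLED = False   # the requests script does not keep cookies either

# Added inside XiaomishuSpider: deliver 405 responses to the callback
# instead of dropping them, so they can be inspected.
handle_httpstatus_list = [405]

With that in place, printing response.status and response.request.headers inside parse_item shows exactly which headers Scrapy sends differently from the working requests call.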