import requests

url = "http://www.xiaomishu.com/app/download/"
headers = {
    # 'host': "akesudiqu.xiaomishu.com",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

# Fetch the same URL in a loop to check whether the server ever blocks it
while True:
    response = requests.request("GET", url, headers=headers)
    print(response.text)
Hello teacher, the requests code above runs forever without getting blocked. The URL it fetches is one of the links taken from the Scrapy spider's requests.
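To pin down the difference between the two clients, it helps to first see the headers requests actually puts on the wire, defaults included. This is a minimal debugging sketch, not part of the original question; resp.request is the PreparedRequest that was sent:

import requests

url = "http://www.xiaomishu.com/app/download/"
resp = requests.get(url, headers={
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
})
print(resp.status_code)
# resp.request.headers shows what was actually sent, including the defaults
# requests adds on top of ours (Accept, Accept-Encoding, Connection)
print(resp.request.headers)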
But the Scrapy spider file below returns a 405 as soon as it runs:
# -*- coding: utf-8 -*-
import re

import scrapy
from pybloom_live import BloomFilter
from scrapy.http import HtmlResponse, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Bloom filter used to deduplicate shop URLs across pages
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)


class XiaomishuSpider(CrawlSpider):
    name = 'xiaomishu'
    start_url = 'http://www.xiaomishu.com/citylist/'
    pattern = re.compile(r'xiaomishu\.com/shop/\w{12}/', re.DOTALL)

    rules = (
        Rule(LinkExtractor(allow=r'xiaomishu', deny=(r'links\.aspx',)),
             callback='parse_item', follow=True),
    )

    headers = {
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }

    def start_requests(self):
        yield scrapy.Request(self.start_url, headers=self.headers)

    def _requests_to_follow(self, response):
        # Copy of CrawlSpider._requests_to_follow, changed only to attach
        # self.headers to every follow-up request
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response)
                     if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded,
                            headers=self.headers)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def parse_item(self, response):
        result = self.pattern.findall(response.text)
        if len(result) >= 1:
            print(result)
            for res in result:
                try:
                    if res not in download_bf:
                        download_bf.add(res)
                        print(res)
                    else:
                        print('already seen')
                except ValueError:
                    pass
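An alternative worth trying instead of copying _requests_to_follow: set the headers once in settings.py, so every request the spider schedules carries them. USER_AGENT and DEFAULT_REQUEST_HEADERS are standard Scrapy settings; this sketch just reuses the values from the spider above, and whether it resolves the 405 depends on what the server is actually checking:

# settings.py
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
DEFAULT_REQUEST_HEADERS = {
    'upgrade-insecure-requests': '1',
}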
With the Scrapy code above, every link that comes back 405 can be pasted into the requests script and fetched in a loop without trouble. Why does the same URL return 405 inside Scrapy?
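Since the two clients send different default headers (Scrapy adds its own Accept and Accept-Language, while requests sends Accept: */*), one way to narrow this down is to let the 405 responses reach a callback and log exactly what Scrapy sent, then diff that against the requests output above. handle_httpstatus_list and response.request.headers are standard Scrapy; the spider name here is made up for this debugging sketch:

import scrapy

class Debug405Spider(scrapy.Spider):
    name = 'xiaomishu_405_debug'
    # Let 405 responses through to the callback instead of being dropped
    # by the HttpError middleware
    handle_httpstatus_list = [405]
    start_urls = ['http://www.xiaomishu.com/app/download/']

    def parse(self, response):
        self.logger.info('status: %s', response.status)
        # Headers Scrapy actually sent for this request
        self.logger.info('sent headers: %r', response.request.headers)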