import requests

url = "http://www.xiaomishu.com/app/download/"
headers = {
    # 'host': "akesudiqu.xiaomishu.com",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}

while True:
    response = requests.request("GET", url, headers=headers)
    print(response.text)
Hi teacher, with the requests code above I can keep fetching continuously without getting blocked; the URL it requests is a link taken from the Scrapy crawl.
But with the Scrapy spider file below, I get a 405 as soon as it runs:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from pybloom_live import BloomFilter
from scrapy.http import Request, FormRequest, HtmlResponse

# Bloom filter used to deduplicate extracted shop links
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)


class XiaomishuSpider(CrawlSpider):
    name = 'xiaomishu'
    start_url = 'http://www.xiaomishu.com/citylist/'
    pattern = re.compile(r'xiaomishu\.com/shop/\w{12}/', re.DOTALL)

    rules = (
        Rule(LinkExtractor(allow=r'xiaomishu', deny=(r'links.aspx',)),
             callback='parse_item', follow=True),
    )

    headers = {
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }

    def start_requests(self):
        yield scrapy.Request(self.start_url, headers=self.headers)

    def _requests_to_follow(self, response):
        # Override of CrawlSpider._requests_to_follow so that the requests
        # generated from the rules also carry the custom headers.
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response)
                     if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded,
                            headers=self.headers)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def parse_item(self, response):
        result = self.pattern.findall(response.text)
        if len(result) >= 1:
            print(result)
            for res in result:
                try:
                    if res not in download_bf:
                        download_bf.add(res)
                        print(res)
                    else:
                        print('数据已存在')  # "data already exists"
                except ValueError:
                    pass
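As a side note, the same headers can also be applied through Scrapy settings instead of overriding _requests_to_follow, since DEFAULT_REQUEST_HEADERS and USER_AGENT are picked up by every request the CrawlSpider rules generate. A minimal sketch, not the original spider (the spider name and the parse_item body here are placeholders):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XiaomishuHeadersSpider(CrawlSpider):
    # Hypothetical name so it does not clash with the spider above
    name = 'xiaomishu_headers'
    start_urls = ['http://www.xiaomishu.com/citylist/']

    # Per-spider settings: applied to all requests, including the ones
    # created automatically from the rules below.
    custom_settings = {
        'USER_AGENT': ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"),
        'DEFAULT_REQUEST_HEADERS': {
            'upgrade-insecure-requests': '1',
        },
    }

    rules = (
        Rule(LinkExtractor(allow=r'xiaomishu', deny=(r'links.aspx',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # The shop-link extraction logic from the original spider would go here.
        self.logger.info('Fetched %s (status %s)', response.url, response.status)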
With the Scrapy code above, every link that comes back as 405 can be dropped into that requests snippet and fetched over and over without any problem. Why does Scrapy get a 405?
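Since the two clients behave differently on the same URLs, it may help to log exactly what Scrapy sends and compare it with the requests snippet. A minimal diagnostic sketch, assuming a hypothetical project module path myproject.middlewares:

import logging

logger = logging.getLogger(__name__)


class LogOutgoingRequestsMiddleware:
    """Downloader middleware that logs the method, URL and headers of every outgoing request."""

    def process_request(self, request, spider):
        logger.info("%s %s headers=%s",
                    request.method, request.url, dict(request.headers))
        # Returning None lets Scrapy continue downloading the request normally.
        return None

Enable it in settings.py (the dotted path is an assumption for this sketch):

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.LogOutgoingRequestsMiddleware': 543,
}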