class LagouSpider(scrapy.Spider):
    """Spider that posts one search query to Lagou's position AJAX endpoint.

    The crawl starts from a single form-encoded POST built in
    ``start_requests``; the response is routed to ``self.parse``.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    # Left empty: the first request is built by start_requests() instead.
    start_urls = []

    # Headers attached to the AJAX POST.
    # "Content-Length" is deliberately not listed.
    # NOTE(review): a hand-written "Content-Length" was reported to turn the
    # 200 response into a 302 with this spider; presumably the HTTP client
    # already derives the length from the body -- confirm before re-adding.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/67.0.3396.87 Safari/537.36"
        ),
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest",
    }

    def start_requests(self):
        """Yield the single POST request that starts the crawl."""
        # The city parameter is the percent-encoded form of "深圳" (Shenzhen).
        endpoint = (
            'https://www.lagou.com/jobs/positionAjax.json'
            '?px=new&city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
        )
        form_fields = {"first": "true", "pn": "1", "kd": "python"}
        first_page = FormRequest(
            url=endpoint,
            formdata=form_fields,
            headers=self.headers,
            callback=self.parse,
        )
        yield first_page
老师，这里的 headers 中不带 Content-Length 时可以正常爬取，状态码为 200；但是带上之后，状态码就变成 302 了。
# Stand-alone check of the same AJAX endpoint using the requests library.
# The city parameter is the percent-encoded form of "深圳" (Shenzhen).
api = (
    'https://www.lagou.com/jobs/positionAjax.json'
    '?px=new&city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
)

# "Content-Length" is intentionally not set by hand: requests derives it from
# the encoded form body, so a hard-coded value is at best redundant and at
# worst wrong (the previous "25" did not match the space-padded body).
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Host": "www.lagou.com",
    "Origin": "https://www.lagou.com",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.87 Safari/537.36"
    ),
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest",
}

# Form fields without the stray leading spaces, so the encoded body is
# "first=true&pn=1&kd=python" (25 bytes) rather than "first=+true&...".
data = {
    "first": "true",
    "pn": "1",
    "kd": "python",
}

# A timeout keeps the script from hanging forever if the server never answers.
r = requests.post(api, headers=headers, data=data, timeout=10)
print(r.status_code, r.history, r.encoding, r.text, sep='\n')
而使用 requests 模块来爬取的话，不管 headers 中是否带有 Content-Length，都可以正常爬取，状态码都是 200。
这是为什么呢？百思不得其解……
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程