class LagouSpider(scrapy.Spider):
    """Spider that POSTs to Lagou's ``positionAjax.json`` endpoint.

    NOTE(review): never set ``Content-Length`` by hand. Scrapy forwards
    request headers verbatim, so a stale hand-written value (e.g. "25")
    stops matching the actual encoded POST body and the server responds
    with a 302 redirect. Omitting the header lets the HTTP client compute
    the correct value itself.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = []

    # Search keyword for the "kd" form field. Kept as a class attribute so
    # it can be overridden (subclass or ``-a`` spider argument) instead of
    # being hard-coded inside start_requests(); default preserves the
    # original behavior.
    keyword = 'python'

    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        # "Content-Length" intentionally omitted -- see class docstring.
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        # "Anit" is the site's own (misspelled) header name -- do not "fix" it.
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest",
    }

    def start_requests(self):
        """Yield the initial POST for page 1 of the job list.

        The JSON response is handled by ``self.parse`` (defined elsewhere
        in the project).
        """
        api = ('https://www.lagou.com/jobs/positionAjax.json'
               '?px=new&city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false')
        post_data = {
            "first": "true",     # first request of this search session
            "pn": "1",           # page number
            "kd": self.keyword,  # search keyword (was hard-coded "python")
        }
        yield FormRequest(url=api, formdata=post_data,
                          headers=self.headers, callback=self.parse)
老师,这里的 header 中不带 Content-Length 时可以正常爬取,状态码为 200;但是带上之后就变成 302 了。
# Same request issued with the ``requests`` library instead of Scrapy.
#
# Why requests succeeds even when Content-Length is hand-written while
# Scrapy gets a 302: requests recomputes Content-Length from the actual
# encoded body while preparing the request, silently overwriting any
# user-supplied value, so the header on the wire is always correct.
# Scrapy sends your headers verbatim, so a wrong "25" mismatches the
# body and the server redirects. Best practice: never set it manually.
api = ('https://www.lagou.com/jobs/positionAjax.json'
       '?px=new&city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false')
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    # "Content-Length" removed on purpose: the client computes it.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Host": "www.lagou.com",
    "Origin": "https://www.lagou.com",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    # "Anit" is the site's own (misspelled) header name -- do not "fix" it.
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest",
}
# Fixed: the original values carried stray leading spaces (" true",
# " 1", " python"), which changes the form data actually submitted.
data = {"first": "true", "pn": "1", "kd": "python"}
r = requests.post(api, headers=headers, data=data)
print(r.status_code, r.history, r.encoding, r.text, sep='\n')
而使用 requests 模块来爬的话,不管 header 中是否带 Content-Length 都可以正常爬取,状态码都为 200。
这是为什么呢?百思不得其解。
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程