Instructor, I'm already using Selenium to simulate the login and I've added headers, but I still get a 301 error. How do I fix this?
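For context: a 301 is a redirect response, not a login failure as such. Scrapy's RedirectMiddleware normally follows it silently, so seeing it surface as an error usually means the redirect is being blocked or filtered (redirects disabled, a redirect loop, or the target filtered by allowed_domains or robots.txt). A minimal sketch for logging where the server is actually redirecting to, assuming default middlewares; RedirectDebugSpider is a made-up helper, not part of the course project:

import scrapy

class RedirectDebugSpider(scrapy.Spider):
    # hypothetical one-off spider: surface 301/302 responses instead of
    # letting RedirectMiddleware follow them behind the scenes
    name = 'redirect_debug'
    start_urls = ['https://www.lagou.com/']
    handle_httpstatus_list = [301, 302]  # deliver these statuses to parse()

    def parse(self, response):
        if response.status in (301, 302):
            # the Location header shows where the server wants to send us
            self.logger.info('%s -> %s (status %s)', response.url,
                             response.headers.get('Location'), response.status)
        else:
            self.logger.info('fetched %s with status %s', response.url, response.status)

If Location points at a login or verification page, the cookies are not being accepted; if it only adds a trailing slash or switches to https, fixing start_urls is enough. The full spider as posted: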
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pickle
import os


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']  # links outside this domain are ignored
    start_urls = ['https://www.lagou.com/']
    # headers sent with every request; HOST and Referer must match lagou.com,
    # the site actually being crawled
    headers = {
        "HOST": "www.lagou.com",
        "Referer": "https://www.lagou.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
        Rule(LinkExtractor(allow=(r"gongsi/j\d+\.html",)), follow=True),
        Rule(LinkExtractor(allow=r"jobs/\d+\.html"), callback='parse_job', follow=True),
    )
    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results
    def parse_job(self, response):
        # parse a Lagou job posting
        i = {}
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
    def start_requests(self):
        # first run: log in through Selenium and capture the cookies
        from selenium import webdriver
        browser = webdriver.Chrome(executable_path=r"F:\chromedriver_win32\chromedriver.exe")
        browser.get("https://passport.lagou.com/login/login.html")
        # fill in account and password, then click the login button
        browser.find_element_by_xpath(
            "//form[@class='active']/div[@class='input_item clearfix'][1]/input").send_keys("13736821938")
        browser.find_element_by_xpath(
            "//form[@class='active']/div[@class='input_item clearfix'][2]/input").send_keys("gehongYI88")
        browser.find_element_by_xpath(
            "//form[@class='active']/div[@class='input_item btn_group clearfix']/input").click()
        import time
        time.sleep(10)  # give the login time to complete
        Cookies = browser.get_cookies()
        print(Cookies)
        cookie_dict = {}
        for cookie in Cookies:
            # save each cookie to its own file
            f = open('F:/Users/hongyi/PycharmProjects/ArticleSpider/cookies/lagou' + cookie['name'] + '.lagou', 'wb')
            pickle.dump(cookie, f)
            f.close()
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()
        # # later runs: load the saved cookies straight from disk instead
        # cookie_dict = {}
        # root_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'cookies')  # path of the cookies directory
        # for root, dirs, files in os.walk(root_dir):  # walk every file in the cookies directory
        #     for file in files:
        #         with open(os.path.join(root_dir, file), 'rb') as f:
        #             filecontent = pickle.load(f)  # unpickle the saved cookie
        #             cookie_dict[file.split('.')[0].split('lagou')[1]] = filecontent['value']  # cookie name -> cookie value
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.headers)]
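If the 301 persists after the login, it is worth confirming that the captured cookies actually survive the redirect. A sketch of spider-level settings useful for that check, to be placed in the LagouSpider class body; all four keys are standard Scrapy settings, and the values here are debugging assumptions rather than a definitive fix:

    custom_settings = {
        'COOKIES_ENABLED': True,   # let CookiesMiddleware carry the login cookies across requests
        'COOKIES_DEBUG': True,     # log every Cookie / Set-Cookie header exchanged
        'REDIRECT_ENABLED': True,  # follow 301/302 instead of surfacing them as errors
        'ROBOTSTXT_OBEY': False,   # robots.txt rules can silently filter requests
    }

With COOKIES_DEBUG on, the crawl log shows whether the cookies collected by Selenium are attached to the redirected request; if they are missing there, the redirect target rather than the login is the problem.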