# Middleware: renders requests tagged with meta['Chrome'] through a real
# Chrome browser (Selenium), so JavaScript-driven pages like zhihu load fully.
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class ChromeMiddlewares(object):
    """Downloader middleware that logs in to zhihu with Selenium Chrome and
    returns the fully rendered page as an HtmlResponse.

    Only requests with ``request.meta['Chrome']`` set are intercepted; all
    other requests fall through to Scrapy's default downloader.
    """

    # fixed: first parameter must be `self` (instance method), not `cls`
    def process_request(self, request, spider):
        # fixed: dict.has_key() was removed in Python 3 — use the `in` operator
        if 'Chrome' not in request.meta:
            return None  # let the default downloader handle this request

        driver = webdriver.Chrome()
        try:
            driver.get(request.url)
            # NOTE(review): credentials are hard-coded — move them to
            # settings.py or environment variables before production use.
            driver.find_element_by_name('account').send_keys('13808847170')
            driver.find_element_by_name('password').send_keys('aa886688')
            time.sleep(7)  # wait for any captcha/async validation to settle
            driver.find_element_by_xpath(
                "/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button"
            ).click()
            time.sleep(7)  # wait for the post-login page to load
            # Scroll three times to trigger lazy loading of more feed items.
            for _ in range(3):
                driver.execute_script('window.scrollBy(0,3000)')
                time.sleep(5)
            content = driver.page_source
        finally:
            # fixed: quit even if Selenium raises, so the Chrome process
            # is never leaked
            driver.quit()

        # Debug dump of the rendered page.
        with open('zhihu.html', 'wb') as fp:
            fp.write(content.encode('utf-8'))

        # Returning an HtmlResponse short-circuits the download: Scrapy
        # passes it straight to the spider callback.
        return HtmlResponse(request.url, body=content, encoding='utf-8',
                            request=request)


# Spider
import scrapy


class TestmoniSpider(scrapy.Spider):
    name = "testmoni"
    # fixed: allowed_domains takes bare domains, not URLs — a URL here makes
    # OffsiteMiddleware filter every request
    allowed_domains = ["www.zhihu.com"]
    start_urls = []

    def start_requests(self):
        url = 'https://www.zhihu.com/#signin'
        rq = scrapy.Request(url=url, callback=self.parse)
        # Tag the request so ChromeMiddlewares renders it with Selenium.
        rq.meta['Chrome'] = True
        yield rq

    def parse(self, response):
        # fixed: print is a function in Python 3
        print(response.text)


# settings.py
# DOWNLOADER_MIDDLEWARES = {'testselenium.middlewares.ChromeMiddlewares': 1}
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程