The complete code is as follows:
#中间件
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
class ChromeMiddlewares(object):
    """Downloader middleware: for requests flagged with meta['Chrome'],
    drive a real Chrome browser through the Zhihu login form and return
    the rendered page instead of letting Scrapy download it.
    """

    def process_request(self, request, spider):
        """Return an HtmlResponse built from Selenium-rendered HTML.

        Only handles requests whose meta dict contains the 'Chrome' key;
        all other requests fall through (return None) to the default
        downloader.
        """
        # dict.has_key() was removed in Python 3; use the `in` operator.
        if 'Chrome' not in request.meta:
            return None
        driver = webdriver.Chrome()
        try:
            driver.get(request.url)
            # Fill in the login form. Selenium 4 removed the
            # find_element_by_* helpers; use By locators instead.
            driver.find_element(By.NAME, 'account').send_keys('13808847170')
            driver.find_element(By.NAME, 'password').send_keys('aa886688')
            time.sleep(7)  # crude wait for the page to settle
            driver.find_element(
                By.XPATH,
                "/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button",
            ).click()
            time.sleep(7)  # wait for the post-login redirect
            # Scroll a few times so lazily-loaded content gets rendered.
            for _ in range(3):
                driver.execute_script('window.scrollBy(0,3000)')
                time.sleep(5)
            content = driver.page_source
        finally:
            # Always release the browser, even if any step above raises
            # (the original leaked the Chrome process on failure).
            driver.quit()
        # Debug dump of the rendered page.
        with open('zhihu.html', 'wb') as fp:
            fp.write(content.encode('utf-8'))
        return HtmlResponse(request.url, body=content,
                            encoding='utf-8', request=request)
#spider文件
# -*- coding: utf-8 -*-
import scrapy
class TestmoniSpider(scrapy.Spider):
    """Spider that requests the Zhihu sign-in page, flagging the request
    so the Selenium middleware renders it with a real browser.
    """

    name = "testmoni"
    # allowed_domains takes bare domain names, not full URLs; a URL here
    # makes the offsite filter drop every request.
    allowed_domains = ["zhihu.com"]
    start_urls = []

    def start_requests(self):
        """Yield the sign-in page request, marked for Selenium handling."""
        url = 'https://www.zhihu.com/#signin'
        request = scrapy.Request(url=url, callback=self.parse)
        # Tell the Chrome middleware to take over this request.
        request.meta['Chrome'] = True
        yield request

    def parse(self, response):
        """Parse the rendered page (currently just prints the HTML)."""
        # print is a function in Python 3; the original used the Python 2
        # print statement, which is a SyntaxError on Python 3.
        print(response.text)
# settings.py
DOWNLOADER_MIDDLEWARES = {'testselenium.middlewares.ChromeMiddlewares': 1}