The complete code is as follows.

# Middleware: drives Chrome to log in, scroll, and hand the rendered page back to Scrapy
from selenium import webdriver
from scrapy.http import HtmlResponse
import time

class ChromeMiddlewares(object):
    def process_request(self, request, spider):
        # Only intercept requests the spider has flagged for Selenium
        if 'Chrome' in request.meta:
            driver = webdriver.Chrome()
            driver.get(request.url)
            # Fill in the login form
            driver.find_element_by_name('account').send_keys('13808847170')
            driver.find_element_by_name('password').send_keys('aa886688')
            time.sleep(7)
            driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
            time.sleep(7)
            # Scroll down three times so lazily loaded content gets rendered
            for i in range(3):
                driver.execute_script('window.scrollBy(0,3000)')
                time.sleep(5)
            content = driver.page_source
            driver.quit()
            # Save a copy of the rendered page for debugging
            with open('zhihu.html', 'wb') as fp:
                fp.write(content.encode('utf-8'))
            # Returning an HtmlResponse here short-circuits Scrapy's own downloader
            return HtmlResponse(request.url, body=content, encoding='utf-8', request=request)

# Spider file
# -*- coding: utf-8 -*-
import scrapy

class TestmoniSpider(scrapy.Spider):
    name = 'testmoni'
    allowed_domains = ['www.zhihu.com']   # domains only, not full URLs
    start_urls = []

    def start_requests(self):
        url = 'https://www.zhihu.com/#signin'
        rq = scrapy.Request(url=url, callback=self.parse)
        rq.meta['Chrome'] = True   # the flag picked up by ChromeMiddlewares
        yield rq

    def parse(self, response):
        # Parsing logic goes here
        print(response.text)

# Settings file
DOWNLOADER_MIDDLEWARES = {
    'testselenium.middlewares.ChromeMiddlewares': 1,
}
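The fixed time.sleep() pauses in the middleware above work, but they are fragile: the page may load faster or slower than the hard-coded delays. A minimal sketch of the same login step using Selenium's explicit waits instead; the element names are taken from the code above, while the 15-second timeout is an arbitrary assumption:

# Sketch: replace fixed sleeps with explicit waits (timeout value is an assumption)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/#signin')
wait = WebDriverWait(driver, 15)

# Wait until the login form is actually present before typing into it
wait.until(EC.presence_of_element_located((By.NAME, 'account'))).send_keys('13808847170')
driver.find_element_by_name('password').send_keys('aa886688')

# Wait until the submit button is clickable rather than sleeping a fixed time
wait.until(EC.element_to_be_clickable(
    (By.XPATH, '/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button'))).click()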
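The parse() method in the spider is only a stub. A hypothetical example of what the parsing might look like; the CSS selector below is purely illustrative and must be replaced with one matching Zhihu's real markup:

def parse(self, response):
    # Hypothetical selector for feed item titles; adjust to the actual page structure
    for title in response.css('h2.ContentItem-title a::text').extract():
        yield {'title': title}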