class SeleniumMiddleware(object):
    """Scrapy downloader middleware that renders pages with Selenium.

    For the 'gome' spider it loads the requested URL in the spider's
    browser; for requests tagged ``gomeType == 2`` it clicks through the
    paginator until the last page, then hands the rendered page source
    back to Scrapy as an ``HtmlResponse``.
    """

    def process_request(self, request, spider):
        """Fetch ``request.url`` with the spider's Selenium browser.

        :param request: the Scrapy request; ``request.meta['gomeType']``
            selects the handling mode (default 0).
        :param spider: expected to expose ``browser`` (a Selenium
            WebDriver) and ``browser_wait`` (a WebDriverWait).
        :returns: an :class:`HtmlResponse` built from the rendered page
            for the 'gome' spider, or ``None`` for any other spider so
            Scrapy's default download handler takes over.
        """
        if spider.name != 'gome':
            return None  # not ours: fall through to the normal downloader

        gome_type = request.meta.get('gomeType', 0)
        log.msg('gome type: %s' % gome_type)
        spider.browser.get(request.url)

        if gome_type == 2:
            # Paginated listing: keep clicking "next" until the pager
            # reports we are on the last page.
            while True:
                spider.browser_wait.until(EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="min-pager-number"]')))
                # Pager text presumably looks like "1/5" (current/total)
                # — TODO confirm against the live page.
                # BUG FIX: WebElement.text is a property, not a method;
                # the original ``.text()`` raised TypeError.
                pager_text = spider.browser.find_element_by_xpath(
                    '//*[@id="min-pager-number"]').text
                pager_list = re.findall(r"\d+", pager_text)
                if len(pager_list) == 2 and int(pager_list[0]) < int(pager_list[1]):
                    spider.browser.find_element_by_xpath('//*[@id="mp-next"]').click()
                else:
                    break

        # NOTE(review): only the final page_source is returned, so the
        # intermediate pagination pages are lost to the spider unless
        # parsed elsewhere — confirm this is intended.
        return HtmlResponse(url=spider.browser.current_url,
                            body=spider.browser.page_source,
                            encoding="utf-8",
                            request=request)
现在的问题是 middleware 中只能返回一次 page_source;如果 page_source 的解析和存储都放在 middleware 中,那么 Scrapy 本身的抓取与解析流程就形同虚设了。
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程