老师，yield Request 时设置了 callback=self.parse_detail，但无法进入 parse_detail 函数。
代码
import scrapy
from scrapy import Request
from urllib import parse
import re
import requests
import json
class FirstscrapySpider(scrapy.Spider):
    """Spider for the cnblogs news list, article pages and per-article counters.

    Flow: ``parse`` (list page) -> ``parse_detail`` (article page) ->
    ``parse_nums`` (JSON endpoint with comment/digg/view counters).
    """

    name = 'firstscrapy'
    # allowed_domains must contain bare domain names, not URLs. Scrapy ignores
    # URL entries here, which leaves no allowed domain and makes the offsite
    # filter drop every follow-up request, so parse_detail was never called.
    # 'cnblogs.com' also covers subdomains such as news.cnblogs.com.
    allowed_domains = ['cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']

    def parse(self, response):
        """Schedule article detail pages and the next list page.

        1. Collect the article URLs on the list page and hand them to Scrapy,
           which downloads them and calls ``parse_detail``.
        2. Collect the next-page URL and hand it to Scrapy, which downloads it
           and calls ``parse`` again.
        """
        print('start----------------------------------------------')
        # NOTE(review): the [:1] slice keeps only the first news block; this
        # looks like a debugging limit -- confirm before a full crawl.
        post_nodes = response.xpath('//div[@id="news_list"]/div[@class="news_block"]')[:1]
        for post_node in post_nodes:
            # The leading "." makes the XPath relative to this news block;
            # a bare "//" would search the whole document instead.
            image_url = post_node.xpath('.//div[@class="entry_summary"]/a/img/@src').extract_first('')
            news_url = post_node.xpath('.//h2[@class="news_entry"]/a/@href').extract_first('')
            if not news_url:
                # An empty href would resolve to the list page itself.
                continue
            yield Request(
                url=parse.urljoin(response.url, news_url),
                meta={'front_image_url': image_url},
                callback=self.parse_detail,
            )

        # Follow the link to the next list page, if there is one.
        next_url = response.xpath('//a[contains(text(), "Next >")]/@href').extract_first('')
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract article fields and request the article's counter endpoint.

        The first run of digits in the URL is used as the article id; when the
        URL contains no digits nothing is yielded.
        """
        print('进入arse_detail')
        match_re = re.match(r'.*?(\d+)', response.url)
        if match_re:
            # NOTE(review): these fields are extracted but not yet emitted or
            # passed on to parse_nums; wire them into an item when ready.
            title = response.xpath('//div[@id="news_title"]/a/text()').extract_first('')
            create_date = response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first('')
            content = response.xpath('//div[@id="news_body"]').extract()
            tag_list = response.xpath('//div[@class="news_tags"]/a/text()').extract()
            tags = ','.join(tag_list)
            post_id = match_re.group(1)
            # The callback must be the bound spider method; the original
            # `parse.nums` looked up a non-existent attribute on urllib.parse.
            yield Request(
                url=parse.urljoin(response.url, '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(post_id)),
                callback=self.parse_nums,
            )

    def parse_nums(self, response):
        """Decode the JSON counter response for one article.

        Raises:
            KeyError: if one of the expected keys is missing from the payload.
        """
        js = json.loads(response.text)
        # NOTE(review): the counters are read but not yet emitted anywhere.
        comment_nums = js['CommentCount']
        praise_nums = js['DiggCount']
        review_nums = js['TotalView']
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程