The source code is as follows:
# -*- coding: utf-8 -*-
import re
import logging
import scrapy
import ipdb
from urllib import parse
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
from utils.common import get_md5
from items import MutoItem, MutoItemLoader

logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                    level=logging.INFO)

class MutoSpider(RedisSpider):
    """Crawl the site through scrapy-redis"""
    name = 'muto'
    allowed_domains = ['www.mutongzixun.com']
    # start_urls = 'https://www.mutongzixun.com'
    redis_key = 'muto:start_urls'
    headers = {
        "HOST": "www.mutongzixun.com",
        "Referer": "https://www.mutongzixun.com/",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def __init__(self, *args, **kwargs):
        super(MutoSpider, self).__init__(*args, **kwargs)
    def parse(self, response):
        # Collect every link on the list page
        all_urls = response.css(".row.mod-main-list a::attr(href)").extract()
        pattern = re.compile(r"^/index.*")
        # Drop duplicate links
        urls = list(set(all_urls))
        for url in urls:
            match_obj = pattern.match(url)
            if match_obj:
                url = match_obj.group(0)
                request_url = parse.urljoin(response.url, url)
                logging.info("Requesting URL: %s" % request_url)
                # ipdb.set_trace()
                yield Request(request_url,
                              headers=self.headers,
                              callback=self.parse_detail)
        # Follow the next-page link (extract_first avoids an IndexError when there is no next page)
        # next_url = response.xpath("//div[@class='row mod-main-list']//a[text()='下一页']/@href").extract_first()
        # if next_url:
        #     logging.info("Next page URL: %s" % next_url)
        #     # ipdb.set_trace()
        #     yield Request(next_url,
        #                   headers=self.headers,
        #                   callback=self.parse)
    def parse_detail(self, response):
        """Parse a news detail page"""
        match_re = re.match(r".*?(\d+)$", response.url)
        if match_re:
            detail_id = match_re.group(1)
            item_loader = MutoItemLoader(item=MutoItem(), response=response)
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_value("url", response.url)
            item_loader.add_css("title", "h3.text-center::text")
            item_loader.add_xpath("tag", "//ol[@class='breadcrumb']/li[2]/a/text()")
            date_str = response.xpath("(//p[@class='text-center']/text())[1]").extract_first()
            date_match = re.search(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})", date_str)
            date = date_match.group(0)
            item_loader.add_value("date", date)
            item_loader.add_css("content", "#show-content")
            item_loader.add_css("img_url", "#show-content img::attr(src)")
            item_loader.add_value('click_num', 0)
            # API endpoint that returns the click count for this article
            click_num_url = parse.urljoin(response.url, '/api.php?op=count&id={}&modelid=1'.format(detail_id))
            # ipdb.set_trace()
            news_item = item_loader.load_item()
            yield news_item
            # yield Request(click_num_url,
            #               headers=self.headers,
            #               meta={"item_loader": item_loader},
            #               callback=self.parse_click_nums)
    def parse_click_nums(self, response):
        """Parse the click count out of the API response"""
        item_loader = response.meta.get("item_loader")
        res = response.text
        match = re.search(r".*\('#hits'\).*'(\d+)'.*", res)
        click_num = match.group(1)
        item_loader.add_value("click_num", click_num)
        news_item = item_loader.load_item()
        # ipdb.set_trace()
        print("^" * 100)
        print(news_item)
        print("^" * 100)
        yield news_item
The parse_detail callback alone cannot fill in everything the item needs: the click count has to be fetched from a separate API, so a second callback, parse_click_nums, is required. Following an approach from a cnblogs post, I tried passing the item between the two callbacks, but the program raised an error, and the cause turned out to be the meta argument: if I issue the request without meta, everything runs fine. In scrapy-redis, what is a better way to pass an Item between multiple parse callbacks? Thanks!
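
A likely explanation, with a minimal sketch (assuming scrapy-redis's default pickle-based request serialization and the MutoItem fields shown above): the scrapy-redis scheduler stores pending requests in Redis, so a Request, including its meta dict, must be serializable. An ItemLoader keeps a reference to the response/selector, which cannot be pickled, so putting the loader itself into meta fails. Passing the already-loaded item (a plain dict-like object) through meta and filling in the missing field in the second callback avoids the problem:

    def parse_detail(self, response):
        match_re = re.match(r".*?(\d+)$", response.url)
        if match_re:
            detail_id = match_re.group(1)
            item_loader = MutoItemLoader(item=MutoItem(), response=response)
            # ... the same add_value / add_css / add_xpath calls as above ...
            news_item = item_loader.load_item()  # plain item: safe to serialize
            click_num_url = parse.urljoin(response.url, '/api.php?op=count&id={}&modelid=1'.format(detail_id))
            yield Request(click_num_url,
                          headers=self.headers,
                          meta={"item": news_item},
                          callback=self.parse_click_nums)

    def parse_click_nums(self, response):
        news_item = response.meta["item"]
        match = re.search(r"\('#hits'\).*?'(\d+)'", response.text)
        if match:
            news_item["click_num"] = int(match.group(1))
        yield news_item

On Scrapy 1.7+, the cb_kwargs argument of Request is the recommended channel for callback data, but the same constraint applies under scrapy-redis: pass the item, not the loader.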