import re
import json
# Bind the URL helper module to the name ``parse`` on both Python 2 and 3.
try:
    # Python 2: the URL helpers live in the top-level ``urlparse`` module.
    import urlparse as parse
except ImportError:
    # Python 3: the same helpers are exposed as ``urllib.parse``.
    # Only ImportError is caught so unrelated failures are not hidden.
    from urllib import parse
import scrapy
from scrapy.loader import ItemLoader
from ArticleSpider.items import ZhihuQuestionItem, ZhihuAnswerItem
class ZhihuSpider(scrapy.Spider):
    """Spider that discovers Zhihu question pages and requests their answers.

    ``parse`` scans a page for links and schedules every link that looks like
    a question page.  ``parse_question`` turns a question page into a
    ``ZhihuQuestionItem`` and schedules the first page of the answers API,
    whose response is handed to ``parse_answer`` (defined further down in
    this class).
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Request URL for the first page of a question's answers.
    # Placeholders: {0} = question id, {1} = limit (page size), {2} = offset.
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    # Headers sent with every request issued by this spider.
    headers = {
        "HOST": "www.zhihu.com",
        # The original value pointed at "zhizhu.com", a typo of the site's domain.
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        # Disabled mobile-app header, kept for reference:
        # 'x-app-za':" OS = Android&Release=5.1.1&Model=SM-G925F&VersionName=5.21.2&VersionCode=764&Product=com.zhihu.android&Width=1080&Height=1920&Installer=%E5%BA%94%E7% 94%A8%E5%AE%9D-%E5%B9%BF%E5%91%8A&DeviceType=AndroidPhone&Brand=samsung&OperatorType=46000"
    }

    # Group 1 is the canonical question URL, group 2 the numeric question id.
    # The digit class must be ``\d``; a bare ``d`` only matches the letter "d".
    _QUESTION_URL_RE = re.compile(r"(.*zhihu\.com/question/(\d+))(/|$).*")

    # Page size and starting offset used for the first answers request.
    _ANSWER_PAGE_SIZE = 20
    _ANSWER_START_OFFSET = 0

    # (item field, CSS selector) pairs for the current question page layout.
    _NEW_LAYOUT_CSS = (
        ("title", "h1.QuestionHeader-title::text"),
        ("content", ".QuestionHeader-detail"),
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-Comment button::text"),
        ("watch_user_num", ".NumberBoard-itemValue::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    )

    # (item field, CSS selector) pairs for the legacy question page layout.
    _OLD_LAYOUT_CSS = (
        ("title", ".zh-question-title h2 a::text"),
        ("content", "#zh-question-detail"),
        ("answer_num", "#zh-question-answer-num::text"),
        ("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text"),
        ("watch_user_num", "#zh-question-side-header-wrap::text"),
        ("topics", ".zm-tag-editor-labels a::text"),
    )

    def parse(self, response):
        """Extract every link from the page and schedule the question pages.

        Each ``href`` is resolved against ``response.url``; only absolute
        URLs starting with ``https`` are considered.  A URL of the form
        ``.../question/<id>`` is requested (trimmed to that canonical form)
        with ``parse_question`` as its callback.

        Yields:
            scrapy.Request: one request per question link found on the page.
        """
        for href in response.css("a::attr(href)").extract():
            # ``parse`` resolves to the module-level URL helper module here;
            # method names defined on the class are not visible as bare names
            # inside a method body.
            url = parse.urljoin(response.url, href)
            if not url.startswith("https"):
                continue

            self.logger.debug("Discovered link: %s", url)
            match_obj = self._QUESTION_URL_RE.match(url)
            if match_obj:
                # Question page: download it and hand it to the extractor.
                yield scrapy.Request(
                    match_obj.group(1),
                    headers=self.headers,
                    callback=self.parse_question,
                )
            # NOTE: links that are not question pages are currently not
            # followed; recursive crawling through ``self.parse`` was
            # disabled in the original code.

    def parse_question(self, response):
        """Build a question item from a question page and request its answers.

        The page layout is detected by looking for ``QuestionHeader-title``
        in the response body: when present the current layout selectors are
        used, otherwise the legacy ones.

        Yields:
            scrapy.Request: request for the first page of answers, handled
                by ``parse_answer``.
            ZhihuQuestionItem: the loaded question item.
        """
        match_obj = self._QUESTION_URL_RE.match(response.url)
        if not match_obj:
            # Without a question id neither the item nor the answers URL can
            # be built; previously this path failed with a NameError.
            self.logger.warning("Not a question URL, skipping: %s", response.url)
            return
        question_id = int(match_obj.group(2))

        if "QuestionHeader-title" in response.text:
            field_selectors = self._NEW_LAYOUT_CSS
        else:
            field_selectors = self._OLD_LAYOUT_CSS

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        for field_name, css in field_selectors:
            item_loader.add_css(field_name, css)
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        question_item = item_loader.load_item()

        answer_url = self.start_answer_url.format(
            question_id, self._ANSWER_PAGE_SIZE, self._ANSWER_START_OFFSET
        )
        self.logger.debug("Requesting answers: %s", answer_url)
        yield scrapy.Request(
            answer_url,
            headers=self.headers,
            callback=self.parse_answer,
        )
        yield question_item
def parse_answer(self, reponse):
调试(debug)运行时:console 中没有任何输出
运行后console中:2019-01-05 22:22:18 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <400 https://www.zhihu.com/api/v4/questions/65135073/answers?sort_by=default&include=data[*].is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata[*].author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge[%3F(type%3Dbest_answerer)].topics&limit=20&offset=0>: HTTP status code is not handled or not allowed
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程