Hi teacher, my spider never enters the parse_answer function, and the request comes back with a 400.

import re
import json
try:
    import urlparse as parse  # Python 2
except ImportError:
    from urllib import parse  # Python 3

import scrapy
from scrapy.loader import ItemLoader
from ArticleSpider.items import ZhihuQuestionItem, ZhihuAnswerItem

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Request url for the first page of answers to a question (Zhihu v4 API)
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        # 'x-app-za': "OS=Android&Release=5.1.1&Model=SM-G925F&VersionName=5.21.2&VersionCode=764&Product=com.zhihu.android&Width=1080&Height=1920&Installer=%E5%BA%94%E7%94%A8%E5%AE%9D-%E5%B9%BF%E5%91%8A&DeviceType=AndroidPhone&Brand=samsung&OperatorType=46000"
    }

    def parse(self, response):
        """
        Extract all urls from the html page and follow them for further crawling.
        If an extracted url matches the /question/xxx pattern, download it and
        hand it straight to the question parser.
        """
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            print(url)
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # A question page: download it and let the extraction callback handle it
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                pass
                # Not a question page: just keep following the link
                # yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        # Process the question page and extract a concrete question item from it
        if "QuestionHeader-title" in response.text:
            # New page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()

        else:
            # Old Zhihu page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        url1 = self.start_answer_url.format(question_id, 20, 0)
        print(url1)  # debug: the answers-API url that is coming back with 400
        yield scrapy.Request(url1, headers=self.headers, callback=self.parse_answer)
        yield question_item


    def parse_answer(self, response):
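        # The original post is cut off here. Below is a minimal sketch of what
        # this callback typically does, assuming the paging/data layout of the
        # Zhihu v4 answers API; the ZhihuAnswerItem field names are assumptions,
        # not taken from the original post.
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # Extract the concrete fields of every answer on this page
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            yield answer_item

        # Keep paging through the answers until the API reports the end
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)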

1 Answer

Does the 400 show up only for some of these requests, or for all of them?

  • (OP) 慕斯3428064 #1
    Every request to start_answer_url returns 400.
    2019-01-06 17:01:45
  • (OP) 慕斯3428064 #2
    Solved it: my start_answer_url was written incorrectly. After I looked the URL up again, the spider entered the parse_answer function. Thanks, teacher! (A quick way to sanity-check this kind of url is sketched after this thread.)
    2019-01-06 18:19:02
  • bobby replying to (OP) 慕斯3428064 #3
    Great.
    2019-01-08 12:23:17
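
For anyone hitting the same 400: one quick way to sanity-check the answers-API url outside of Scrapy is to request it directly and look at the status code. A minimal sketch, assuming the requests library and the same User-Agent as in the spider above; the question id 26006703 and the shortened include parameter are placeholders, so paste your own start_answer_url template here:

import requests

# Placeholder template; substitute the exact start_answer_url from your spider
start_answer_url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                    "?sort_by=default&include=data%5B%2A%5D.is_normal%2Ccontent"
                    "%2Cvoteup_count%2Ccomment_count%2Ccreated_time%2Cupdated_time"
                    "&limit={1}&offset={2}")

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
}

# 26006703 is an arbitrary question id used only for this check
resp = requests.get(start_answer_url.format(26006703, 20, 0), headers=headers)
print(resp.status_code)  # a 400 here usually means a malformed include/limit/offset query
if resp.ok:
    print(resp.json()["paging"])  # confirm the JSON shape that parse_answer expects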