
IndexError: list index out of range

As the title says. Oddly, on the first day the spider crawled and stored everything without a problem, but when I ran it again the next day it threw this error. Could you take a look and tell me how to fix it?
From what I found online, it may mean the index is out of range or the list is empty. How should I change the code?

import re
from urllib import parse

import scrapy
from scrapy import Request

# import path is a guess based on a standard Scrapy layout; adjust to your project
from ..items import CnblogsArticleItem


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']


    def parse(self, response):
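        # note: the [1:2] slice below keeps only the second news block (looks like a leftover from debugging)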
        post_nodes = response.css('#news_list .news_block')[1:2]
        for post_node in post_nodes:
            image_url = post_node.css('.entry_summary a img::attr(src)').extract_first("")
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url}, callback=self.parse_detail)
        #if next_url == "Next >":
            #next_url = response.css("div.pager a:last-child::attr(href)").extract_first("")
        #next_url = response.xpath("//a[contains(text(),'Next >')]/@href").extract_first("")
        #yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)
            article_item = CnblogsArticleItem()
            title = response.css("#news_title a::text").extract_first("")
            #title = response.xpath("//*[@id='news_title']//a/text()").extract_first("")
            create_date = response.css("#news_info .time::text").extract_first("")
            match_re = re.match(r".*?(\d+.*)", create_date)
            if match_re:
                create_date = match_re.group(1)
            #create_date = response.xpath("//*[@id='news_info']//*[@class='time']/text()")
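            # NOTE: extract()[0] raises IndexError as soon as the selector matches nothing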
            content = response.css("#news_content").extract()[0]
            #content = response.xpath("//*[@id='news_content']").extract()[0]
            tag_list = response.css(".news_tags a::text").extract()
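            # NOTE: the next line overwrites the list above with a single string and raises IndexError when the article has no tags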
            tag_list = response.xpath("//*[@class='news_tags']//a/text()").extract()[0]
            tags = ",".join(tag_list)


            #html = requests.get(parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)))
            #j_data = json.loads(html.text)
            article_item["title"] = title
            article_item["create_date"] = create_date
            article_item["content"] = content
            article_item["tags"] = tags
            article_item["title"] = title
            article_item["url"] = response.url
            if response.meta.get("front_image_url",""):
                article_item["front_image_url"] =[response.meta.get("front_image_url","")]
            else:
                article_item["front_image_url"] = []

            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)), meta={"article_item":article_item} ,callback=self.parse_nums)

            pass



1 Answer

Asker ak918xp 2020-02-21 20:16:52

Strange, it works one moment and fails the next depending on which news item it hits. I think an index must be going out of range.

  • bobby #1
    In a case like this, you need to find the URL that triggers the error, then crawl and parse that single URL on its own to see whether the problem reproduces.
    2020-02-22 15:28:24
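
To isolate the failing URL as suggested above, Scrapy's built-in parse command can run a single callback against one URL (a sketch; the news URL is a placeholder):

scrapy parse --spider=cnblogs -c parse_detail 'https://news.cnblogs.com/n/<news-id>/'

The likely crash points in parse_detail are the .extract()[0] calls: they raise IndexError the moment a selector matches nothing, which fits the "works on some news items, fails on others" symptom, e.g. an article that has no tags. A minimal defensive sketch of those lines, keeping the same selectors but swapping in extract_first() and a plain extract():

# extract_first("") returns "" instead of raising IndexError when nothing matches
content = response.css("#news_content").extract_first("")
# keep the full tag list; it is simply [] for an untagged article
tag_list = response.css(".news_tags a::text").extract()
tags = ",".join(tag_list)  # "" instead of a crash when there are no tags

Note that the second tag_list assignment in the original overwrote the list with a single string, so join() was concatenating that string's characters one by one.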