def parse_answer(self, response):
    """Parse one page of answers and follow pagination.

    The response body is decoded as JSON and is expected to carry a
    ``data`` list (one dict per answer) and a ``paging`` dict with the
    keys ``is_end`` and ``next``.

    Args:
        response: The Scrapy response whose ``text`` holds the JSON payload.

    Yields:
        One loaded ``ZhihuAnswerItem`` per entry in ``data``, followed by a
        ``scrapy.Request`` for the next page (handled by this same callback)
        when ``paging['is_end']`` is falsy.
    """
    answer_dict = json.loads(response.text)
    paging = answer_dict['paging']

    for answer in answer_dict['data']:
        # A fresh loader (and item) per answer: reusing a single loader would
        # keep accumulating values from earlier answers into the same item.
        answer_item_loader = ZhihuItemLoader(item=ZhihuAnswerItem(), response=response)

        author = answer['author']
        author_id = author.get('id', '')

        # Check for the key on the answer itself; fall back to the excerpt
        # only when the full content is not part of the payload.
        content = answer['content'] if 'content' in answer else answer['excerpt']

        answer_item_loader.add_value('answer_id', answer['id'])
        answer_item_loader.add_value('question_id', answer['question']['id'])
        answer_item_loader.add_value('answer_url', answer['url'])
        answer_item_loader.add_value('author_id', author_id)
        answer_item_loader.add_value('content', content)
        answer_item_loader.add_value('praise_nums', answer['voteup_count'])
        answer_item_loader.add_value('comment_nums', answer['comment_count'])
        answer_item_loader.add_value('create_time', answer['created_time'])
        answer_item_loader.add_value('update_time', answer['updated_time'])
        answer_item_loader.add_value('crawl_time', datetime.now())

        yield answer_item_loader.load_item()

    # Only look up the next-page URL when there actually is another page.
    if not paging['is_end']:
        yield scrapy.Request(paging['next'], callback=self.parse_answer)
了解课程