def parse_answer(self, response):
    """Parse one page of Zhihu's answer-list JSON API.

    Yields one ``ZhihuAnswerItem`` per answer on the page, then follows the
    ``paging.next`` URL (re-entering this callback) until ``paging.is_end``
    is true.

    :param response: Scrapy response whose body is the answer-list JSON.
    :yields: loaded ``ZhihuAnswerItem`` instances, then (optionally) a
        follow-up ``scrapy.Request`` for the next page.
    """
    answer_dict = json.loads(response.text)
    is_end = answer_dict['paging']['is_end']
    next_answer_url = answer_dict['paging']['next']
    for answer in answer_dict['data']:
        # BUG FIX: the loader must be created fresh for each answer.
        # Scrapy ItemLoaders accumulate values across add_value() calls, so a
        # single loader shared by the whole loop would merge every previous
        # answer's fields into each later item.
        answer_item_loader = ZhihuItemLoader(item=ZhihuAnswerItem(), response=response)

        create_time = answer['created_time']
        update_time = answer['updated_time']
        answer_id = answer['id']
        praise_nums = answer['voteup_count']
        answer_url = answer['url']
        # Anonymous authors may lack an 'id'; fall back to empty string.
        author_id = answer['author']['id'] if 'id' in answer['author'] else ''
        question_id = answer['question']['id']
        question_create_time = answer['question']['created']
        question_update_time = answer['question']['updated_time']
        # BUG FIX: the original tested `'content' in answer['content']`, i.e. a
        # substring test on the content string itself (and a KeyError when the
        # key is absent). The intent is to check whether the 'content' key
        # exists on the answer dict, falling back to the short 'excerpt'.
        content = answer['content'] if 'content' in answer else answer['excerpt']
        comment_nums = answer['comment_count']
        crawl_time = datetime.now()

        answer_item_loader.add_value('answer_id', answer_id)
        answer_item_loader.add_value('question_id', question_id)
        answer_item_loader.add_value('answer_url', answer_url)
        answer_item_loader.add_value('author_id', author_id)
        answer_item_loader.add_value('content', content)
        answer_item_loader.add_value('praise_nums', praise_nums)
        answer_item_loader.add_value('comment_nums', comment_nums)
        answer_item_loader.add_value('create_time', create_time)
        answer_item_loader.add_value('update_time', update_time)
        answer_item_loader.add_value('crawl_time', crawl_time)

        answer_item = answer_item_loader.load_item()
        yield answer_item

    # BUG FIX: the original read `if is_end = = False :` — a syntax error
    # (space inside `==`). Idiomatic form is a plain truthiness test.
    if not is_end:
        yield scrapy.Request(next_answer_url, callback=self.parse_answer)
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程