
The thread pool exits as soon as parse_list finishes; parse_topic and parse_author are never executed

# -*- coding: utf-8 -*-
# @Time    : 2019/6/8 17:37
# @Author  : imooc
# @File    : threadpool_spider.py
# @Software: PyCharm

import re
import ast
from urllib import parse
from datetime import datetime

import requests
from scrapy import Selector

from csdn_spider.models import *

domain = "https://bbs.csdn.net"
url_list = []


def get_nodes_json():
	# Fetch the forum node list from the left-menu JS file
	left_menu_text = requests.get(domain + "/dynamic_js/left_menu.js?csdn").text
	nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
	if nodes_str_match:
		nodes_str = nodes_str_match.group(1).replace("null", "None")
		nodes_list = ast.literal_eval(nodes_str)
		return nodes_list
	return []
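
# Hypothetical left_menu.js excerpt (illustrative only, not real CSDN data):
#   forumNodes: [{"url": "/forums/Java", "children": null}]
# The regex captures the list literal, "null" is swapped for "None",
# and ast.literal_eval() safely rebuilds the nested lists/dicts
# without resorting to eval().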


def process_nodes_list(nodes_list):
	# Recursively collect every url into the global url_list
	for item in nodes_list:
		if "url" in item and item["url"]:
			url_list.append(item["url"])
			if "children" in item:
				process_nodes_list(item["children"])


def get_level1_urls(nodes_list):
	# Collect only the first-level urls
	level1_urls = []
	for item in nodes_list:
		if "url" in item and item["url"]:
			level1_urls.append(item["url"])
	return level1_urls


def get_last_urls():
	# Drop the first-level urls and keep the final urls we actually crawl
	nodes_list = get_nodes_json()
	process_nodes_list(nodes_list)
	level1_urls = get_level1_urls(nodes_list)
	last_url = []
	for url in url_list:
		if url not in level1_urls:
			last_url.append(url)
	all_urls = []
	for url in last_url:
		all_urls.append(parse.urljoin(domain, url))
		all_urls.append(parse.urljoin(domain, url + "/recommend"))
		all_urls.append(parse.urljoin(domain, url + "/closed"))
	return all_urls
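
# For a hypothetical node url "/forums/Java", the loop above yields
# three crawlable list pages:
#   https://bbs.csdn.net/forums/Java
#   https://bbs.csdn.net/forums/Java/recommend
#   https://bbs.csdn.net/forums/Java/closed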


def parse_topic(topic_url):
	# Fetch the topic detail page and its replies
	print("Parsing topic detail page: {}".format(topic_url))
	topic_id = topic_url.split('/')[-1]
	res_text = requests.get(topic_url).text
	sel = Selector(text=res_text)
	all_divs = sel.xpath("//div[starts-with(@id,'post-')]")

	topic_item = all_divs[0]
	content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]
	praised_nums = topic_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0]
	jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]
	jtl = 0
	# Match the close-rate percentage, e.g. "96.55%"; the bare "0"
	# alternative leaves group(1) empty, hence the guard below
	jtl_match = re.search(r"(100|[1-9]?\d(\.\d\d?\d?)?)%$|0$", jtl_str, re.M)
	if jtl_match and jtl_match.group(1):
		jtl = float(jtl_match.group(1))

	existed_topics = Topic.select().where(Topic.id == topic_id)
	if existed_topics:
		topic = existed_topics[0]
		topic.content = content
		topic.praised_nums = int(praised_nums)
		topic.jtl = jtl

		topic.save()

	for answer_item in all_divs[1:]:
		author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
		author_id = author_info.split("/")[-1]
		create_time_str = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
		create_time = datetime.strptime(create_time_str, "%Y-%m-%d %H:%M:%S")
		answer_content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]
		answer_praised_nums = answer_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0]

		answer_id = answer_item.xpath(".//@data-post-id").extract()[0]

		answer = Answer()

		answer.id = int(answer_id)
		answer.topic_id = topic_id
		answer.author = author_id
		answer.create_time = create_time
		answer.praised_nums = int(answer_praised_nums)
		answer.content = answer_content

		existed_answer = Answer.select().where(Answer.id == answer_id)
		if existed_answer:
			answer.save()
		else:
			answer.save(force_insert=True)

	next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
	if next_page:
		next_url = parse.urljoin(domain, next_page[0])
		executor.submit(parse_topic, next_url)
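
# Note: every extract()[0] above raises IndexError whenever the page
# markup differs, and executor.submit() stores such an exception on the
# returned Future instead of printing it; this alone can make
# parse_topic and parse_author look like they never ran.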


def parse_author(url):
	author_id = url.split("/")[-1]
	# Fetch the author's profile details
	print("Parsing author profile page: {}".format(url))

	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
	}
	res_text = requests.get(url, headers=headers).text
	sel = Selector(text=res_text)
	author = Author()
	author.id = author_id
	all_li_strs = sel.xpath("//ul[@class='mod_my_t clearfix']/li/span/text()").extract()
	click_nums = all_li_strs[0]
	original_nums = all_li_strs[1]
	forward_nums = int(all_li_strs[2])
	rate = int(all_li_strs[3])
	answer_nums = int(all_li_strs[4])
	praised_nums = int(all_li_strs[5])

	author.click_nums = click_nums
	author.original_nums = original_nums
	author.forward_nums = forward_nums
	author.rate = rate
	author.answer_nums = answer_nums
	author.praised_nums = praised_nums

	desc = sel.xpath("//dd[@class='user_desc']/text()").extract()
	if desc:
		author.desc = desc[0].strip()
	person_b = sel.xpath("//dd[@class='person_b']/ul/li")
	for item in person_b:
		item_text = "".join(item.extract())
		if "csdnc-m-add" in item_text:
			location = item.xpath(".//span/text()").extract()[0].strip()
			author.location = location
		else:
			industry = item.xpath(".//span/text()").extract()[0].strip()
			author.industry = industry
	name = sel.xpath("//h4[@class='username']/text()").extract()[0]
	author.name = name.strip()
	existed_author = Author.select().where(Author.id == author_id)
	if existed_author:
		author.save()
	else:
		author.save(force_insert=True)


def parse_list(url):
	# Extract the topic rows of a forum list page
	print("Parsing list page: {}".format(url))
	res_text = requests.get(url).text
	sel = Selector(text=res_text)
	all_trs = sel.xpath("//table[@class='forums_tab_table']//tbody//tr")
	for tr in all_trs:
		status = tr.xpath(".//td[1]/span/text()").extract()[0]
		score = tr.xpath(".//td[2]/em/text()").extract()[0]
		topic_url = parse.urljoin(domain, tr.xpath(".//td[3]/a[contains(@class,'forums_title')]/@href").extract()[0])
		topic_id = int(topic_url.split("/")[-1])
		topic_title = tr.xpath(".//td[3]/a[contains(@class,'forums_title')]/text()").extract()[0]
		author_url = parse.urljoin(domain, tr.xpath(".//td[4]/a/@href").extract()[0])
		author_id = author_url.split("/")[-1]
		create_time_str = tr.xpath(".//td[4]/em/text()").extract()[0]
		create_time = datetime.strptime(create_time_str, "%Y-%m-%d %H:%M")
		answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]
		answer_nums = int(answer_info.split("/")[0])
		click_nums = int(answer_info.split("/")[1])
		last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]
		last_time = datetime.strptime(last_time_str, "%Y-%m-%d %H:%M")

		topic = Topic()

		topic.id = topic_id
		topic.title = topic_title
		topic.author = author_id
		topic.click_nums = click_nums
		topic.answer_nums = answer_nums
		topic.create_time = create_time
		topic.last_answer_time = last_time
		topic.score = score
		topic.status = status

		existed_topic = Topic.select().where(Topic.id == topic.id)
		if existed_topic:
			topic.save()
		else:
			topic.save(force_insert=True)

		executor.submit(parse_topic, topic_url)
		executor.submit(parse_author, author_url)

	next_page = sel.xpath("//a[@class=pageliststy next_page']/@href").extract()
	if next_page:
		next_url = parse.urljoin(domain, next_page[0])
		executor.submit(parse_list, next_url)


if __name__ == "__main__":
	from concurrent.futures import ThreadPoolExecutor

	executor = ThreadPoolExecutor(max_workers=10)

	last_urls = get_last_urls()
	for url in last_urls:
		executor.submit(parse_list, url)
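
A likely culprit worth ruling out first: executor.submit() returns a Future, and an exception raised inside a submitted task is stored on that Future instead of being printed, so a crashing parse_topic or parse_author dies silently. A minimal diagnostic sketch of the __main__ block (same executor setup as above; only the Future bookkeeping is new):

if __name__ == "__main__":
	from concurrent.futures import ThreadPoolExecutor, as_completed

	executor = ThreadPoolExecutor(max_workers=10)

	futures = [executor.submit(parse_list, url) for url in get_last_urls()]
	for future in as_completed(futures):
		try:
			# result() re-raises any exception the task swallowed
			future.result()
		except Exception as e:
			print("parse_list failed: {!r}".format(e))
	executor.shutdown(wait=True)

The recursive submits inside parse_list and parse_topic still go untracked; those would need something like the done-callback sketched in the answers below.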

Program execution log

The Topic table ended up with 178 rows in total.


2 Answers

仙女座舜 2020-03-09 17:30:53
Bro, did you ever solve this? What was the cause? I also submit a function and its arguments to a thread pool to recursively process the next page, and it never executes either.
  • bobby #1
    Is there any error? Have you tried debugging it?
    2020-03-10 12:32:00
  • 仙女座舜 replied to bobby #2
    I'll open a new thread and paste my code there.
    2020-03-10 13:39:17
bobby 2019-06-10 17:41:14

https://img1.sycdn.imooc.com//szimg/5cfe259b00012f1c10130173.jpg Have you tried setting a breakpoint here to see whether execution reaches this line? If it does, also set breakpoints inside parse_detail and the other functions to see whether they are entered.
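
Complementing the breakpoint idea: for the fire-and-forget submits inside parse_list and parse_topic, an add_done_callback hook surfaces stored exceptions without a debugger. A small sketch (the log_exception helper is illustrative, not part of the course code):

def log_exception(future):
	# Runs once the task finishes; exception() returns whatever the
	# task raised (or None) instead of discarding it silently
	exc = future.exception()
	if exc:
		print("task failed: {!r}".format(exc))

# e.g. inside parse_list:
#   executor.submit(parse_topic, topic_url).add_done_callback(log_exception)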

  • 李景焱 #1
    I set breakpoints; the code doesn't seem to run.
    2019-08-01 15:17:26
  • bobby replied to 李景焱 #2
    If it won't run, is there a specific error message?
    2019-08-02 16:21:16
  • 1652833140 replied to bobby #3
    No error and no exit; it just returns, and parse_list goes on to the next url.
    2020-01-04 16:32:04