The thread pool's wait() can wait for all thread-pool tasks to finish, but the program does not exit after they complete
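For background, wait(futures, return_when=ALL_COMPLETED) only blocks until the exact futures passed to it are done; it knows nothing about tasks submitted later (for example, from inside a worker). A minimal sketch of that semantics, with a made-up slow_task function:

from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import time

def slow_task(seconds):
    time.sleep(seconds)
    return seconds

with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(slow_task, 1), pool.submit(slow_task, 2)]
    # Blocks until exactly these two futures finish; tasks submitted
    # later (e.g. from inside a worker) would not be covered.
    done, not_done = wait(futures, return_when=ALL_COMPLETED)
    print(len(done), len(not_done))  # prints: 2 0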

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# __author__ : __ajiang__
# 2020/5/1

import os
import re
import requests
from urllib import parse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

from scrapy import Selector
from fake_useragent import UserAgent

from hanhan_spider.models import *

# Base URL of the Biquge (笔趣阁) novel site
domain = 'https://www.yqzww.net/'

executor = ThreadPoolExecutor(max_workers=10)


def get_chapter_id(url):
    """
    :param url: 章节url
    :return: 章节ID
    """
    res = re.match(r'.*/([0-9]+)\.html', url)
    if res:
        return int(res.group(1))
    return None


def parse_novel_detail(novel_url):
    """
    :param novel_url: 小说详情页链接
    :return: None
    """
    # headers and ua are globals defined in the __main__ block below
    headers['User-Agent'] = ua.random
    novel_detail_html = requests.get(novel_url, headers=headers)
    novel_detail_html.encoding = 'gbk'
    novel_detail_text = novel_detail_html.text
    sel = Selector(text=novel_detail_text)
    novel = NovelContent()

    novel_id = novel_url.split('/')[-2]
    _novel_id = novel_id.split('_')[1]
    novel.novel_id = _novel_id

    novel_status = sel.xpath('//meta[@property="og:novel:status"]/@content').extract()
    if novel_status:
        novel.status = novel_status[0]

    novel_image = sel.xpath('//meta[@property="og:image"]/@content').extract()
    if novel_image:
        novel.image = novel_image[0]

    novel_name = sel.xpath('//meta[@property="og:novel:book_name"]/@content').extract()
    if novel_name:
        novel.name = novel_name[0]
        print('Started scraping novel: {}'.format(novel_name[0]))

    novel_author = sel.xpath('//meta[@property="og:novel:author"]/@content').extract()
    if novel_author:
        novel.author = novel_author[0]

    novel_update_time = sel.xpath('//meta[@property="og:novel:update_time"]/@content').extract()
    if novel_update_time:
        novel.last_update = datetime.strptime(novel_update_time[0], '%Y-%m-%d %H:%M:%S')

    novel_description = sel.xpath('//meta[@property="og:description"]/@content').extract()
    if novel_description:
        novel.description = novel_description[0]

    # Check whether this book is already in the database
    _novel = NovelContent.select().where(NovelContent.novel_id == _novel_id)
    if _novel:
        novel.save()
    else:
        novel.save(force_insert=True)
    print('Finished scraping novel {}'.format(novel.name))
    # Whether newly added chapters need re-fetching is not checked here;
    # everything is handed off to the chapter-parsing function
    novel_chapter_urls = sel.xpath('//div[@class="article-list"]//dd')
    for novel_chapter_url in novel_chapter_urls:
        chapter_href = novel_chapter_url.xpath('.//a/@href').extract()
        if chapter_href:
            executor.submit(parse_novel_chapter, parse.urljoin(novel_url, chapter_href[0]))


def parse_novel_chapter(chapter_url):
    """

    :param chapter_url: 小说详情页链接
    :return: None
    pass
    """
    headers['User-Agent'] = ua.random
    html = requests.get(chapter_url, headers=headers)
    # The site serves GBK-encoded pages, so decode as GBK
    html.encoding = 'gbk'
    page_text = html.text
    sel = Selector(text=page_text)
    chapter = NovelChapter()
    # Extract the chapter ID from the chapter URL
    result_chapter = re.match(r'.*/([0-9]+)\.html', chapter_url)
    chapter_id = 0
    if result_chapter:
        chapter_id = int(result_chapter.group(1))
    result_chapter = NovelChapter.select().where(NovelChapter.chapter_id == chapter_id)
    result_novel = re.match(r'.*/.*_([0-9]+)/.*\.html', chapter_url)
    novel_id = 0
    if result_novel:
        novel_id = int(result_novel.group(1))
    res_novel = NovelContent.select().where(NovelContent.novel_id == novel_id)

    # "下一章" / "上一章" are the site's "next chapter" / "previous chapter" links
    chapter_next_btn = sel.xpath('//div[@class="bottem"]//a[contains(text(), "下一章")]/@href').extract()
    chapter_pre_btn = sel.xpath('//div[@class="bottem"]//a[contains(text(), "上一章")]/@href').extract()

    if not result_chapter:
        chapter.chapter_id = chapter_id
        # Link to the existing novel row if there is one, otherwise store the raw ID
        if res_novel:
            chapter.novel = res_novel[0]
        else:
            chapter.novel = novel_id
        chapter_title = sel.xpath('//div[@class="bookname"]/h1/text()').extract()
        if chapter_title:
            chapter.title = chapter_title[0]
            print('Started scraping chapter: {}'.format(chapter_title[0]))
        chapter_content = sel.xpath('//div[@id="content"]').extract()
        if chapter_content:
            chapter.content = chapter_content[0]
        if chapter_next_btn and '.html' in chapter_next_btn[0]:
            chapter.next_chapter = get_chapter_id(chapter_next_btn[0])

        if chapter_pre_btn and '.html' in chapter_pre_btn[0]:
            chapter.pre_chapter = get_chapter_id(chapter_pre_btn[0])

        chapter.save(force_insert=True)
        print('Finished scraping chapter {}'.format(chapter_title[0]))

    # Queue the next chapter for parsing
    if chapter_next_btn and '.html' in chapter_next_btn[0]:
        executor.submit(parse_novel_chapter, parse.urljoin(domain, chapter_next_btn[0]))
    else:
        return False


if __name__ == '__main__':
    # Build the task list and submit jobs to the thread pool
    # Use a random User-Agent
    ua_location = os.path.dirname(os.path.dirname(__file__)) + '/fake-useragent.json'
    ua = UserAgent(path=ua_location)
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en,zh;q=0.9,ar;q=0.8,zh-CN;q=0.7,zh-TW;q=0.6,zh-HK;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "www.yqzww.net",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Upgrade-Insecure-Requests": "1"
    }
    task_list = [executor.submit(parse_novel_detail, 'https://www.yqzww.net/book_87570/')]
    wait(task_list, return_when=ALL_COMPLETED)
    # parse_novel_chapter('https://www.yqzww.net/book_87570/33206060.html')

Teacher, this is a spider I wrote to scrape a novel's details and its chapter list. At the end I used

wait(task_list, return_when=ALL_COMPLETED)

but when it reaches the last chapter the program does not exit. At the end of the parse_novel_chapter function I have this check:

    # Queue the next chapter for parsing
    if chapter_next_btn and '.html' in chapter_next_btn[0]:
        executor.submit(parse_novel_chapter, parse.urljoin(domain, chapter_next_btn[0]))
    else:
        return False

When it reaches return False, the program just stops there and never exits. Should I be sending some kind of signal to the executor? Could you please take a look?


1 Answer

bobby 2020-05-04 10:11:30

wait(task_list, return_when=ALL_COMPLETED)
here means that wait returns once the tasks in this task_list have completed; tasks added along the way are not covered. What do you mean by "the program just stops"? What exactly is the behavior?
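One way to make the main thread outlast tasks that the workers themselves submit is to track every future in a shared list and keep waiting until a pass completes with no new futures added. A minimal sketch of that pattern, assuming the rest of the script stays as-is; submit_tracked and wait_for_all are hypothetical helper names, not part of the original code:

import threading
from concurrent.futures import ThreadPoolExecutor, wait

executor = ThreadPoolExecutor(max_workers=10)
all_futures = []
futures_lock = threading.Lock()

def submit_tracked(fn, *args):
    # Record every future so the main thread can wait on late submissions too
    future = executor.submit(fn, *args)
    with futures_lock:
        all_futures.append(future)
    return future

def wait_for_all():
    # Keep waiting until a full pass completes with no new futures added
    while True:
        with futures_lock:
            snapshot = list(all_futures)
        wait(snapshot)
        with futures_lock:
            if len(all_futures) == len(snapshot):
                break
    executor.shutdown()

parse_novel_detail and parse_novel_chapter would then call submit_tracked(...) wherever they currently call executor.submit(...), and the __main__ block would call wait_for_all() instead of wait(task_list, return_when=ALL_COMPLETED). Once every known future is done, no worker is still running, so no new futures can appear and the loop exits.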

  • Asker 吴大宝100 #1
    I use executor.submit(parse_novel_detail) when parsing the novel and executor.submit(parse_novel_chapter) when parsing chapter details. If I call wait() without return_when=ALL_COMPLETED, the program exits as soon as the novel details are parsed, and the executor.submit(parse_novel_chapter) calls made inside the detail parser never get to run. With return_when=ALL_COMPLETED, parse_novel_chapter does run, but after the last chapter finishes the program sits in what looks like a waiting state and never exits. How can I make it exit once everything is done?
    2020-07-12 12:48:01
  • bobby replied to asker 吴大宝100 #2
    When you get to the last piece of data you need to break out of the while loop.
    2020-07-13 18:31:58
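bobby's hint fits the recursive structure of parse_novel_chapter: rather than each chapter submitting the next one as a fresh task, the whole chain can run as a while loop inside a single task that breaks at the last chapter, so the one future in task_list completes and wait() returns. A rough sketch of that restructuring, reusing the script's own globals (headers, ua, domain) and eliding the per-chapter parsing:

def parse_novel_chapters(first_chapter_url):
    # Iterative version: follow the "next chapter" links inside one task,
    # so the single future submitted in __main__ covers the whole chain.
    chapter_url = first_chapter_url
    while chapter_url:
        headers['User-Agent'] = ua.random
        html = requests.get(chapter_url, headers=headers)
        html.encoding = 'gbk'
        sel = Selector(text=html.text)
        # ... parse and save this chapter exactly as parse_novel_chapter does ...
        next_btn = sel.xpath('//div[@class="bottem"]//a[contains(text(), "下一章")]/@href').extract()
        if next_btn and '.html' in next_btn[0]:
            chapter_url = parse.urljoin(domain, next_btn[0])
        else:
            break  # last chapter reached: exit the while loop so the task finishes

This trades chapter-level parallelism for a deterministic exit; several novels could still be scraped in parallel by submitting one such task per book.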