使用crawlspider模板做分布式为什么入库速度非常慢？数据也很少？-慕课网

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from scrapy_redis.spiders import RedisCrawlSpider

from items import BuyinfoItem, SellinfoItem, CompanyinfoItem
from utils.common import suiji_str, html_geshi, zhsha256, img_random, date_Handle, address_Handle, qq_Handle, url_qian, \
    Imgdloss, add_requests, list_extract, price_Handle


class EastsooSpider(RedisCrawlSpider):
    """Redis-fed crawl spider for www.eastsoo.com.

    Start URLs are read from the Redis key ``EasTsoo:start_urls``.  Three link
    rules route pages to the purchase, supply and company callbacks, each of
    which yields one item per page that carries the expected page marker.

    NOTE(review): ``add_requests`` and ``Imgdloss(...).xiazai()`` are called
    directly inside the callbacks.  If they perform synchronous HTTP requests
    (presumably they do -- confirm in utils.common), they block the Twisted
    reactor and serialise the whole crawl, which would explain a low
    items-per-minute rate.  Consider chaining ``scrapy.Request`` objects instead.
    """

    name = 'EasTsoo'
    allowed_domains = ['www.eastsoo.com']
    redis_key = '{0}:start_urls'.format(name)

    # NOTE(review): ``buy_html`` only yields when the page contains
    # ``dl.buyoffer_content`` and ``sell_html`` only when it contains
    # ``div.buy_top``.  If those CSS classes mirror the URL paths, the two
    # callbacks may be attached to the wrong rules and would rarely yield
    # anything -- confirm against live pages before changing.
    rules = (
        # Purchase requests.  Inside a character class ``|`` is a literal
        # pipe, not alternation, so the class is simply ``[\w-]``.
        Rule(LinkExtractor(allow=r"buy/[\w-]+\.html$"), callback='buy_html', follow=True),
        # Supply offers.
        Rule(LinkExtractor(allow=r"buyoffer/\w+\.html$"), callback='sell_html', follow=True),
        # Company home pages.
        Rule(LinkExtractor(allow=r"www\.eastsoo\.com/u\d+($|/$)"), callback='com_html', follow=True),
    )

    @staticmethod
    def _contact_field(page, label):
        """Run ``list_extract`` on the contact-table cell that follows the cell containing *label*."""
        query = "//table[@class='table']//td[contains(text(),'{0}')]/following-sibling::td/text()".format(label)
        return list_extract(page.xpath(query))

    def sell_html(self, response):
        """Yield a ``SellinfoItem`` for a supply page.

        Nothing is yielded when the ``buy_top line`` marker block is absent.
        The phone number comes from a secondary contact page when the page
        links to one; otherwise ``tele`` is left empty.
        """
        tong = response.xpath("//div[@class='buy_top line']")
        if not tong:
            return

        es_id = suiji_str()
        title = response.xpath("//head/title/text()").extract_first("")
        tags = response.xpath("//meta[@name='keywords']/@content").extract_first("")
        content = response.xpath("//meta[@name='description']/@content").extract_first("")
        content += html_geshi(response.xpath("//body").extract_first(""))

        # The image is only downloaded when img_random() returns a truthy value.
        if img_random():
            img_url = Imgdloss(
                response.xpath("//div[@class='x4 buy_top_pic']//img/@src").extract_first("")
            ).xiazai()
        else:
            img_url = ""

        company = response.xpath("//div[@class='buy_company']/div/a/text()").extract_first("").strip()
        city = response.xpath(
            "//ul[@class='buy_top_message']//font[contains(text(),'所在地区：')]/following-sibling::text()"
        ).extract_first("").replace(" ", "·")

        tele = ""
        xurl = response.xpath("//a[@class='button radius-small bg-main']/@href").extract_first("")
        if xurl:
            xphtml = add_requests("http://www.eastsoo.com", xurl)
            # Defensive: assumes add_requests may return None when the fetch
            # fails -- TODO confirm against utils.common.
            if xphtml is not None:
                tele = list_extract(xphtml.xpath("//td[contains(text(),'机：')]/following-sibling::td/text()"))

        price = price_Handle(
            response.xpath("//font[contains(text(),'当前价格：')]/following-sibling::span/text()").extract_first("")
        )

        # Hand the item over to the pipelines.
        SellInfo = SellinfoItem()
        SellInfo['es_id'] = es_id
        SellInfo['title'] = title
        SellInfo['tags'] = tags
        SellInfo['content'] = content
        SellInfo['url'] = response.url
        SellInfo['url_id'] = zhsha256(response.url)
        SellInfo['img_url'] = img_url
        SellInfo['company'] = company
        SellInfo['city'] = city
        SellInfo['tele'] = tele
        SellInfo['price'] = price
        yield SellInfo

    def buy_html(self, response):
        """Yield a ``BuyinfoItem`` for a purchase page.

        Nothing is yielded when the ``buyoffer_content`` marker block is absent.
        """
        tong = response.xpath("//dl[@class='buyoffer_content']")
        if not tong:
            return

        es_id = suiji_str()
        title = response.xpath("//head/title/text()").extract_first("")
        tags = response.xpath("//meta[@name='keywords']/@content").extract_first("").strip()
        content = response.xpath("//meta[@name='description']/@content").extract_first("")
        content += html_geshi(response.xpath("//body").extract_first(""))
        url = response.url
        url_id = zhsha256(url)
        img_url = ""
        # The contact person's name is stored in the ``company`` field.
        company = response.xpath(
            "//dl[@class='buyoffer_content']//li[contains(text(),'联系人：')]/text()"
        ).extract_first("").replace("联系人：", "")
        fabu_date = date_Handle(
            response.xpath(
                "//dl[@class='buyoffer_content']//span[contains(text(),'发布时间：')]/following-sibling::text()"
            ).extract_first("")
        )

        # Hand the item over to the pipelines.
        BuyInfo = BuyinfoItem()
        BuyInfo['es_id'] = es_id
        BuyInfo['title'] = title
        BuyInfo['tags'] = tags
        BuyInfo['content'] = content
        BuyInfo['url'] = url
        BuyInfo['url_id'] = url_id
        BuyInfo['img_url'] = img_url
        BuyInfo['company'] = company
        BuyInfo['fabu_date'] = fabu_date
        yield BuyInfo

    def com_html(self, response):
        """Yield a ``CompanyinfoItem`` for a company home page.

        The item is only yielded when the page carries the ``shop_index_top``
        marker block, links to a contact page, and that contact page contains
        the contact table.
        """
        tong = response.xpath("//div[@class='width margin-top-big shop_index_top']")
        if not tong:
            return

        es_id = suiji_str()
        title = response.xpath("//head/title/text()").extract_first("")
        tags = response.xpath("//meta[@name='keywords']/@content").extract_first("").strip()
        content = html_geshi(response.xpath("//body").extract_first(""))
        url = response.url
        url_id = zhsha256(url)
        img_url = Imgdloss(
            response.xpath("//dl[@class='shop_company_content']//img/@src").extract_first("")
        ).xiazai()
        company = response.xpath("//div[@class='shop_left_company_name']/text()").extract_first("")

        xurl = response.xpath("//div[@id='top_menu']//a[contains(text(),'联系方式')]/@href").extract_first("")
        if not xurl:
            # No contact link: there is nothing to fetch, so do not call
            # add_requests with an empty URL.
            return
        xphtml = add_requests(xurl, '')
        # Defensive: assumes add_requests may return None when the fetch
        # fails -- TODO confirm against utils.common.
        if xphtml is None:
            return
        tongs = xphtml.xpath("//table[@class='table']")
        if not tongs:
            return

        contacts = self._contact_field(xphtml, '联　系：').replace("先生", "").replace("女士", "").strip()
        tele = self._contact_field(xphtml, '电　话：').strip().replace("*", "")
        mobile = self._contact_field(xphtml, '手　机：').strip().replace("*", "")
        # Fax is left empty; the page exposes it as an image:
        # //td[contains(text(),'传真：')]/following-sibling::td/img/@src
        fax = ""
        address = self._contact_field(xphtml, '地　址：').strip().replace("*", "")
        qq = self._contact_field(xphtml, 'Ｑ　Ｑ：').strip().replace("*", "")
        wangwang = ""

        # Hand the item over to the pipelines.
        ComInfo = CompanyinfoItem()
        ComInfo['es_id'] = es_id
        ComInfo['title'] = title
        ComInfo['tags'] = tags
        ComInfo['content'] = content
        ComInfo['url'] = url
        ComInfo['url_id'] = url_id
        ComInfo['img_url'] = img_url
        ComInfo['company'] = company
        ComInfo['contacts'] = contacts
        ComInfo['tele'] = tele
        ComInfo['mobile'] = mobile
        ComInfo['fax'] = fax
        ComInfo['address'] = address
        ComInfo['qq'] = qq
        ComInfo['wangwang'] = wangwang
        yield ComInfo

class MysqlTwistedpipline(object):
    """Insert scraped items into MySQL asynchronously via a Twisted connection pool.

    To enable it, register this class in the ``ITEM_PIPELINES`` setting.
    Items must provide ``get_insert_sql()`` returning ``(sql, params)``.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.number = 0  # number of items submitted for insertion
        self.erorr = 0   # number of failed insertions (attribute name kept as-is for callers)

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` entries of the Scrapy settings."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            port=settings["MYSQL_PORT"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DB"],
            charset="utf8",
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # cp_reconnect makes the pool detect dead connections (for example
        # after the server's wait_timeout closed an idle one) and replace
        # them, instead of failing every later query with
        # (2006, 'MySQL server has gone away').  The interaction that hits
        # the dead connection still fails once and reaches handle_error.
        dbpool = adbapi.ConnectionPool("MySQLdb", cp_reconnect=True, **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert on the pool's thread pool and pass the item on.

        Returns the item so that later pipelines and the scraper log receive
        it; returning nothing hands them ``None``.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        self.number += 1
        print("-" * 30, "\n执行【异步插入】pipeline\n第{0}条数据插入\n".format(self.number), "-" * 30)
        query.addErrback(self.handle_error, item, spider)  # report failed inserts
        return item

    def handle_error(self, failure, item, spider):
        """Errback for a failed insert: count it and print the failure with the item URL."""
        self.erorr += 1
        print(failure, item['url'])
        print("-" * 30, "\n执行【异步插入】erorr\n第{0}条数据插入\n".format(self.erorr), "-" * 30)

    def do_insert(self, cursor, item):
        """Execute the item's own INSERT statement; called by the pool with a transaction cursor."""
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

2回答

提问者玖河 2019-04-30 09:59:03

2019-04-25 19:07:58 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://login.1688.com/member/signin.htm?from=sm&Done=http://detail.1688.com/offer/568004942128.html> from <GET https://detail.1688.com/offer/568004942128.html>
2019-04-25 19:07:59 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): 125.107.223.61:4267
2019-04-25 19:08:00 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): 117.42.235.94:4221
2019-04-25 19:08:00 [urllib3.connectionpool] DEBUG: http://117.42.235.94:4221 "GET http://baidu.com/ HTTP/1.1" 200 81
2019-04-25 19:08:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://login.1688.com/member/signin.htm?from=sm&Done=http://detail.1688.com/offer/532667680177.html> (referer: https://younaimei.1688.com/page/offerlist.htm?tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=3)
2019-04-25 19:08:00 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): 123.169.38.231:9077
2019-04-25 19:08:07 [urllib3.connectionpool] DEBUG: http://123.169.38.231:9077 "GET http://baidu.com/ HTTP/1.1" 200 81
2019-04-25 19:08:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://login.1688.com/member/signin.htm?from=sm&Done=http://detail.1688.com/offer/532667680177.html>

None
2019-04-25 19:08:07 [twisted] CRITICAL: Rollback failed
Traceback (most recent call last):
File "c:\users\administrator\envs\xbscrapy\lib\site-packages\twisted\python\threadpool.py", line 250, in inContext
result = inContext.theWork()
File "c:\users\administrator\envs\xbscrapy\lib\site-packages\twisted\python\threadpool.py", line 266, in <lambda>
inContext.theWork = lambda: context.call(ctx, func, *args, **kw)
File "c:\users\administrator\envs\xbscrapy\lib\site-packages\twisted\python\context.py", line 122, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "c:\users\administrator\envs\xbscrapy\lib\site-packages\twisted\python\context.py", line 85, in callWithContext
return func(*args,**kw)
--- <exception caught here> ---
File "c:\users\administrator\envs\xbscrapy\lib\site-packages\twisted\enterprise\adbapi.py", line 472, in _runInteraction
conn.rollback()
File "c:\users\administrator\envs\xbscrapy\lib\site-packages\twisted\enterprise\adbapi.py", line 52, in rollback
self._connection.rollback()
MySQLdb._exceptions.OperationalError: (2006, 'MySQL server has gone away')

2019-04-25 19:08:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://detail.1688.com/offer/570274810860.html> (referer: https://younaimei.1688.com/page/offerlist.htm?tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=3)
2019-04-25 19:08:08 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): 117.42.235.76:4251
2019-04-25 19:08:13 [urllib3.connectionpool] DEBUG: http://117.42.235.76:4251 "GET http://baidu.com/ HTTP/1.1" 200 81

0 回复有任何疑惑可以回复我~

收起回答

bobby 2019-04-27 15:24:49

这里入库慢的可能性非常多：1. 是不是虽然url很多，但是符合入库要求的url比较少？2. 是不是入库的时候有错误？3. 是不是很多数据重复抓取了？4. 是不是数据库性能比较低？

0 回复有任何疑惑可以回复我~

收起回答

提问者玖河 #1

经常出现这个错误：MySQLdb._exceptions.OperationalError: (2006, 'MySQL server has gone away')

还有另外一个问题
Redis内还有requests，制造也还在运行状态，但是却没有进行进一步爬取，而是处于等待状态

回复有任何疑惑可以回复我~ 2019-04-30 10:02:29

提问者玖河 #2

MySQLdb._exceptions.OperationalError: (2006, 'MySQL server has gone away')

这个问题网上说是长连接造成的，但是因为我用腾讯云的数据库，最大连接等待值只能到7200，请问一下有没有更好的解决方案？

回复有任何疑惑可以回复我~ 2019-04-30 10:04:28

bobby 回复提问者玖河 #3
```
你加我qq 442421039 我看看
```
回复有任何疑惑可以回复我~ 2019-05-02 22:48:35

使用crawlspider模板做分布式为什么入库速度非常慢？数据也很少？

正在回答回答被采纳积分+3

2回答

相似问题

请选择置顶位置

本课精华内容

Missing argument grant_type

MysqlTwistedPipline的对象没有cursor属性

知乎answer提取不到

为什么知乎数据导入不到MySQL数据库中？

运行 scrapy crawl jobbole 报错

【讨论题】你认为什么是 JS 逆向？

有没有方法可以比较准确的解析出 title 和正文内容

如何将数据的保存和抓取独立出来？

如何将 nodejs 服务集成进来呢？

【讨论题】字体反爬应该如何解析？

热搜

最近搜索清空