python 代码
from urllib import parse
import scrapy
from scrapy import Request
import requests
import re
import json
from ArticleSpier.items import JobBoleArticleItem
from ArticleSpier.utils import common
from ArticleSpier.items import ArticleItemLoader
import time
from selenium import webdriver
class ZhihuSpider(scrapy.Spider):
name = 'zhihu'
allowed_domains = ["www.zhihu.com"]
start_urls = ['https://www.zhihu.com/']
def start_requests(self):
# browser = webdriver.Chrome(executable_path="/Users/lucasma/Downloads/chromedriver")
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
browser = webdriver.Chrome(executable_path="/Users/lucasma/Downloads/chromedriver",
chrome_options=chrome_options)
browser.get("https://www.zhihu.com/signin?next=%2F")
browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[1]/div/form/div[1]/div[2]').click()
# time.sleep(60)
browser.find_element_by_xpath(
'//*[@id="root"]/div/main/div/div/div/div[1]/div/form/div[2]/div/label/input').send_keys("130***8963")
browser.find_element_by_xpath(
'//*[@id="root"]/div/main/div/div/div/div[1]/div/form/div[3]/div/label/input').send_keys("cnmd****2021@!")
browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[1]/div/form/button').click()
time.sleep(60)
def parse(self, response):
pass
错误代码
/usr/local/bin/python3 /Users/lucasma/PycharmProjects/ArticleSpier/main.py
/Users/lucasma/PycharmProjects/ArticleSpier/ArticleSpier/images
2020-12-23 21:27:13 [scrapy.utils.log] INFO: Scrapy 1.7.0 started (bot: ArticleSpier)
2020-12-23 21:27:13 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.5 (default, Apr 25 2018, 14:23:58) - [GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h 22 Sep 2020), cryptography 3.2.1, Platform Darwin-20.1.0-x86_64-i386-64bit
2020-12-23 21:27:13 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ArticleSpier', 'NEWSPIDER_MODULE': 'ArticleSpier.spiders', 'SPIDER_MODULES': ['ArticleSpier.spiders']}
2020-12-23 21:27:13 [scrapy.extensions.telnet] INFO: Telnet Password: 5b622914ecbc2947
2020-12-23 21:27:13 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2020-12-23 21:27:13 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-12-23 21:27:13 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-12-23 21:27:13 [scrapy.middleware] INFO: Enabled item pipelines:
['ArticleSpier.pipelines.ArticleImagePipeline',
'ArticleSpier.pipelines.JsonWithEncodingPipeline',
'ArticleSpier.pipelines.MysqlTwistedPipeline',
'ArticleSpier.pipelines.ArticlespierPipeline']
2020-12-23 21:27:14 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62696/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "platformName": "any", "goog:chromeOptions": {"excludeSwitches": ["enable-automation"], "debuggerAddress": "127.0.0.1:9222", "extensions": [], "args": ["--disable-extensions"]}}}, "desiredCapabilities": {"browserName": "chrome", "version": "", "platform": "ANY", "goog:chromeOptions": {"excludeSwitches": ["enable-automation"], "debuggerAddress": "127.0.0.1:9222", "extensions": [], "args": ["--disable-extensions"]}}}
2020-12-23 21:27:14 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): 127.0.0.1:62696
2020-12-23 21:27:14 [urllib3.connectionpool] DEBUG: http://127.0.0.1:62696 "POST /session HTTP/1.1" 400 2002
2020-12-23 21:27:14 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
Unhandled error in Deferred:
2020-12-23 21:27:14 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 183, in crawl
return self._crawl(crawler, *args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 187, in _crawl
d = crawler.crawl(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 86, in crawl
start_requests = iter(self.spider.start_requests())
File "/Users/lucasma/PycharmProjects/ArticleSpier/ArticleSpier/spiders/zhihu.py", line 34, in start_requests
chrome_options=chrome_options)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 81, in __init__
desired_capabilities=desired_capabilities)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 157, in __init__
self.start_session(capabilities, browser_profile)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 252, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidArgumentException: Message: invalid argument: cannot parse capability: goog:chromeOptions
from invalid argument: unrecognized chrome option: excludeSwitches
2020-12-23 21:27:14 [twisted] CRITICAL:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 86, in crawl
start_requests = iter(self.spider.start_requests())
File "/Users/lucasma/PycharmProjects/ArticleSpier/ArticleSpier/spiders/zhihu.py", line 34, in start_requests
chrome_options=chrome_options)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 81, in __init__
desired_capabilities=desired_capabilities)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 157, in __init__
self.start_session(capabilities, browser_profile)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 252, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidArgumentException: Message: invalid argument: cannot parse capability: goog:chromeOptions
from invalid argument: unrecognized chrome option: excludeSwitches
Process finished with exit code 0
老师,什么原因 ,chrome 的原因?还是啥
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程