#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import json
import scrapy
# urljoin lives in urllib.parse on Python 3 and in the top-level
# urlparse module on Python 2; alias both to `parse` so the rest of
# the file can call parse.urljoin() uniformly.
try:
    from urllib import parse
except ImportError:  # Python 2 fallback — never use a bare except here
    import urlparse as parse
class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to zhihu.com via Selenium, then crawls with the
    captured session cookies.

    Login happens in start_requests(): a real Chrome browser fills in the
    sign-in form, the resulting cookies are pickled to disk and attached
    to the first Scrapy request.
    """

    name = 'zhihu_sel'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    # Request-URL template for the first page of answers of a question
    # start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    # zhihu rejects requests without a browser-like User-Agent, so these
    # headers must be attached to every request.
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.baidu.com/",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
    }

    custom_settings = {
        "COOKIES_ENABLED": True,
    }

    def parse(self, response):
        """Extract every URL from the page and follow it for further crawling.

        URLs of the form /question/xxx are meant to be downloaded and handed
        straight to the question-parsing callback.  The crawl logic itself is
        still a skeleton — the loop body is a TODO.
        """
        # Hrefs are usually relative; join with the response URL to make
        # them absolute before following.
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        for url in all_urls:
            # TODO: yield scrapy.Request(...) — route /question/xxx pages
            # to parse_question, everything else back to parse.
            pass

    def parse_question(self, response):
        # TODO: parse a question page.
        pass

    def parse_answer(self, response):
        # TODO: parse an answer API response.
        pass

    def start_requests(self):
        """Step 1: log in to zhihu with Selenium, then seed the crawl.

        Returns a single Request for start_urls[0] carrying the session
        cookies obtained from the logged-in browser.
        """
        import time
        import pickle
        from selenium import webdriver

        browser = webdriver.Chrome(
            executable_path="D:/tools/chromedriver/chromedriver64.exe")
        browser.get("https://www.zhihu.com/signin")
        # NOTE(review): credentials are hard-coded in source; move them to
        # settings or environment variables.
        browser.find_element_by_css_selector(
            ".SignFlow-accountInput.Input-wrapper Input").send_keys(
            "18374820162")  # user name
        browser.find_element_by_css_selector(
            ".SignFlow-password Input").send_keys(
            "glw0107")  # password
        browser.find_element_by_css_selector(
            ".Button.SignFlow-submitButton").click()
        # Crude fixed wait for the login redirect to finish; an explicit
        # WebDriverWait on a post-login element would be more reliable.
        time.sleep(10)

        Cookies = browser.get_cookies()
        print(Cookies)
        cookie_dict = {}
        for cookie in Cookies:
            # Persist each cookie to disk for later reuse.
            # NOTE(review): 'D:/scrapy/zhihu' looks like it is missing a
            # trailing '/' — files end up named 'zhihu<name>.zhihu' inside
            # D:/scrapy rather than in a zhihu/ directory; confirm intent.
            with open('D:/scrapy/zhihu' + cookie['name'] + '.zhihu', 'wb') as f:
                pickle.dump(cookie, f)
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()
        # BUG FIX: the original request omitted self.headers, so Scrapy's
        # default User-Agent was sent and zhihu answered HTTP 400
        # ("Crawled (400) <GET http://www.zhihu.com/>" in the log).
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True,
                               headers=self.headers, cookies=cookie_dict)]
-------------------------------------------------------------------------------------------------------
这是输出
C:\Users\1\Envs\article_spider\Scripts\python.exe "D:\program files\JetBrains\PyCharm 2017.3.2\helpers\pydev\pydevd.py" --multiproc --qt-support=auto --client 127.0.0.1 --port 62350 --file D:/py_project/ArticleSpider/main.py
pydev debugger: process 9800 is connecting
Connected to pydev debugger (build 173.4127.16)
2018-07-27 17:23:43 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: ArticleSpider)
2018-07-27 17:23:43 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 18.7.0, Python 3.5.4 (v3.5.4:3f56838, Aug 8 2017, 02:17:05) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g 2 Nov 2017), cryptography 2.1.4, Platform Windows-10-10.0.15063-SP0
2018-07-27 17:23:43 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ArticleSpider', 'SPIDER_MODULES': ['ArticleSpider.spiders'], 'NEWSPIDER_MODULE': 'ArticleSpider.spiders'}
2018-07-27 17:23:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.logstats.LogStats']
2018-07-27 17:23:43 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-07-27 17:23:43 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-07-27 17:23:43 [scrapy.middleware] INFO: Enabled item pipelines:
['ArticleSpider.pipelines.MysqlTwistedPipeline']
2018-07-27 17:23:43 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session {"desiredCapabilities": {"browserName": "chrome", "goog:chromeOptions": {"extensions": [], "args": []}, "platform": "ANY", "version": ""}, "capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "chrome", "goog:chromeOptions": {"extensions": [], "args": []}, "platformName": "any"}}}
2018-07-27 17:23:44 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:44 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/url {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "url": "https://www.zhihu.com/signin"}
2018-07-27 17:23:47 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:47 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/element {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "using": "css selector", "value": ".SignFlow-accountInput.Input-wrapper Input"}
2018-07-27 17:23:47 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:47 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/element/0.6425891660798901-1/value {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "id": "0.6425891660798901-1", "text": "18374820162", "value": ["1", "8", "3", "7", "4", "8", "2", "0", "1", "6", "2"]}
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/element {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "using": "css selector", "value": ".SignFlow-password Input"}
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/element/0.6425891660798901-2/value {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "id": "0.6425891660798901-2", "text": "glw0107", "value": ["g", "l", "w", "0", "1", "0", "7"]}
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/element {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "using": "css selector", "value": ".Button.SignFlow-submitButton"}
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/element/0.6425891660798901-3/click {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c", "id": "0.6425891660798901-3"}
2018-07-27 17:23:48 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:24:03 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/cookie {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c"}
[{'secure': False, 'expiry': 1532684325.727671, 'name': 'tgw_l7_route', 'httpOnly': False, 'value': '9553ebf607071b8b9dd310a140c349c5', 'path': '/', 'domain': 'www.zhihu.com'}, {'secure': False, 'expiry': 1548235429.176146, 'name': 'z_c0', 'httpOnly': True, 'value': '"2|1:0|10:1532683429|4:z_c0|92:Mi4xUWlXTkNnQUFBQUFBTU9hOThkYjJEU1lBQUFCZ0FsVk5wVEpJWEFDdmVKcnJlSl92dmVvR2ZBTFVoQ1NvNjBFVGdn|944d1db9f3f6bb1b78f967c8d987663d1eeda660906aa8ef7ed75f386990dc3d"', 'path': '/', 'domain': '.zhihu.com'}, {'secure': False, 'expiry': 1627291425.72777, 'name': 'q_c1', 'httpOnly': False, 'value': '53ce5a644d5e4a2daec599bd14701e9e|1532683426000|1532683426000', 'path': '/', 'domain': '.zhihu.com'}, {'secure': False, 'expiry': 1595755425.727722, 'name': '_zap', 'httpOnly': False, 'value': 'c85a1616-b07b-4e0c-aa63-168de8b40f73', 'path': '/', 'domain': '.zhihu.com'}, {'secure': False, 'expiry': 1627291425.727753, 'name': 'd_c0', 'httpOnly': False, 'value': '"ADDmvfHW9g2PThgUtPn3YmiypNmUcVWTG28=|1532683426"', 'path': '/', 'domain': '.zhihu.com'}, {'secure': False, 'name': '_xsrf', 'httpOnly': False, 'value': 'ad769b5c-301e-44e2-b3a3-1cc6563e0efe', 'path': '/', 'domain': '.zhihu.com'}, {'secure': False, 'expiry': 1535275428.092091, 'name': 'capsion_ticket', 'httpOnly': True, 'value': '"2|1:0|10:1532683428|14:capsion_ticket|44:NzA1YWEzYmQ5MTVlNDllZGJlZGQ4MWVhMTAyZDIyYTI=|bb02146e5a28c87704e8c87e3e11a6e3ea258eab27d41c96de45cf58c69d8e70"', 'path': '/', 'domain': '.zhihu.com'}]
2018-07-27 17:24:03 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:24:08 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:62370/session/645224376e6b6f99f2e739fb74ae5f7c/window {"sessionId": "645224376e6b6f99f2e739fb74ae5f7c"}
2018-07-27 17:24:08 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-07-27 17:24:08 [scrapy.core.engine] INFO: Spider opened
2018-07-27 17:24:08 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-07-27 17:24:08 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024
2018-07-27 17:24:13 [scrapy.core.engine] DEBUG: Crawled (400) <GET http://www.zhihu.com/> (referer: None)
2018-07-27 17:24:14 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <400 http://www.zhihu.com/>: HTTP status code is not handled or not allowed
2018-07-27 17:24:14 [scrapy.core.engine] INFO: Closing spider (finished)
2018-07-27 17:24:14 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 267,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 411,
'downloader/response_count': 1,
'downloader/response_status_count/400': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 7, 27, 9, 24, 14, 46255),
'httperror/response_ignored_count': 1,
'httperror/response_ignored_status_count/400': 1,
'log_count/DEBUG': 22,
'log_count/INFO': 8,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2018, 7, 27, 9, 24, 8, 762266)}
2018-07-27 17:24:14 [scrapy.core.engine] INFO: Spider closed (finished)
Process finished with exit code 0
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程