from
urllib
import
parse
import
re
import
json
import
scrapy
import
undetected_chromedriver
from
scrapy
import
Request
import
requests
class JobboleSpider(scrapy.Spider):
    """Spider for news.cnblogs.com that authenticates via a manual
    browser login.

    Flow: an undetected-chromedriver Chrome window is opened at the
    sign-in page, the operator logs in by hand and presses Enter at the
    console prompt, and the browser's session cookies are attached to
    every start request so the crawl runs as the logged-in user.
    """

    name = 'jobbole3'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']
    # Cookies must stay enabled so the harvested login cookies are sent
    # with every follow-up request.
    custom_settings = {"COOKIES_ENABLED": True}

    def start_requests(self):
        """Open Chrome for a manual login, copy its cookies, then yield
        one request per start URL carrying those cookies.

        Yields:
            scrapy.Request: one per URL in ``self.start_urls``, with the
            login cookies and a desktop user-agent, dedup-filter disabled.
        """
        # v2 driver evades the bot detection that plain Selenium trips
        # on the sign-in page.
        import undetected_chromedriver.v2 as uc

        browser = uc.Chrome()
        try:
            browser.get("https://account.cnblogs.com/signin")
            # Block until the operator has finished logging in by hand.
            input("请回车继续:")
            cookie_dict = {c['name']: c['value']
                           for c in browser.get_cookies()}
        finally:
            # Fix: the original never closed the browser, leaking a
            # Chrome process per crawl. Always shut it down.
            browser.quit()

        # Invariant across URLs — build once, outside the loop.
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
        }
        for url in self.start_urls:
            # dont_filter: start URLs must never be deduplicated away.
            yield scrapy.Request(url, cookies=cookie_dict,
                                 headers=headers, dont_filter=True)

    def parse(self, response):
        """Extract article links from the news listing page.

        NOTE(review): the URLs are collected but not yet followed or
        yielded as items — this callback is incomplete in the original;
        the dead trailing ``pass`` was removed.

        Args:
            response: the listing-page response for a start URL.
        """
        urls = response.css('div#news_list h2 a::attr(href)').extract()