老师,数据可以获取,但是 debug 到 parse_detail 就出错,希望老师能够帮忙解决一下,不然无法继续进行下去。
from urllib import parse
from scrapy.http import Request
import scrapy
import undetected_chromedriver.v2 as uc
class JobboleSpider(scrapy.Spider):
    """Spider for the cnblogs news list that logs in through a real browser.

    The login is done manually in a Chrome window driven by
    undetected-chromedriver; the resulting cookies are then handed over to
    Scrapy so that the subsequent requests are authenticated.
    """

    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']
    custom_settings = {
        # Let later requests reuse the cookies set on the first one.
        # This override only applies to this spider.
        "COOKIES_ENABLED": True
    }

    def start_requests(self):
        """Log in via a browser, then yield the start requests with its cookies.

        The method blocks on ``input()`` until the user has finished signing
        in inside the opened browser window and presses Enter.
        """
        # Plain selenium can be detected by some sites; undetected-chromedriver
        # is used here to avoid that.
        browser = uc.Chrome()
        try:
            browser.get("https://account.cnblogs.com/signin")
            input("回车继续:")
            cookies = browser.get_cookies()
        finally:
            # Always release the browser process, even if the login step fails.
            browser.quit()

        cookie_dic = {cookie['name']: cookie["value"] for cookie in cookies}

        # The headers do not depend on the URL, so build them once.
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
        }
        for url in self.start_urls:
            # Hand the browser cookies over to Scrapy.
            yield Request(url, headers=headers, cookies=cookie_dic,
                          callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Extract every news entry's URL and cover image from the list page.

        Yields one request per entry, handled by ``parse_detail``, carrying
        the cover image URL in ``meta["front_image_url"]`` (empty string when
        the entry has no image).
        """
        post_nodes = response.xpath("//div[@id='news_list']/div[@class='news_block']")
        for post_node in post_nodes:
            # The leading "." keeps the query relative to this node; a bare
            # "//" would search the whole document and always return the
            # first entry on the page.
            image_url = post_node.xpath(".//div[@class='entry_summary']/a/img/@src").extract_first("")
            post_url = post_node.xpath(".//h2[@class='news_entry']/a/@href").extract_first("")
            if not post_url:
                # Without a link, urljoin would give back the list page itself.
                continue
            if image_url:
                # Turn relative or protocol-relative sources into absolute URLs.
                image_url = parse.urljoin(response.url, image_url)
            url = parse.urljoin(response.url, post_url)
            self.logger.debug("news detail url: %s", url)
            # Hand the detail URL to the scheduler.
            yield Request(url=url, meta={"front_image_url": image_url},
                          callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        """Handle a news detail page (not implemented yet)."""
        pass
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程