# -*- coding: utf-8 -*-
import re
import time

import requests
from selenium import webdriver

from modle import *  # modle.py is assumed to provide the Renren_Spider peewee model
start_url = "http://www.zmz2019.com/resourcelist/?page=1&channel=tv&area=美国"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
domain = "http://www.zmz2019.com"
tail_url = "http://www.zmz2019.com/resource/26654"
# URL template for the per-resource JSON endpoint; fill in a real resource Id before requesting
complete_info = 'http://www.zmz2019.com/resource/index_json/rid/{}/channel/tv'
browser = webdriver.Chrome(executable_path=r'D:\Python37\chromedriver.exe')
browser.get(tail_url)
time.sleep(5)  # give the page time to load so the cookies are fully set
cookies = browser.get_cookies()
cookies_dict = {}
for item in cookies:
    cookies_dict[item["name"]] = item["value"]
browser.quit()  # the browser is only needed to capture cookies
def parse_tail(tail_url):
    # Detail page: this is where the extracted Chinese text comes out garbled
    tail_res = requests.get(tail_url, headers=headers, cookies=cookies_dict).text
    P_Tail_Title = r'<dl class="fr" id="operate_link">.*?<h2>.*?(.*?)<label id="play_status">'
    Tail_Title = re.findall(P_Tail_Title, tail_res, re.S)
    return Tail_Title
def parse_topic(url):
    res = requests.get(url, headers=headers).text
    # Grab the resource list block, then split it into the individual <li> items
    p_List = r'<div class="resource-showlist has-point">.*?<ul>(.*?)</ul>'
    lists = re.findall(p_List, res, re.S)[0].strip()
    p_Lis = r'<li class="clearfix">(.*?)</li>'
    Lis = re.findall(p_Lis, lists, re.S)
    for li in Lis:
        movie = Renren_Spider()  # one model instance per list item
        P_Title = r'<div class="fl-info">.*?</strong>(.*?)</a>'
        Title = re.findall(P_Title, li, re.S)[0]
        P_Href = r'<div class="fl-info">.*?<a href="(.*?)"'
        Href = re.findall(P_Href, li, re.S)[0]
        p_Id = r"/.*?/(\d+)"
        Id = int(re.findall(p_Id, Href)[0])
        Href = domain + Href
        p_type = r'<p>【类型】(.*?)</p>'
        type = re.findall(p_type, li, re.S)[0]
        movie.Id = Id
        movie.Title = Title
        movie.Href = Href
        movie.Type = type
        # Update the row if this Id already exists, otherwise insert a new one
        exist_id = Renren_Spider.select().where(Renren_Spider.Id == movie.Id)
        if exist_id:
            movie.save()
        else:
            movie.save(force_insert=True)
    # Follow the "下一页" (next page) link, stopping after page 5
    P_pages = r'<div class="pages">(.*?)</div>'
    pages = re.findall(P_pages, res, re.S)[0]
    P_next_page = r"</b> <a href='(.*?)'>下一页"
    next_page = re.findall(P_next_page, pages)[0]
    p_check = r"/resourcelist/\?page=(.*?)&"
    check = int(re.findall(p_check, next_page)[0])
    if check <= 5:
        # if next_page:
        next_page = domain + next_page
        parse_topic(next_page)
if __name__ == "__main__":
    # parse_topic(start_url)
    parse_tail(tail_url)
Dear Booby老师,

Hello! I recently wrote the spider above to practice regular expressions, but when it parses the detail page (parse_tail()), all of the Chinese text in the extracted nodes comes out as mojibake. I tried passing headers and cookies into requests.get(), yet the scraped text is still garbled. Could you please advise how to solve this?

Thank you!
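
For reference, a stripped-down version of the detail-page request that still produces the garbled text is below. The two encoding prints are just something I added to show what requests reports; headers and cookies_dict are the same objects set up in the script above.

resp = requests.get(tail_url, headers=headers, cookies=cookies_dict)
print(resp.encoding)            # the encoding requests guessed from the response headers
print(resp.apparent_encoding)   # the encoding detected from the response body
print(resp.text[:200])          # the Chinese characters in this text are garbled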