# coding=gbk
import json
import multiprocessing
import os
import re
import time

import pandas as pd
import requests

from Lagou.setting import proxy_ip, User_agent
class Handle_lagou(object):
    """Scrape python job postings from lagou.com, city by city, into job.csv.

    A single requests session is kept so that cookies obtained from the
    priming GET requests are reused by the Ajax POST requests.
    """

    def __init__(self):
        # Session shared across requests so cookies persist between the
        # priming GET and the Ajax POSTs.
        self.lagou_session = requests.session()
        # Rotating User-Agent supplied by the project settings module.
        self.header = {
            'User-Agent': User_agent()
        }
        # Populated by handle_city(); empty until then.
        self.city_list = []

    def handle_city(self):
        """Fetch the all-cities page and populate ``self.city_list``."""
        # City names are the link texts of anchors of the form
        # www.lagou.com/<city>/ on the all-cities page.
        city_search = re.compile(r'www\.lagou\.com\/.*\/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_requests(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)
        # Drop the cookies from the city-list request so each city's job
        # requests start from a clean cookie jar.
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Scrape every result page of python jobs for one city into job.csv.

        Returns early (doing nothing) when the city has no job listings.
        """
        # Priming GET: obtains the cookies the Ajax POST endpoint requires.
        first_url = 'https://www.lagou.com/jobs/list_python?&px=default&city={}'.format(city)
        first_response = self.handle_requests(method="GET", url=first_url)
        # Total page count is rendered as <span class="span totalNum">N</span>.
        total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
        match = total_page_search.search(first_response)
        if match is None:
            # No pagination element: the city has no python jobs.
            # (The original swallowed ALL exceptions here with a bare
            # try/except; an explicit None check hides nothing else.)
            return
        page_num = match.group(1)
        print(city, page_num + "页")
        for page in range(1, int(page_num) + 1):
            # Form data for the Ajax POST request.
            data = {
                "pn": page,
                "kd": "python"
            }
            # Ajax endpoint returning the job list as JSON.
            page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false" % city
            # The Referer the endpoint expects; it contains non-ASCII city
            # names, so it is sent encoded as in the original.
            referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
            self.header['Referer'] = referer_url.encode()
            response = self.handle_requests(method="POST", url=page_url, data=data, info=city)
            lagou_data = json.loads(response)
            job_list = lagou_data['content']['positionResult']['result']
            # Append this page to job.csv; write the CSV header only when the
            # file is first created (the original emitted a header row before
            # every single page, corrupting the file).
            # NOTE(review): several worker processes append concurrently, so
            # rows from different cities may interleave — confirm acceptable.
            write_header = not os.path.exists('job.csv')
            pd.DataFrame(job_list).to_csv('job.csv', mode='a', index=False, header=write_header)

    def handle_requests(self, method, url, data=None, info=None, cookies=None):
        """Perform one HTTP request and return the decoded body (str).

        Retries indefinitely on network errors and whenever lagou answers
        with its "too frequent" (频繁) anti-scraping block page, clearing
        cookies and re-priming the session before retrying.

        :param method: "GET" or "POST".
        :param url:    target URL.
        :param data:   form data for POST requests.
        :param info:   city name used to rebuild the priming URL on a block.
        """
        while True:
            try:
                # Fresh proxy for every attempt.
                proxy = proxy_ip()
                if method == "GET":
                    # timeout added so a stalled connection cannot hang the
                    # worker forever (the original GET had none).
                    response = self.lagou_session.get(url=url, headers=self.header, timeout=6)
                elif method == "POST":
                    response = self.lagou_session.post(url=url, headers=self.header, data=data, proxies=proxy, timeout=6)
                else:
                    raise ValueError("unsupported method: %s" % method)
                response.encoding = "utf-8"
            except requests.RequestException:
                # The original fell through after a bare except and then read
                # the unbound `response`, crashing with NameError; instead,
                # back off briefly and retry the same request.
                print("\n\n" + "请求中报错" + "\n\n")
                time.sleep(3)
                continue
            if '频繁' in response.text:
                # Anti-scraping block page detected: clear cookies, re-prime
                # the session via the listing page, back off, then retry.
                print("频繁")
                self.lagou_session.cookies.clear()
                first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
                self.handle_requests(method="GET", url=first_request_url)
                time.sleep(10)
                continue
            return response.text
if __name__ == '__main__':
    # Build the scraper and discover the list of cities first.
    scraper = Handle_lagou()
    scraper.handle_city()
    # Fan the per-city scraping out across two worker processes.
    worker_pool = multiprocessing.Pool(2)
    for city_name in scraper.city_list:
        worker_pool.apply_async(scraper.handle_city_job, args=(city_name,))
    worker_pool.close()
    worker_pool.join()