class proxyMiddleware(object):
    """Scrapy downloader middleware that routes each request through a
    random proxy (matched to the request's URL scheme) read from the
    local file ``proxy_ip.txt``, attaching HTTP Basic proxy auth.

    ``proxy_ip.txt`` is expected to contain comma-separated proxy URLs,
    each starting with ``http://`` or ``https://``.
    """

    # NOTE(review): credentials are hard-coded in source — move them to
    # Scrapy settings or an environment variable before shipping.
    PROXY_USER_PASS = "18520864631:150145lqfli"

    def process_request(self, request, spider):
        """Pick a proxy matching the request's scheme and set Basic auth.

        Mutates ``request.meta['proxy']`` and the ``Proxy-Authorization``
        header in place; returns None so Scrapy continues processing.
        """
        # "https" is listed first so it wins over the "http" prefix.
        scheme = re.match("https|http", request.url).group()
        request.meta["proxy"] = self.get_random_ip(scheme)
        encoded_user_pass = base64.b64encode(
            self.PROXY_USER_PASS.encode(encoding='utf-8')).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

    def process_response(self, request, response, spider):
        """On any non-200 response, swap in a fresh proxy and retry.

        Returning the request re-schedules it; otherwise the response is
        passed through unchanged.
        """
        if response.status != 200:
            scheme = re.match("https|http", request.url).group()
            request.meta["proxy"] = self.get_random_ip(scheme)
            return request
        return response

    def get_random_ip(self, scheme):
        """Return a random proxy URL from ``proxy_ip.txt`` for *scheme*.

        *scheme* is ``"http"`` or ``"https"``; any other value falls back
        to the https pool (preserving the original else-branch behavior).
        The file is re-read on every call so the pool can be refreshed
        externally without restarting the crawler.
        """
        with open("proxy_ip.txt", "r") as rf:
            entries = rf.read().split(',')
        pools = {"http": [], "https": []}
        for entry in entries:
            # Classify by prefix; https:// does not match the http bucket
            # because startswith("http://") requires the trailing slash.
            if entry.startswith("http://"):
                pools["http"].append(entry)
            elif entry.startswith("https://"):
                pools["https"].append(entry)
        pool = pools["http"] if scheme == "http" else pools["https"]
        return random.choice(pool).strip()
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程