class proxyMiddleware(object):
    """Scrapy downloader middleware that routes each request through a
    random proxy whose scheme (http/https) matches the request URL.

    Proxies are read from ``proxy_ip.txt`` (comma-separated proxy URLs).
    On a non-200 response the request is re-issued through a fresh proxy.
    """

    # WARNING(security): hard-coded proxy credentials; these belong in
    # Scrapy settings or an environment variable, not in source control.
    PROXY_USER_PASS = "18520864631:150145lqfli"

    def process_request(self, request, spider):
        """Attach a random scheme-matching proxy and its Basic-auth header."""
        request.meta["proxy"] = self.get_random_ip(self._scheme_of(request.url))
        encoded_user_pass = base64.b64encode(
            self.PROXY_USER_PASS.encode(encoding='utf-8')).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

    def process_response(self, request, response, spider):
        """Pass 200 responses through; otherwise swap in a new proxy and
        return the request so Scrapy reschedules it."""
        if response.status != 200:
            request.meta["proxy"] = self.get_random_ip(self._scheme_of(request.url))
            return request  # returning the request re-queues the download
        return response

    @staticmethod
    def _scheme_of(url):
        """Return "https" or "http" for *url*; default to "http".

        The original called ``re.match(...).group()`` unguarded, which
        raised AttributeError for any URL not starting with http/https.
        """
        m = re.match("https|http", url)
        return m.group() if m else "http"

    def get_random_ip(self, type):  # `type` shadows the builtin; kept for compatibility
        """Pick a random proxy URL of the given scheme from proxy_ip.txt.

        Args:
            type: "http" selects http:// proxies; any other value selects
                https:// proxies (matches the original if/else).

        Raises:
            IndexError: if no proxy of the requested scheme is listed.
        """
        with open("proxy_ip.txt", "r") as rf:
            # Strip each entry BEFORE filtering: the original filtered on
            # startswith() first, so entries with leading whitespace or a
            # newline after the comma were silently discarded.
            entries = [entry.strip() for entry in rf.read().split(',')]
        prefix = "http://" if type == "http" else "https://"
        pool = [ip for ip in entries if ip.startswith(prefix)]
        return random.choice(pool)
带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎
了解课程