Zhihu has switched to a slider captcha, and only the slider image and the notched background image are available. I found some write-ups online showing that the slide distance and the slide track can be computed from just those two images, mainly using third-party image-processing modules such as numpy and opencv-python for the recognition. The author wrapped it all into a slideVerfication class and then calls that class from the main method. (A quick sketch of the slide-track idea follows the links below.)
The details are in these two links.
Link 2: the main explanation, comments plus partial code:
https://cloud.tencent.com/developer/article/1737736
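As for the slide track mentioned above, the article's track code is not reproduced here, but the usual idea is to split the total distance into small, human-looking steps that speed up and then slow down. A minimal sketch of that idea (the function name and step count are my own, not the article's):

def get_track(distance, steps=30):
    """Split a total slide distance into per-step pixel offsets that
    accelerate, then decelerate (ease-in-out), to look human."""
    track = []
    prev = 0
    for i in range(1, steps + 1):
        x = i / steps
        eased = x * x * (3 - 2 * x)   # smoothstep curve: slow-fast-slow
        offset = round(distance * eased)
        track.append(offset - prev)   # per-step delta; deltas sum to `distance`
        prev = offset
    return track

For example, get_track(120) returns a list of small x offsets whose sum is exactly 120.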
The problem I ran into:
I took this method from his slideVerfication class:
def onload_save_img(self, url, filename="image.png"):
    """
    Download an image and save it.
    :param url: image URL
    :param filename: filename to save the image as
    :return:
    """
    try:
        response = requests.get(url=url)
    except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as e:
        print("Image download failed")
        raise e
    else:
        with open(filename, "wb") as f:
            f.write(response.content)
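For context, the way I understand it being called (the instance name and the two URL variables here are placeholders of my own, not the article's):

sv = slideVerfication()
# download both captcha images with the method above
sv.onload_save_img(slider_url, filename="slider.jpg")
sv.onload_save_img(background_url, filename="background.jpg")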
I adapted it slightly and added it to my own spider, following the approach from your videos 5-6 and 5-7 (completing a simulated login with Selenium recognizing the captcha automatically), trying to rework it for the slider captcha.
The result is that the slider image and the notched background image fail to download.
Below is the simulated-login part of the Zhihu spider I wrote (not finished yet).
Code:
import scrapy
import time
import pickle
import requests
import cv2
import numpy
from mouse import move, click
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com/signin']
    start_urls = ['https://www.zhihu.com/signin']

    def parse(self, response):
        pass

    def start_requests(self):
        chrome_option = Options()
        chrome_option.add_argument("--disable-extensions")
        chrome_option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        browser = webdriver.Chrome(executable_path="D:/pythonProject/chromedriver.exe", chrome_options=chrome_option)
        try:
            browser.maximize_window()
        except:
            pass
        browser.get("https://www.zhihu.com/signin")
        # Account + password login (mouse move and click)
        time.sleep(2)
        move(921, 348)
        click()
        time.sleep(2)
        # Before typing the account and password, select all with "Ctrl+A"
        # so the browser's autofilled content gets overwritten
        browser.find_element_by_css_selector('.SignFlow-account input').send_keys(Keys.CONTROL + "a")
        time.sleep(5)
        browser.find_element_by_css_selector('.SignFlow-account input').send_keys("13721093306")
        browser.find_element_by_css_selector('.SignFlow-password input').send_keys(Keys.CONTROL + "a")
        time.sleep(5)
        browser.find_element_by_css_selector('.SignFlow-password input').send_keys("xxxxx")
        # browser.find_element_by_xpath('//lable[@class="SignFlow-accountInput Input-wrapper"]/input').send_keys("13721093306")
        # browser.find_element_by_xpath('//lable[@class="Input-wrapper"]/input').send_keys("xxxxx")
        # Click the login button
        time.sleep(2)
        # move(959, 564)
        # click()
        browser.find_element_by_css_selector('.Button.SignFlow-submitButton').click()
        time.sleep(10)
        # Simulate the slide verification
        # Switch into the slider captcha's iframe
        tcaptcha = browser.find_element_by_id("tcaptcha_iframe")
        browser.switch_to.frame(tcaptcha)
        # Get the slide-related elements
        # The node that gets dragged
        slide_element = browser.find_element_by_id('tcaptcha_drag_thumb')
        # The slider image node, 68 * 68 on page (intrinsic: 136 * 136)
        slideBlock_ele = browser.find_element_by_id('slideBlock')
        # The notched background image node
        slideBg = browser.find_element_by_id('slideBg')
        # Get the captcha image URLs
        slider_url = "https://t.captcha.qq.com" + slideBlock_ele.get_attribute("src")
        background_url = "https://t.captcha.qq.com" + slideBg.get_attribute("src")
        # ========== How should the image saving below be implemented inside this start_requests(self) method? ==========
        # Download the captcha background image and the slider image
        slider = "slider.jpg"
        background = "background.jpg"
        try:
            response = requests.get(url=slider_url)
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as e:
            print("Image download failed")
            raise e
        else:
            with open(slider, "wb") as f:
                f.write(response.content)
        try:
            response = requests.get(url=background_url)
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as e:
            print("Image download failed")
            raise e
        else:
            with open(background, "wb") as f:
                f.write(response.content)
        # ======================================================================================================
        # Read both images as grayscale, which yields numpy array data
        slider_pic = cv2.imread(slider, 0)
        background_pic = cv2.imread(background, 0)
        # Get the shape of the slider array -- the slider image's width and height
        width, height = slider_pic.shape[::-1]
        # Save the processed images under new names
        slider01 = "slider01.jpg"
        background_01 = "background01.jpg"
        cv2.imwrite(background_01, background_pic)
        cv2.imwrite(slider01, slider_pic)
        # Re-read the saved slider image
        slider_pic = cv2.imread(slider01)
        # Convert the color space
        slider_pic = cv2.cvtColor(slider_pic, cv2.COLOR_BGR2GRAY)
        # Take the absolute color difference (invert the grayscale)
        slider_pic = abs(255 - slider_pic)
        # Save the image
        cv2.imwrite(slider01, slider_pic)
        # Read the slider back
        slider_pic = cv2.imread(slider01)
        # Read the background image
        background_pic = cv2.imread(background_01)
        # Compare the overlapping regions of the two images
        result = cv2.matchTemplate(slider_pic, background_pic, cv2.TM_CCOEFF_NORMED)
        # Get the gap position from the match result
        top, left = numpy.unravel_index(result.argmax(), result.shape)
        # Coordinates of the gap in the background image
        print("Current gap position of the slider:", (left, top, left + width, top + height))
The error output when I run it:
D:\pythonProject\venv\Scripts\python.exe D:/pythonProject/ArticleSpider/main.py
Image download failed
Unhandled error in Deferred:
2021-02-28 16:59:30 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "D:\pythonProject\venv\lib\site-packages\scrapy\crawler.py", line 192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\scrapy\crawler.py", line 196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\twisted\internet\defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "D:\pythonProject\venv\lib\site-packages\twisted\internet\defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "D:\pythonProject\venv\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "D:\pythonProject\venv\lib\site-packages\scrapy\crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
File "D:\pythonProject\ArticleSpider\ArticleSpider\spiders\zhihu.py", line 129, in start_requests
raise e
File "D:\pythonProject\ArticleSpider\ArticleSpider\spiders\zhihu.py", line 126, in start_requests
response = requests.get(url=slider_url)
File "D:\pythonProject\venv\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='t.captcha.qq.comhttps', port=443): Max retries exceeded with url: //t.captcha.qq.com/hycdn?index=2&image=937148127155333120?aid=2012031314&sess=s0ZaB1N45S-To0MBjJuCousb2-WZQpI3Yw72hfgtzmSfOf6kDz5XCfl9I2SWqsg1L6REZDHI1lfP0uYLXY3MoGyN6XOYF2B90XRBOwUrC1_9D-119Z9HwefhG1sAzKxAzU7Ml0vQZ9X_uFcFPq7IaOoHHkhER7xVA6O62joLi_1woP0k-Gir1a95DTHMf4-I2HfwctAcbVTgKWR8K7bmeNCuB9rEaJ7h89QbigdLUcPT7CbaNqcOIcIw**&sid=6771715379983822849&img_index=2&subsid=4 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001942BAE9B88>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
2021-02-28 16:59:30 [twisted] CRITICAL:
Traceback (most recent call last):
File "D:\pythonProject\venv\lib\site-packages\urllib3\connection.py", line 170, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw
File "D:\pythonProject\venv\lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "D:\Python\lib\socket.py", line 748, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\pythonProject\venv\lib\site-packages\urllib3\connectionpool.py", line 706, in urlopen
chunked=chunked,
File "D:\pythonProject\venv\lib\site-packages\urllib3\connectionpool.py", line 382, in _make_request
self._validate_conn(conn)
File "D:\pythonProject\venv\lib\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
conn.connect()
File "D:\pythonProject\venv\lib\site-packages\urllib3\connection.py", line 353, in connect
conn = self._new_conn()
File "D:\pythonProject\venv\lib\site-packages\urllib3\connection.py", line 182, in _new_conn
self, "Failed to establish a new connection: %s" % e
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x000001942BAE9B88>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\pythonProject\venv\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "D:\pythonProject\venv\lib\site-packages\urllib3\connectionpool.py", line 756, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "D:\pythonProject\venv\lib\site-packages\urllib3\util\retry.py", line 573, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='t.captcha.qq.comhttps', port=443): Max retries exceeded with url: //t.captcha.qq.com/hycdn?index=2&image=937148127155333120?aid=2012031314&sess=s0ZaB1N45S-To0MBjJuCousb2-WZQpI3Yw72hfgtzmSfOf6kDz5XCfl9I2SWqsg1L6REZDHI1lfP0uYLXY3MoGyN6XOYF2B90XRBOwUrC1_9D-119Z9HwefhG1sAzKxAzU7Ml0vQZ9X_uFcFPq7IaOoHHkhER7xVA6O62joLi_1woP0k-Gir1a95DTHMf4-I2HfwctAcbVTgKWR8K7bmeNCuB9rEaJ7h89QbigdLUcPT7CbaNqcOIcIw**&sid=6771715379983822849&img_index=2&subsid=4 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001942BAE9B88>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\pythonProject\venv\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "D:\pythonProject\venv\lib\site-packages\scrapy\crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
File "D:\pythonProject\ArticleSpider\ArticleSpider\spiders\zhihu.py", line 129, in start_requests
raise e
File "D:\pythonProject\ArticleSpider\ArticleSpider\spiders\zhihu.py", line 126, in start_requests
response = requests.get(url=slider_url)
File "D:\pythonProject\venv\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "D:\pythonProject\venv\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='t.captcha.qq.comhttps', port=443): Max retries exceeded with url: //t.captcha.qq.com/hycdn?index=2&image=937148127155333120?aid=2012031314&sess=s0ZaB1N45S-To0MBjJuCousb2-WZQpI3Yw72hfgtzmSfOf6kDz5XCfl9I2SWqsg1L6REZDHI1lfP0uYLXY3MoGyN6XOYF2B90XRBOwUrC1_9D-119Z9HwefhG1sAzKxAzU7Ml0vQZ9X_uFcFPq7IaOoHHkhER7xVA6O62joLi_1woP0k-Gir1a95DTHMf4-I2HfwctAcbVTgKWR8K7bmeNCuB9rEaJ7h89QbigdLUcPT7CbaNqcOIcIw**&sid=6771715379983822849&img_index=2&subsid=4 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001942BAE9B88>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
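One thing I notice in the traceback: the host is 't.captcha.qq.comhttps', i.e. the "https://t.captcha.qq.com" prefix got glued onto a src that is already an absolute URL (Selenium's get_attribute("src") returns the resolved URL). A hedged guess at the fix, using urljoin so both relative and absolute src values work:

from urllib.parse import urljoin

# urljoin is a no-op when the second argument is already an absolute URL,
# and resolves it against the base when it is relative
slider_url = urljoin("https://t.captcha.qq.com", slideBlock_ele.get_attribute("src"))
background_url = urljoin("https://t.captcha.qq.com", slideBg.get_attribute("src"))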
Teacher, when will the Zhihu simulated-login lesson be updated?