Python爬蟲selenium驗證之中文識別點選+圖片驗證碼案例(最新推薦)
1.獲取圖片
# Step 1: open the GeeTest demo page and save screenshots of the captcha
# prompt (the characters to click) and of the captcha background image.
import re
import time

import ddddocr
import requests
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
try:
    # 1. Open the demo page.
    driver.get('https://www.geetest.com/adaptive-captcha-demo')

    # 2. Select the "text click" captcha type (polls every 0.5 s, up to 30 s).
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.XPATH,
        '//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
    ))
    tag.click()

    # 3. Click the button that starts the verification.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.CLASS_NAME,
        'geetest_btn_click'
    ))
    tag.click()

    # Fixed pause so the captcha has time to appear before taking screenshots.
    time.sleep(5)

    # Prompt image: the characters that have to be clicked.
    target_tag = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
    target_tag.screenshot("target.png")

    # Background image: the picture in which the characters must be located.
    bg_tag = driver.find_element(By.CLASS_NAME, 'geetest_bg')
    bg_tag.screenshot("bg.png")

    # Keep the browser open so the result can be inspected manually.
    time.sleep(2000)
finally:
    # quit() (rather than close()) ends the whole WebDriver session, and the
    # finally clause makes sure that happens even when a step above raises.
    driver.quit()
2.目標識別
截圖每個字符,并基于ddddocr識別。
# Step 2: open the GeeTest text-click captcha and OCR each prompt character
# (the characters the user is asked to click) with ddddocr.
import re
import time

import ddddocr
import requests
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
try:
    # 1. Open the demo page.
    driver.get('https://www.geetest.com/adaptive-captcha-demo')

    # 2. Select the "text click" captcha type.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.XPATH,
        '//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
    ))
    tag.click()

    # 3. Click the button that starts the verification.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.CLASS_NAME,
        'geetest_btn_click'
    ))
    tag.click()

    # 4. Wait for the captcha to appear.
    time.sleep(5)

    # 5. Recognise the prompt characters. Each one is a separate <img>, so
    #    every image is screenshotted and classified on its own. The OCR
    #    model is built once and reused instead of once per character.
    ocr = ddddocr.DdddOcr(show_ad=False)
    parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
    tag_list = parent.find_elements(By.TAG_NAME, "img")
    target_word_list = [
        ocr.classification(tag.screenshot_as_png) for tag in tag_list
    ]
    print("要識別的文字:", target_word_list)

    # Keep the browser open so the result can be inspected manually.
    time.sleep(2000)
finally:
    # End the WebDriver session even when a step above raises.
    driver.quit()
3.背景坐標識別
3.1 ddddocr
能識別,但是發現默認識別率有點低,想要提升識別率,可以搭建 PyTorch
環境對模型進行訓練,參考:https://github.com/sml2h3/dddd_trainer
# Step 3.1: locate every character in the captcha background with ddddocr's
# detection model, then classify each cropped character to build a
# {character: [center_x, center_y]} map.
import re
import time
from io import BytesIO

import ddddocr
import requests
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
try:
    # 1. Open the demo page.
    driver.get('https://www.geetest.com/adaptive-captcha-demo')

    # 2. Select the "text click" captcha type.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.XPATH,
        '//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
    ))
    tag.click()

    # 3. Click the button that starts the verification.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.CLASS_NAME,
        'geetest_btn_click'
    ))
    tag.click()

    # 4. Wait for the captcha to appear.
    time.sleep(5)

    # One classification model is shared by the prompt characters and the
    # cropped background characters instead of being rebuilt per image.
    classifier = ddddocr.DdddOcr(show_ad=False)

    # 5. Recognise the prompt characters (one <img> per character).
    parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
    tag_list = parent.find_elements(By.TAG_NAME, "img")
    target_word_list = [
        classifier.classification(tag.screenshot_as_png) for tag in tag_list
    ]
    print("要識別的文字:", target_word_list)

    # 6. Grab the background image as PNG bytes.
    bg_tag = driver.find_element(By.CLASS_NAME, 'geetest_bg')
    content = bg_tag.screenshot_as_png

    # 7. Detect the bounding box of every character in the background.
    detector = ddddocr.DdddOcr(show_ad=False, det=True)
    poses = detector.detection(content)  # [(x1, y1, x2, y2), ...]

    # 8. Crop each box, classify it, and remember the center of the box.
    #    The default model's accuracy on these crops is low.
    bg_word_dict = {}
    img = Image.open(BytesIO(content))
    for box in poses:
        x1, y1, x2, y2 = box

        # Re-encode the cropped character as PNG bytes for the classifier.
        crop = img.crop(box)
        img_byte = BytesIO()
        crop.save(img_byte, 'png')

        word = classifier.classification(img_byte.getvalue())

        # If two boxes yield the same character, the later one wins.
        bg_word_dict[word] = [int((x1 + x2) / 2), int((y1 + y2) / 2)]

    print(bg_word_dict)

    # Keep the browser open so the result can be inspected manually.
    time.sleep(1000)
finally:
    # End the WebDriver session even when a step above raises.
    driver.quit()
3.2 打碼平臺
https://www.chaojiying.com/
# Step 3.2: send a captcha image to the Chaojiying recognition service and
# print the JSON answer (characters with their click coordinates).
import base64
from hashlib import md5

import requests

# Read the image inside a context manager so the file handle is closed.
with open('5.jpg', 'rb') as image_file:
    file_bytes = image_file.read()

# NOTE(review): the endpoint is plain HTTP, so the account name and password
# hash travel unencrypted; replace the placeholder credentials with your own.
res = requests.post(
    url='http://upload.chaojiying.net/Upload/Processing.php',
    data={
        'user': "deng",
        'pass2': md5("密碼".encode('utf-8')).hexdigest(),
        'codetype': "9501",
        'file_base64': base64.b64encode(file_bytes)
    },
    headers={
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    },
    # Without a timeout requests may wait forever on an unresponsive server.
    timeout=60,
)
# Fail loudly on an HTTP error instead of trying to parse an error page.
res.raise_for_status()
res_dict = res.json()
print(res_dict)
# Sample response:
# {'err_no': 0, 'err_str': 'OK', 'pic_id': '1234612060701120002',
#  'pic_str': '的,86,73|粉,111,38|菜,40,49|香,198,101',
#  'md5': 'faac71fc832b2ead01ffb4e813f3be60'}
結合極驗案例截圖+識別:
# Step 3.2 (combined): screenshot the GeeTest text-click captcha, OCR the
# prompt characters locally, and ask Chaojiying for the coordinates of every
# character in the background image.
import base64
import re
import time
from hashlib import md5
from io import BytesIO

import ddddocr
import requests
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
try:
    # 1. Open the demo page.
    driver.get('https://www.geetest.com/adaptive-captcha-demo')

    # 2. Select the "text click" captcha type.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.XPATH,
        '//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
    ))
    tag.click()

    # 3. Click the button that starts the verification.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.CLASS_NAME,
        'geetest_btn_click'
    ))
    tag.click()

    # 4. Wait for the captcha to appear.
    time.sleep(5)

    # 5. Recognise the prompt characters (one <img> per character); the OCR
    #    model is built once and reused.
    ocr = ddddocr.DdddOcr(show_ad=False)
    parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
    tag_list = parent.find_elements(By.TAG_NAME, "img")
    target_word_list = [
        ocr.classification(tag.screenshot_as_png) for tag in tag_list
    ]
    print("要識別的文字:", target_word_list)

    # 6. Grab the background image as PNG bytes and also save it to disk.
    bg_tag = driver.find_element(By.CLASS_NAME, 'geetest_bg')
    content = bg_tag.screenshot_as_png
    bg_tag.screenshot("bg.png")

    # 7. Let Chaojiying locate every character in the background.
    # NOTE(review): plain-HTTP endpoint and placeholder credentials.
    res = requests.post(
        url='http://upload.chaojiying.net/Upload/Processing.php',
        data={
            'user': "deng",
            'pass2': md5("密碼".encode('utf-8')).hexdigest(),
            'codetype': "9501",
            'file_base64': base64.b64encode(content)
        },
        headers={
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        },
        # Without a timeout requests may wait forever on a stalled upload.
        timeout=60,
    )
    res.raise_for_status()
    res_dict = res.json()
    print(res_dict)

    # 8. Build {character: (x, y)}, e.g. {"鴨": (196, 85), ...}.
    #    "pic_str" has the form "char,x,y|char,x,y|...". Without a usable
    #    value the split below would die with an opaque unpacking error, so
    #    report the service's answer instead.
    pic_str = res_dict.get("pic_str")
    if not pic_str:
        raise RuntimeError(f"captcha recognition failed: {res_dict}")

    # The OCR result from step 5 is kept as-is. The original overwrote it
    # here with a hard-coded sample list, which discarded the recognition.
    bg_word_dict = {}
    for item in pic_str.split("|"):
        word, x, y = item.split(",")
        # Store numeric coordinates, matching the example shape above.
        bg_word_dict[word] = (int(x), int(y))

    print(bg_word_dict)

    # Keep the browser open so the result can be inspected manually.
    time.sleep(1000)
finally:
    # End the WebDriver session even when a step above raises.
    driver.quit()
4.坐標(biāo)點擊
根據(jù)坐標(biāo),在驗證碼上進行點擊。
ActionChains(driver).move_to_element_with_offset(標籤對象, xoffset=x, yoffset=y).click().perform()
# Step 4: full flow - OCR the prompt characters, get the coordinates of the
# background characters from Chaojiying, then click them in prompt order.
import base64
import re
import time
from hashlib import md5
from io import BytesIO

import ddddocr
import requests
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
try:
    # 1. Open the demo page.
    driver.get('https://www.geetest.com/adaptive-captcha-demo')

    # 2. Select the "text click" captcha type.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.XPATH,
        '//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
    ))
    tag.click()

    # 3. Click the button that starts the verification.
    tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
        By.CLASS_NAME,
        'geetest_btn_click'
    ))
    tag.click()

    # 4. Wait for the captcha to appear.
    time.sleep(5)

    # 5. Recognise the prompt characters (one <img> per character); the OCR
    #    model is built once and reused.
    ocr = ddddocr.DdddOcr(show_ad=False)
    parent = driver.find_element(By.CLASS_NAME, 'geetest_ques_back')
    tag_list = parent.find_elements(By.TAG_NAME, "img")
    target_word_list = [
        ocr.classification(tag.screenshot_as_png) for tag in tag_list
    ]
    print("要識別的文字:", target_word_list)

    # 6. Grab the background image as PNG bytes.
    bg_tag = driver.find_element(By.CLASS_NAME, 'geetest_bg')
    content = bg_tag.screenshot_as_png

    # 7. Let Chaojiying locate every character in the background.
    # NOTE(review): plain-HTTP endpoint and placeholder credentials.
    res = requests.post(
        url='http://upload.chaojiying.net/Upload/Processing.php',
        data={
            'user': "deng",
            'pass2': md5("自己密碼".encode('utf-8')).hexdigest(),
            'codetype': "9501",
            'file_base64': base64.b64encode(content)
        },
        headers={
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        },
        # Without a timeout requests may wait forever on a stalled upload.
        timeout=60,
    )
    res.raise_for_status()
    res_dict = res.json()

    # "pic_str" has the form "char,x,y|char,x,y|...". Without a usable value
    # the split below would die with an opaque unpacking error.
    pic_str = res_dict.get("pic_str")
    if not pic_str:
        raise RuntimeError(f"captcha recognition failed: {res_dict}")

    bg_word_dict = {}
    for item in pic_str.split("|"):
        word, x, y = item.split(",")
        bg_word_dict[word] = (int(x), int(y))
    print(bg_word_dict)
    # Example shapes at this point:
    #   target_word_list = ['粉', '菜', '香']
    #   bg_word_dict = {'粉': (10, 10), '菜': (50, 50), '香': (100, 93)}

    # 8. Click each prompt character in order.
    #    The service's coordinates are measured from the image's top-left
    #    corner; half the element size is subtracted so the offsets passed
    #    to move_to_element_with_offset are relative to the element's
    #    center. The size is read once rather than on every iteration.
    half_width = int(bg_tag.size['width'] / 2)
    half_height = int(bg_tag.size['height'] / 2)
    for word in target_word_list:
        time.sleep(2)
        group = bg_word_dict.get(word)
        if not group:
            # This character was not found in the background; skip it.
            continue
        x, y = group
        ActionChains(driver).move_to_element_with_offset(
            bg_tag, xoffset=x - half_width, yoffset=y - half_height
        ).click().perform()

    # Keep the browser open so the result can be inspected manually.
    time.sleep(1000)
finally:
    # End the WebDriver session even when a step above raises.
    driver.quit()
5.圖片驗證碼
在很多登錄、注冊、頻繁操作等行為時,一般都會加入驗證碼的功能。
如果想要基於代碼實現某些功能,就必須實現:自動識別驗證碼,然後再做其他功能。
6.識別
基于Python的模塊 ddddocr
可以實現對圖片驗證碼的識別。
pip3.11 install ddddocr==1.4.9 -i https://mirrors.aliyun.com/pypi/simple/ pip3.11 install Pillow==9.5.0
pip install ddddocr==1.4.9 -i https://mirrors.aliyun.com/pypi/simple/ pip install Pillow==9.5.0
6.1 本地識別
# 6.1 Local recognition: run ddddocr on a captcha image stored on disk.
import ddddocr

recognizer = ddddocr.DdddOcr(show_ad=False)

# The classifier takes the raw image bytes, so read the file in binary mode.
with open("img/v1.jpg", mode='rb') as image_file:
    image_bytes = image_file.read()

print(recognizer.classification(image_bytes))
6.2 在線識別
也可以直接請求獲取圖片,然后直接識別:
# 6.2 Online recognition: download a captcha image and classify it directly
# from the response bytes.
import ddddocr
import requests

res = requests.get(
    url="https://console.zbox.filez.com/captcha/create/reg?_t=1701511836608",
    # Without a timeout requests may wait forever on an unresponsive server.
    timeout=10,
)
# Stop on an HTTP error rather than feeding an error page to the OCR.
res.raise_for_status()

ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)
print(code)
# 6.2 Online recognition (second example): download the captcha image that
# belongs to a given captcha_token and classify it.
import ddddocr
import requests

res = requests.get(
    # Plain string: the original f-string prefix had no placeholders.
    url="https://api.ruanwen.la/api/auth/captcha?captcha_token=n5A6VXIsMiI4MTKoco0VigkZbByJbDahhRHGNJmS",
    # Without a timeout requests may wait forever on an unresponsive server.
    timeout=10,
)
# Stop on an HTTP error rather than feeding an error page to the OCR.
res.raise_for_status()

ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)
print(code)
6.3 base64
有些平臺的圖片是以 base64 編碼形式存在,需要先處理一下再識別。
# 6.3 base64: some sites deliver the captcha as a base64 string. Decode it
# back to raw image bytes and classify those bytes.
import base64

import ddddocr

encoded_image = "iVBORw0KGgoAAAANSUhEUgAAAGQAAAAoCAYAAAAIeF9DAAAHGElEQVR4Xu2a2VNTZxTAHZ/62of+BX3rdPrUmaq1da3WQWur1mqntrQWLe7UkUoQlEWFqFDZZN8hUBWKQUVpQDCyVUeltVWIIiAEZHWBAEk4zffZe+bmS+6SEEzE/GbOkHvPuXeY85t7vyWZBV48ilnsCS/uxSvEw3hthJydXWITnsiMFyLWfLGcu5jRQuQ2W27dy8ArBOTXvQxmrBBHm+xo/XQhKkTffRu0NSfgt8KvISttGaQlfQyFOWtBXboDbt7Ig6HBdvYSj6C7awDC3kqGg4oC8N0YC6uWhcPCOUHgszQMtvudgsK8Gnj2dNTqGmeErH47Z8rBYlfI6MgAXLqwH5Lj50iGJzL//UDJWP1pBDTfasNrnBEyHdgIef6sF1R5X9o0Xig8Ebb5QrF8QajlLTDoMTIIVkLMZiOcKfrWquGXLyqgo70BDKNDNG8wDFleCTehsS4JivK/4l/uMWxco4T4WDUo30yH+zo9PH0yavn/x+nnuBg1LPhgP0qJjjzjuUJuNGWiiJSEedByt4KffiWxt9bIz65GISveO2CVczcoxGQah+y05SikqT6ZX/fKw4khkfNGEQpZNDeILXUrKKT13mWUkZmyBIxGA7/O5XT2PoL0c9kQcPIX+D5yK2w/vgcis6Oh5qYWJicnac2msM0YrmRw4BkK+cwyA3OEuLh4CApS0GhoaGTTNtTXN2B9fHwCm7YBhdRWR6GQK5rD/BqXU9FQCb4RflYN50d0fgwYxsemTUjJ6ToUEh6iYtOi1NXVY4MTEhLZtA2khquvr69n0zagEP5gfu/uBXruga4aykr8ISv1E0g/tRBUueug+o8I6NH/hTdwFO3tOhsB9iK5NN2lQsbHjZbJSR9kplbC4nkKKsNnySHoejTAlooyOjoKisAXDSYRN2cvW4Lo9XqsCwkJpddKgUJyM3xQSH9fC9RUHbWabbFBnigy63KEEcMIbI3agU0mrynN9WoYfDoIJrOJ/iXH5DwryFnYqS4XP26Kg86OPrZcklPvBIBKVYSNVqvL6Tl7qNVqrCsqKmbTdkEhaUkfYbOv1cbaCLAXNVVR/HtJQl5VXIO3Ru+E3sHHbAmFnCf56RKyb3cmPLjfw5bKgjS/tVWHjY6IiISkd22FmEwmmuPqdDodW2IXFJKaOB8bnZr4IRQXbITWlst01U6ehJGRfnpM1h58KY68vo4VxGKD1doXr0UhSH66hHCx/+dsePJkhL1EkiSLlJDdQdjs5mbbHjQ3N2NeqTyGExUpUAgZJ7gmny32BeOE/ffdhOX8adUmrK2qlD9L2RWzFxvc3adn01Z09XW7RAiH0WiCx73DoKm8DVt8E1DK+tVRTknRaDTY8MzMLDZNz3F5jaaKTQuCQgpy1mCTH3U08Wts6LSs3Lnawty1bFoQMr3lGjxhnGDTVpC8K4XwMZvNELwvF6XEnTjHlkgyPDwMCkUwbTj5S47l5KRAIefL9mCThZ4ODpLnasnsSy6eIoTQ/vAxClm36iibloXQUyD19IiBQhquJTgnJHkRmxbEna8slgnLNJgT4uxq3d44QYJ8FhtfxEAhXZ3XscmdHeIrULLZyNWq8tazaUHcMagL0XKvC4WQ70ucwXYmdZ/OprhjkiM1joBCJifNkJe5kjaZDuoCWyfsoO7I1Ncd0157jI1NwC7/FBQSGJDFlsiGv9YoLi6m6w3uuLy8nC2XxGq395+/S7HRZNqra6m0rC4HLYOgif4lx8X5G7AmOX4uDPTLm18T7C0Mq65fsSwIh/5fGA7R46ksDDd8oYSTx89BnfZfut1O9q1MJjPdfm970EO3Tcj2PH/6e7XmDnsb2bCrcRLcsV7v+FrHSgh5SviDu1RUrAyB1tlX+beQRO7WSUppBn7+LtyPvY0g/EbLicOH5K2gxeDvV3GRmJjElsnCSgiBDNgV5wNtms8P8l3Jn41pluoXix0ixRExcjYXh58/weOflLvYWwjCNlwoyECennzJ8vTLW7CJQXZ9WSGNjeLjsBA2Qjja27RQeTEYcsJW0G2VjJTF9McO2toYwR83OCKFbL+nlWVBwK+BdDq87Zj19jvJc0ICE4PZywUhP3BQ/94E4QdU8MM3J+HzFZH0Bw5L5wfDGp8jsHdnBuRlVdNFoqswGAwQGnoQZZDPY2NjbJksBIUQHGkwhzPX2KPkShkKSTqbwqZnLIJCptLYqVxLaNd3gN/RbSik9paWLZmxuEVIcMohKL92EVo6dNA/PABGk5F+IdXW/RBOV5XA5iP+KMNfuRvGJ8bZW8xY3CKEHcTFouGO+L7aTMNjhWw+7P9avao4BIUQpBprDznXtPd0wJmqUjiSo4R98Qq6WPSN2EIXhBFZUXRAJ9Pe1xFRIQQ5DeZwpNaLfSSFEEijxZotlfciH1lCOLjGs+HFdTgkxMv08x9BPe61Ol73uQAAAABJRU5ErkJggg=="
content = base64.b64decode(encoded_image)

# To inspect the decoded image, write the bytes to a file:
# with open('x.png', mode='wb') as f:
#     f.write(content)

recognizer = ddddocr.DdddOcr(show_ad=False)
print(recognizer.classification(content))
7.案例:x文街
https://i.ruanwen.la/
# 7. Case study: log in to ruanwen.la by requesting a captcha, recognising
# it with ddddocr, and submitting it together with the credentials.
import ddddocr
import requests

# Seconds to wait for each HTTP call; without it requests may block forever.
REQUEST_TIMEOUT = 10

# Ask the API for a new captcha: it returns a token plus the image URL.
res = requests.post(
    url="https://api.ruanwen.la/api/auth/captcha/generate",
    timeout=REQUEST_TIMEOUT,
)
res.raise_for_status()
res_dict = res.json()

captcha_token = res_dict['data']['captcha_token']
captcha_url = res_dict['data']['src']

# Download the captcha image.
res = requests.get(captcha_url, timeout=REQUEST_TIMEOUT)
# Stop on an HTTP error rather than feeding an error page to the OCR.
res.raise_for_status()

# Recognise the captcha text.
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)
print(code)

# Authenticate; the captcha answer must be sent with the token it was
# generated for. Replace the placeholder mobile number and password.
res = requests.post(
    url="https://api.ruanwen.la/api/auth/authenticate",
    json={
        "mobile": "手機號",
        "device": "pc",
        "password": "密碼",
        "captcha_token": captcha_token,
        "captcha": code,
        "identity": "advertiser"
    },
    timeout=REQUEST_TIMEOUT,
)
print(res.json())
# Sample response on success (token shortened):
# {'success': True, 'message': '驗證成功', 'data': {'token': 'eyJ0eXAiOiJKV1Qi...'}, 'status': 200}
到此這篇關於Python爬蟲selenium驗證-中文識別點選+圖片驗證碼案例的文章就介紹到這了,更多相關Python selenium驗證內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以後多多支持腳本之家!
- 使用 Python 和 Selenium 解決 Cloudflare 驗證碼的問題
- python+selenium行為鏈登錄12306(滑動驗證碼滑塊)
- Python Selenium破解滑塊驗證碼最新版(GEETEST95%以上通過率)
- Python +Selenium解決圖片驗證碼登錄或注冊問題(推薦)
- Selenium+Python 自動化操控登錄界面實例(有簡單驗證碼圖片校驗)
- selenium+python實現1688網站驗證碼圖片的截取功能
- Python使用selenium實現網頁用戶名 密碼 驗證碼自動登錄功能
- Python Selenium Cookie 繞過驗證碼實現登錄示例代碼
- python+selenium識別驗證碼并登錄的示例代碼
相關文章
Python調用Java可執行jar包問題
這篇文章主要介紹了Python調用Java可執行jar包問題,具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教 2022-12-12 numpy數組的重塑和轉置實現
本文主要介紹了numpy數組的重塑和轉置實現,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧 2023-03-03 Python模擬登陸淘寶並統計淘寶消費情況的代碼實例分享
借助urllib、urllib2和BeautifulSoup等幾個模塊的常用爬蟲開發組合,我們能夠輕易實現一份淘寶對賬單,這里我們就來看一則Python模擬登陸淘寶並統計淘寶消費情況的代碼實例分享: 2016-07-07