Python爬蟲使用實(shí)例wallpaper問題記錄
1/ 排雷避坑
?? 中文亂碼問題
# Demonstrates the problem: .text decodes this GBK-declared page with the wrong
# charset, so the printed Chinese comes out garbled (see the <meta charset="gbk"> note).
print(requests.get(url=url,headers=headers).text)
出現(xiàn)中文亂碼
原因分析:
<meta charset="gbk" />
解決方法:
法一:
# Method 1: let requests sniff the page's real charset instead of its default guess.
resp = requests.get(url=url, headers=headers)
resp.encoding = resp.apparent_encoding  # auto-detect encoding to avoid mojibake
print(resp.text)
法二:
# Method 2: take the raw bytes and decode with the charset the page declares.
raw = requests.get(url=url, headers=headers).content
print(raw.decode('gbk'))
2/ 數(shù)據(jù)來源
css解析
# Walk the thumbnail list: pull the detail-page link and the caption of each entry.
for li in lis:
    href = li.css('a::attr(href)').get()  # link to the wallpaper's detail page
    title = li.css('b::text').get()       # caption shown under the thumbnail
    print(href, title)
刪掉標(biāo)題為空的那一張圖
獲取圖片url
有的網(wǎng)站,保存的數(shù)據(jù)是裂開的圖片,可能是因?yàn)檫@個(gè)參數(shù):
3/ 正則處理
處理圖片url和標(biāo)題的時(shí)候用了re模塊
電腦壁紙
通過匹配非數(shù)字字符并在遇到數(shù)字時(shí)截?cái)嘧址?/p>
# Read the full title from the detail page, then keep only the text before the
# first digit (titles look like "名稱 1920x1080 ...").
title1 = selector1.css('.photo .photo-pic img::attr(title)').get()
# Pass maxsplit by keyword: the positional form of re.split's maxsplit is
# deprecated since Python 3.13.
modified_title = re.split(r'\d', title1, maxsplit=1)[0].strip()
re.split(r'\d', title, 1)
將 title 字符串按第一個(gè)數(shù)字進(jìn)行分割。返回的列表的第一個(gè)元素就是數(shù)字前面的部分。strip()
去掉字符串首尾的空白字符。
URL 圖片路徑替換:點(diǎn)開圖片后到達(dá)的頁面里拿到的仍然是 HTML 頁面地址,而不是 https://….jpg 這樣的圖片直鏈,所以把它替換成另一個(gè)能真正獲取到圖片的頁面地址。
https://sj.zol.com.cn/bizhi/detail_{num1}_{num2}.html
正則替換修改為
https://app.zol.com.cn/bizhi/detail_{num1}.html
例如 https://sj.zol.com.cn/bizhi/detail_12901_139948.html
轉(zhuǎn)換為 https://app.zol.com.cn/bizhi/detail_12901.html
.
# Rewrite a gallery detail URL onto the app host, dropping the per-image id:
#   https://sj.zol.com.cn/bizhi/detail_12901_139948.html
#     -> https://app.zol.com.cn/bizhi/detail_12901.html
url = "https://sj.zol.com.cn/bizhi/detail_12901_139948.html"
pattern = r'https://sj\.zol\.com\.cn/bizhi/detail_(\d+)_\d+\.html'
replacement = r'https://app.zol.com.cn/bizhi/detail_\1.html'
new_url = re.sub(pattern, replacement, url)
print(url, new_url)
4/ 電腦壁紙
?? 單線程單頁
適用于當(dāng)頁面和第一頁
# Single-threaded scraper for one list page of 4K wallpapers.
import os
import re

import requests
import parsel

url = 'https://pic.netbian.com/4kmeinv/'  # request URL
# Spoof a desktop browser UA so the site serves the normal page.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# The site serves GBK-encoded pages; decode explicitly to avoid mojibake.
html_data = requests.get(url=url, headers=headers).content.decode('gbk')
selector = parsel.Selector(html_data)
for li in selector.css('.slist li'):
    title = li.css('b::text').get()
    if not title:  # skip the ad slot whose <b> caption is empty
        continue
    # FIX: the "href =" assignments were lost in the original paste, leaving
    # dangling "+ li.css(...)" expressions and an undefined name. The list-page
    # href is site-relative, so prefix the domain.
    href = 'https://pic.netbian.com' + li.css('a::attr(href)').get()
    # Fetch the detail page once (the original fetched it twice, once unused).
    detail_html = requests.get(url=href, headers=headers).content.decode('gbk')
    selector1 = parsel.Selector(detail_html)
    # The full-size image URL lives on the detail page, not the list page.
    img_url = 'https://pic.netbian.com' + selector1.css('.photo .photo-pic img::attr(src)').get()
    img_content = requests.get(url=img_url, headers=headers).content
    # Re-read the title from the detail page (the list-page one is truncated),
    # keeping only the text before the first digit.
    title1 = selector1.css('.photo .photo-pic img::attr(title)').get()
    modified_title = re.split(r'\d', title1, maxsplit=1)[0].strip()
    os.makedirs('img', exist_ok=True)  # FIX: original crashed if img\ was missing
    with open('img\\' + modified_title + '.jpg', mode='wb') as f:
        f.write(img_content)
    print('正在保存:', modified_title, img_url)
?? 單線程多page
適用于從第二頁開始的多頁
# Single-threaded scraper for list pages 2..9.
# URL pattern: https://pic.netbian.com/4kmeinv/index_{page}.html
# (page 1 has no "_N" suffix, hence the separate single-page script.)
import os
import re
import time

import requests
import parsel

start_time = time.time()
# Spoof a desktop browser UA; hoisted out of the loop (it never changes).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
for page in range(2, 10):
    print(f'--------- 正在爬取第{page}的內(nèi)容 ----------')
    url = f'https://pic.netbian.com/4kmeinv/index_{page}.html'  # request URL
    # GBK-encoded page; decode explicitly to avoid mojibake.
    html_data = requests.get(url=url, headers=headers).content.decode('gbk')
    selector = parsel.Selector(html_data)
    for li in selector.css('.slist li'):
        title = li.css('b::text').get()
        if not title:  # skip the ad slot whose <b> caption is empty
            continue
        # FIX: the "href =" assignments were lost in the original paste,
        # leaving dangling "+ li.css(...)" expressions and an undefined name.
        href = 'https://pic.netbian.com' + li.css('a::attr(href)').get()
        detail_html = requests.get(url=href, headers=headers).content.decode('gbk')
        selector1 = parsel.Selector(detail_html)
        # The full-size image URL lives on the detail page, not the list page.
        img_url = 'https://pic.netbian.com' + selector1.css('.photo .photo-pic img::attr(src)').get()
        img_content = requests.get(url=img_url, headers=headers).content
        # Detail-page title is complete (list-page one is truncated); keep the
        # text before the first digit.
        title1 = selector1.css('.photo .photo-pic img::attr(title)').get()
        modified_title = re.split(r'\d', title1, maxsplit=1)[0].strip()
        os.makedirs('img', exist_ok=True)  # FIX: original crashed if img\ was missing
        with open('img\\' + modified_title + '.jpg', mode='wb') as f:
            f.write(img_content)
        print('正在保存:', modified_title, img_url)
stop_time = time.time()
print(f'耗時(shí):{int(stop_time)-int(start_time)}秒')
運(yùn)行效果:
?? 多線程多頁
# Multi-threaded scraper: each worker downloads one list page's wallpapers.
import concurrent.futures
import os
import re
import time

import requests
import parsel


def get_img(url):
    """Download every wallpaper linked from list page `url` into img1\\."""
    # Spoof a desktop browser UA.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
    # GBK-encoded page; decode explicitly to avoid mojibake.
    html_data = requests.get(url=url, headers=headers).content.decode('gbk')
    selector = parsel.Selector(html_data)
    for li in selector.css('.slist li'):
        title = li.css('b::text').get()
        if not title:  # skip the ad slot whose <b> caption is empty
            continue
        # FIX: the "href =" assignments were lost in the original paste,
        # leaving dangling "+ li.css(...)" expressions and an undefined name.
        href = 'https://pic.netbian.com' + li.css('a::attr(href)').get()
        selector1 = parsel.Selector(
            requests.get(url=href, headers=headers).content.decode('gbk'))
        # The full-size image URL lives on the detail page, not the list page.
        img_url = 'https://pic.netbian.com' + selector1.css('.photo .photo-pic img::attr(src)').get()
        img_content = requests.get(url=img_url, headers=headers).content
        # Detail-page title is complete; keep the text before the first digit.
        title1 = selector1.css('.photo .photo-pic img::attr(title)').get()
        modified_title = re.split(r'\d', title1, maxsplit=1)[0].strip()
        img_folder = 'img1\\'
        os.makedirs(img_folder, exist_ok=True)
        with open(img_folder + modified_title + '.jpg', mode='wb') as f:
            f.write(img_content)
        print('正在保存:', modified_title, img_url)


def main(url):
    get_img(url)


start_time = time.time()
# Context manager guarantees shutdown (and joins workers) even on error.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for page in range(2, 12):
        print(f'--------- 正在爬取第{page}的內(nèi)容 ----------')
        url = f'https://pic.netbian.com/4kmeinv/index_{page}.html'  # request URL
        executor.submit(main, url)
stop_time = time.time()
print(f'耗時(shí):{int(stop_time) - int(start_time)}秒')
5/ 手機(jī)壁紙
類似地,另一個(gè)網(wǎng)站,圖片集合多頁,點(diǎn)開之后里面有多張圖片
先試圖獲取外部的,再獲取里面的,然后2個(gè)一起
?? 單線程單頁0
# Save the list-page thumbnails from the phone-wallpaper index (first page only).
import os
import re
import requests
import parsel

url = 'https://sj.zol.com.cn/bizhi/5/'  # request URL
# Spoof a desktop browser UA.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
resp = requests.get(url=url, headers=headers)
sel = parsel.Selector(resp.text)
for entry in sel.css('.pic-list2 li'):
    caption = entry.css('.pic img::attr(title)').get()
    if not caption:  # entries without a caption are not wallpapers
        continue
    # Only the thumbnail src is reachable from the list page.
    thumb_src = entry.css('img::attr(src)').get()
    picture = requests.get(url=thumb_src, headers=headers).content
    img_folder = 'img3\\'
    if not os.path.exists(img_folder):
        os.makedirs(img_folder)
    with open(img_folder + caption + '.jpg', mode='wb') as f:
        f.write(picture)
    print('正在保存:', caption, thumb_src)
?? 單線程單頁1
# Download every image from one album detail page.
import os
import requests
import parsel

url = 'https://app.zol.com.cn/bizhi/detail_12901.html'  # request URL
# Spoof a desktop browser UA.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
resp = requests.get(url=url, headers=headers)
sel = parsel.Selector(resp.text)
counter = 0  # running index across all saved images
for album_item in sel.css('.album-list li'):
    # Every <img> src inside this list item.
    for src in album_item.css('img::attr(src)').getall():
        data = requests.get(url=src, headers=headers).content
        img_folder = 'img4\\'
        if not os.path.exists(img_folder):
            os.makedirs(img_folder)
        with open(img_folder + str(counter) + '.jpg', mode='wb') as f:
            f.write(data)
        print('正在保存:', counter, src)
        counter += 1
?? 單線程單頁
# List page -> album page (URL rewritten onto app.zol.com.cn) -> all images.
import os
import re
import requests
import parsel

url = 'https://sj.zol.com.cn/bizhi/5/'  # request URL
# Spoof a desktop browser UA.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
resp = requests.get(url=url, headers=headers)
sel = parsel.Selector(resp.text)
# .photo-list-padding excludes the 3 "猜你喜歡" entries at the bottom.
for entry in sel.css('.pic-list2 .photo-list-padding'):
    caption = entry.css('.pic img::attr(title)').get()
    link = entry.css('a::attr(href)').get()
    # /bizhi/detail_12901_139948.html -> https://app.zol.com.cn/bizhi/detail_12901.html
    pattern = r'/bizhi/detail_(\d+)_\d+\.html'
    replacement = r'https://app.zol.com.cn/bizhi/detail_\1.html'
    album_url = re.sub(pattern, replacement, link)
    album_resp = requests.get(url=album_url, headers=headers)
    album_sel = parsel.Selector(album_resp.text)
    counter = 0  # restarts per album: files are "<caption>_<n>.jpg"
    for item in album_sel.css('.album-list li'):
        for src in item.css('img::attr(src)').getall():
            data = requests.get(url=src, headers=headers).content
            img_folder = 'img5\\'
            if not os.path.exists(img_folder):
                os.makedirs(img_folder)
            with open(img_folder + caption + '_' + str(counter) + '.jpg', mode='wb') as f:
                f.write(data)
            print('正在保存:', caption + '_' + str(counter), src)
            counter += 1
?? 單線程多頁
# Multi-page version: page 1 has a bare URL, later pages append "{page}.html".
import os
import re
import requests
import parsel

# Spoof a desktop browser UA (constant, so built once).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
for page in range(1, 3):
    print(f'--------- 正在爬取第{page}的內(nèi)容 ----------')
    if page == 1:
        url = 'https://sj.zol.com.cn/bizhi/5/'  # request URL
    else:
        url = f'https://sj.zol.com.cn/bizhi/5/{page}.html'  # request URL
    resp = requests.get(url=url, headers=headers)
    sel = parsel.Selector(resp.text)
    # .photo-list-padding excludes the 3 "猜你喜歡" entries at the bottom.
    for entry in sel.css('.pic-list2 .photo-list-padding'):
        caption = entry.css('.pic img::attr(title)').get()
        link = entry.css('a::attr(href)').get()
        # /bizhi/detail_12901_139948.html -> https://app.zol.com.cn/bizhi/detail_12901.html
        album_url = re.sub(r'/bizhi/detail_(\d+)_\d+\.html',
                           r'https://app.zol.com.cn/bizhi/detail_\1.html',
                           link)
        album_sel = parsel.Selector(requests.get(url=album_url, headers=headers).text)
        counter = 0  # restarts per album: files are "<caption>_<n>.jpg"
        for item in album_sel.css('.album-list li'):
            for src in item.css('img::attr(src)').getall():
                data = requests.get(url=src, headers=headers).content
                img_folder = 'img6\\'
                if not os.path.exists(img_folder):
                    os.makedirs(img_folder)
                with open(img_folder + caption + '_' + str(counter) + '.jpg', mode='wb') as f:
                    f.write(data)
                print('正在保存:', caption + '_' + str(counter), src)
                counter += 1
?? 多線程多頁
# Multi-threaded phone-wallpaper scraper: one worker per list page.
import os
import re
import time
import requests
import parsel
import concurrent.futures


def get_imgs(url):
    """Download every album image linked from list page `url` into img7\\."""
    # Spoof a desktop browser UA.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
    resp = requests.get(url=url, headers=headers)
    sel = parsel.Selector(resp.text)
    # .photo-list-padding excludes the 3 "猜你喜歡" entries at the bottom.
    for entry in sel.css('.pic-list2 .photo-list-padding'):
        caption = entry.css('.pic img::attr(title)').get()
        link = entry.css('a::attr(href)').get()
        # /bizhi/detail_12901_139948.html -> https://app.zol.com.cn/bizhi/detail_12901.html
        album_url = re.sub(r'/bizhi/detail_(\d+)_\d+\.html',
                           r'https://app.zol.com.cn/bizhi/detail_\1.html',
                           link)
        album_sel = parsel.Selector(requests.get(url=album_url, headers=headers).text)
        counter = 0  # restarts per album: files are "<caption>_<n>.jpg"
        for item in album_sel.css('.album-list li'):
            for src in item.css('img::attr(src)').getall():
                data = requests.get(url=src, headers=headers).content
                img_folder = 'img7\\'
                if not os.path.exists(img_folder):
                    os.makedirs(img_folder)
                with open(img_folder + caption + '_' + str(counter) + '.jpg', mode='wb') as f:
                    f.write(data)
                print('正在保存:', caption + '_' + str(counter), src)
                counter += 1


def main(url):
    get_imgs(url)


start_time = time.time()
executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
for page in range(1, 9):
    if page == 1:
        url = 'https://sj.zol.com.cn/bizhi/5/'  # request URL
    else:
        url = f'https://sj.zol.com.cn/bizhi/5/{page}.html'  # request URL
    executor.submit(main, url)
executor.shutdown()  # wait for all workers before timing
stop_time = time.time()
print(f'耗時(shí):{int(stop_time) - int(start_time)}秒')
到此這篇關(guān)于Python爬蟲使用實(shí)例-wallpaper的文章就介紹到這了,更多相關(guān)Python wallpaper內(nèi)容請搜索腳本之家以前的文章或繼續(xù)瀏覽下面的相關(guān)文章希望大家以后多多支持腳本之家!
相關(guān)文章
解決使用OpenCV中的imread()內(nèi)存報(bào)錯(cuò)問題
這篇文章主要介紹了解決使用OpenCV中的imread()內(nèi)存報(bào)錯(cuò)問題,具有很好的參考價(jià)值,希望對大家有所幫助。一起跟隨小編過來看看吧2021-03-03Python中對元組和列表按條件進(jìn)行排序的方法示例
這篇文章主要介紹了Python中對元組和列表按條件進(jìn)行排序的方法示例,需要的朋友可以參考下2015-11-11Python+matplotlib實(shí)現(xiàn)折線圖的美化
這篇文章主要和大家分享一個(gè)非常有趣的Python教程—如何美化一個(gè)?matplotlib折線圖。文中的示例代碼講解詳細(xì),感興趣的可以了解一下2022-05-05詳解python statistics模塊及函數(shù)用法
本節(jié)介紹 Python 中的另一個(gè)常用模塊 —— statistics模塊,該模塊提供了用于計(jì)算數(shù)字?jǐn)?shù)據(jù)的數(shù)理統(tǒng)計(jì)量的函數(shù)。這篇文章重點(diǎn)給大家介紹python statistics 模塊的一些用法,感興趣的朋友跟隨小編一起看看吧2019-10-10Python報(bào)錯(cuò)AssertionError:can only test a child process
這篇文章主要介紹了Python報(bào)錯(cuò)AssertionError:can only test a child process問題,具有很好的參考價(jià)值,希望對大家有所幫助,如有錯(cuò)誤或未考慮完全的地方,望不吝賜教2023-09-09Python文件操作指南解鎖三個(gè)txt文件合并技術(shù)
本文將深入介紹如何利用Python編寫腳本,將三個(gè)文本文件中指定的列數(shù)據(jù)合并成一個(gè)新文件,通過豐富的示例代碼和詳細(xì)解釋,幫助掌握這一實(shí)用而靈活的數(shù)據(jù)處理技巧2024-01-01Python實(shí)現(xiàn)批量解壓文件夾下所有壓縮包
這篇文章主要為大家詳細(xì)介紹了如何使用Python實(shí)現(xiàn)批量解壓文件夾下所有壓縮包,文中的示例代碼講解詳細(xì),感興趣的小伙伴可以跟隨小編一起學(xué)習(xí)一下2025-02-02