Grabbing the images in a web page with Python and saving them locally
The previous article shared a PHP implementation for batch-grabbing remote web images and saving them to the local disk; interested readers can click through for the details. This article does the same job with Python.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

def get_file_extension(file_name):
    '''Return the file extension, e.g. ".jpg".'''
    return os.path.splitext(file_name)[1]

def mkdir(path):
    '''Create the directory if it does not exist; return the path.'''
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def unique_str():
    '''Generate a unique string of fixed length 36 (a UUID).'''
    return str(uuid.uuid1())

def get_file(url):
    '''Fetch a remote file into memory.
    @url: the file to fetch, path + filename
    '''
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

def save_file(path, file_name, data):
    '''Save a file locally.
    @path: local directory
    @file_name: file name
    @data: file contents
    '''
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    f = open(path + file_name, "wb")
    f.write(data)
    f.flush()
    f.close()

# get a file extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
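The script above is Python 2 code: urllib2 and cookielib no longer exist under those names in Python 3, where they became urllib.request and http.cookiejar. For readers on Python 3, a minimal sketch of the same get_file/save_file pair might look like this (the URL and target directory are just the examples from above):

# A minimal Python 3 sketch of get_file/save_file above (assumes Python 3.2+).
import os
import urllib.request
import http.cookiejar

def get_file(url):
    # fetch a remote file into memory, keeping cookies like the original
    try:
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        return opener.open(url).read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    # write data to path/file_name, creating the directory if needed
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))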
Grabbing the images at a specified URL with Python and saving them locally
# -*- coding: utf-8 -*-
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
    """Judge whether a URL exists (HEAD request, following one redirect)."""
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

def gGetHtmlLines(url):
    """Get the HTML source, returned as a list of lines."""
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>>", e
        return

def gGetHtml(url):
    """Get the HTML source, returned as a single string."""
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>>", e
        return

def gGetFileName(url):
    """Get the file name from a URL."""
    if url is None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

def gRandFilename(type):
    """Generate a random file name with the given extension."""
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

def gGetAbslLink(url, link):
    """Resolve a link found on the page at url to an absolute address."""
    if url is None or link is None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

def gGetRegList(linesList, regx):
    """Match a regex against the input lines; return the unique groups as a list."""
    if linesList is None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

def gDownloadWithFilename(url, savePath, file_name):
    """Download a file from url; the file name is given as a parameter."""
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        f = open(savePath + file_name, 'w+b')
        f.write(data)
        f.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>>", e

def gDownload(url, savePath):
    """Download a file from url; the file name is taken from the URL itself."""
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download every jpg referenced by the page at downloadUrl."""
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)  # get the links that match the regex
    if lists is None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

def gGetHttpAddr(url):
    """Get the site root (scheme + host) from a URL."""
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
    """Get the parent directory of a URL."""
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

def gGetHttpAddrFatherAssign(url, link):
    """Resolve a link containing '..' to an absolute address against url."""
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    numOfFather = 0
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of parent levels to go up
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

def gGetHtmlLink(url):
    """Collect the .htm/.html links on the page at url; return them as a list."""
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

def gDownloadAllJpg(url, savePath):
    """Grab the jpgs on the page at url and on the .htm pages it links to."""
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # grab the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

def main():
    """test"""
    u = 'http://site.douban.com/196738/room/2462453/'  # page to grab images from
    save = '/root/python/tmp/'  # directory to save the images to
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
The code above is everything you need to grab the images in a web page with Python and save them locally. We hope you find it useful.