
Scraping Images from a Web Page with Python and Saving Them Locally

Updated: December 01, 2015, 10:59:16   Author: Ruthless
This article shows how to scrape images from a web page with Python and save them locally. Readers interested in scraping web images with Python can follow along.

A previous article shared a PHP implementation for batch-fetching remote web images and saving them locally; interested readers can refer to that post for details.
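The first script below is a compact Python 2 example. It installs a cookie-aware urllib2 opener, reads the target file into memory, and writes the bytes out to a local path, with small helpers for file extensions, directory creation, and unique file names.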

# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

def get_file_extension(file_name):
  """Return the file extension, including the dot."""
  return os.path.splitext(file_name)[1]

def mkdir(path):
  """Create the directory if it does not exist; return the path."""
  # Strip surrounding whitespace and any trailing backslash
  path = path.strip()
  path = path.rstrip("\\")
  if not os.path.exists(path):
    os.makedirs(path)
  return path

def unique_str():
  """Generate a unique string of fixed length 36 (a UUID)."""
  return str(uuid.uuid1())

def get_file(url):
  """Fetch the contents of a URL into memory.

  @url: the file to fetch, path + filename
  """
  try:
    # Use a cookie-aware opener so sites that set cookies still respond
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    operate = opener.open(req)
    data = operate.read()
    return data
  except Exception, e:
    print e
    return None

def save_file(path, file_name, data):
  """Save data to disk.

  @path: local directory
  @file_name: file name
  @data: file contents
  """
  if data is None:
    return
  mkdir(path)
  if not path.endswith("/"):
    path = path + "/"
  f = open(path + file_name, "wb")
  f.write(data)
  f.flush()
  f.close()

# Get the file extension
print get_file_extension("123.jpg")
# Create a directory and return it
#print mkdir("d:/ljq")
# Generate a unique 36-character string
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))

Fetching the Images from a Specified URL with Python and Saving Them Locally
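This second, fuller script works in stages: httpExists() first verifies the URL with an HTTP HEAD request (recursing on 302 redirects), gGetHtmlLines() pulls down the page source, gGetRegList() extracts the .jpg references with a regular expression, gGetAbslLink() resolves relative links against the page address, and gDownload() saves each image. gDownloadAllJpg() additionally follows the page's .htm links and repeats the process one level deep.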

# *** encoding: utf-8 ***
"""
fetch images from specific url
v1.0
"""
__author__ = 'jiangyt'
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
  """Judge whether a URL exists, via an HTTP HEAD request."""
  host, path = urlparse.urlsplit(url)[1:3]
  if ':' in host:
    # port specified, try to use it
    host, port = host.split(':', 1)
    try:
      port = int(port)
    except ValueError:
      print 'invalid port number %r' % (port,)
      return False
  else:
    # no port specified, use default port
    port = None
  try:
    connection = httplib.HTTPConnection(host, port=port)
    connection.request("HEAD", path)
    resp = connection.getresponse()
    if resp.status == 200: # normal 'found' status
      found = True
    elif resp.status == 302: # recurse on temporary redirect
      found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
    else: # everything else -> not found
      print "Status %d %s : %s" % (resp.status, resp.reason, url)
      found = False
  except Exception, e:
    print e.__class__, e, url
    found = False
  return found

def gGetHtmlLines(url):
  """Get the HTML source of a page; return a list of lines."""
  if url is None: return
  if not httpExists(url): return
  try:
    page = urllib.urlopen(url)
    html = page.readlines()
    page.close()
    return html
  except Exception, e:
    print "gGetHtmlLines() error! Exception ==>> %s" % e
    return

def gGetHtml(url):
  """Get the HTML source of a page; return a single string."""
  if url is None: return
  if not httpExists(url): return
  try:
    page = urllib.urlopen(url)
    html = page.read()
    page.close()
    return html
  except Exception, e:
    print "gGetHtml() error! Exception ==>> %s" % e
    return

def gGetFileName(url):
  """Derive a file name from a URL (its last path segment)."""
  if url is None: return None
  if url == "": return ""
  arr = url.split("/")
  return arr[-1]

def gRandFilename(ext):
  """Generate a random 32-character file name with the given extension."""
  fname = ''
  for i in range(16):
    fname = fname + chr(random.randint(65, 90))  # A-Z
    fname = fname + chr(random.randint(48, 57))  # 0-9
  return fname + '.' + ext

def gGetAbslLink(url, link):
  """Given a page URL and a link found on it, return the link's absolute address."""
  if url is None or link is None: return
  if url == '' or link == '': return url
  if link[0] == '/':
    addr = gGetHttpAddr(url) + link
  elif len(link) > 3 and link[0:4] == 'http':
    addr = link
  elif len(link) > 2 and link[0:2] == '..':
    addr = gGetHttpAddrFatherAssign(url, link)
  else:
    addr = gGetHttpAddrFather(url) + link
  return addr

def gGetRegList(linesList, regx):
  """Match a regular expression against each input line; return the unique captures as a list."""
  if linesList is None: return
  rtnList = []
  for line in linesList:
    matchs = re.search(regx, line, re.IGNORECASE)
    if matchs is not None:
      allGroups = matchs.groups()
      for foundStr in allGroups:
        if foundStr not in rtnList:
          rtnList.append(foundStr)
  return rtnList

def gDownloadWithFilename(url, savePath, fileName):
  """Download the file at a URL, saving it under the given file name."""
  # parameter checks omitted for now
  try:
    urlopen = urllib.URLopener()
    fp = urlopen.open(url)
    data = fp.read()
    fp.close()
    f = open(savePath + fileName, 'w+b')
    f.write(data)
    f.close()
  except IOError, error:
    print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
  except Exception, e:
    print "Exception==>> %s" % e

def gDownload(url, savePath):
  """Download the file at a URL; the file name is taken from the URL itself."""
  # parameter checks omitted for now
  fileName = gGetFileName(url)
  #fileName = gRandFilename('jpg')
  gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
  """Download every jpg referenced by the page at the given URL."""
  lines = gGetHtmlLines(downloadUrl) # get the page source
  regx = r"""src\s*="?(\S+)\.jpg"""
  lists = gGetRegList(lines, regx) # get the links that match the regular expression
  if lists is None: return
  for jpg in lists:
    jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
    gDownload(jpg, savePath)
    print gGetFileName(jpg)

def gGetHttpAddr(url):
  """Get the site root (scheme plus host) from a URL."""
  if url == '': return ''
  arr = url.split("/")
  return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
  """Get the parent directory of a URL."""
  if url == '': return ''
  arr = url.split("/")
  addr = arr[0] + '//' + arr[2] + '/'
  if len(arr) - 1 > 3:
    for i in range(3, len(arr) - 1):
      addr = addr + arr[i] + '/'
  return addr

def gGetHttpAddrFatherAssign(url, link):
  """Resolve a '..'-relative link against a URL to get its absolute address."""
  if url == '': return ''
  if link == '': return ''
  linkArray = link.split("/")
  urlArray = url.split("/")
  partLink = ''
  partUrl = ''
  numOfFather = 0
  for i in range(len(linkArray)):
    if linkArray[i] == '..':
      numOfFather = i + 1 # number of parent levels to climb
    else:
      partLink = partLink + '/' + linkArray[i]
  for i in range(len(urlArray) - 1 - numOfFather):
    partUrl = partUrl + urlArray[i]
    if i < len(urlArray) - 1 - numOfFather - 1:
      partUrl = partUrl + '/'
  return partUrl + partLink

def gGetHtmlLink(url):
  """Collect the htm/html links on the page at a URL; return them as a list."""
  # parameter checks omitted for now
  rtnList = []
  lines = gGetHtmlLines(url)
  regx = r"""href="?(\S+)\.htm"""
  for link in gGetRegList(lines, regx):
    link = gGetAbslLink(url, link) + '.htm'
    if link not in rtnList:
      rtnList.append(link)
      print link
  return rtnList

def gDownloadAllJpg(url, savePath):
  """Download the jpgs on a page and on the htm pages it links to."""
  # parameter checks omitted for now
  gDownloadHtmlJpg(url, savePath)
  # fetch the jpgs on linked pages
  links = gGetHtmlLink(url)
  for link in links:
    gDownloadHtmlJpg(link, savePath)

def main():
  """test"""
  u = 'http://site.douban.com/196738/room/2462453/' # page to scrape images from
  save = '/root/python/tmp/' # directory to save the images into
  print 'download pic from [' + u + ']'
  print 'save to [' + save + '] ...'
  gDownloadHtmlJpg(u, save)
  print "download finished"

if __name__ == "__main__":
  main()
else:
  print "imported as a module."

The code above is everything this article covers on scraping images from a web page with Python and saving them locally. We hope you find it useful.

相關(guān)文章

最新評論