python 寫的一個(gè)爬蟲程序源碼
寫爬蟲是一項(xiàng)復(fù)雜、枯燥、反復(fù)的工作,需要考慮的問題包括采集效率、鏈路異常處理、數(shù)據(jù)質(zhì)量(與站點(diǎn)編碼規(guī)范關(guān)系很大)等。下面整理了自己寫的一個(gè)爬蟲程序,單臺(tái)服務(wù)器可以啟用1~8個(gè)實(shí)例同時(shí)采集,然后將數(shù)據(jù)入庫。
#-*- coding:utf-8 -*-
#!/usr/local/bin/python
import sys, time, os, string
import mechanize
import urlparse
from BeautifulSoup import BeautifulSoup
import re
import MySQLdb
import logging
import cgi
from optparse import OptionParser
#----------------------------------------------------------------------------#
# Name:      TySpider.py                                                     #
# Purpose:   WebSite Spider Module                                           #
# Author:    劉天斯                                                           #
# Email:     liutiansi@gamil.com                                             #
# Created:   2010/02/16                                                      #
# Copyright: (c) 2010                                                        #
#----------------------------------------------------------------------------#


class Pubclilog():
    """Logging helper: writes spider diagnostics to a file and the console."""

    def __init__(self):
        # Every instance appends to the same log file.
        self.logfile = 'website_log.txt'

    def iniLog(self):
        """Build and return [logger, filehandler].

        The caller is expected to flush and remove the returned file handler
        when done; otherwise repeated calls keep stacking handlers on the
        root logger and every message is duplicated.
        """
        logger = logging.getLogger()
        filehandler = logging.FileHandler(self.logfile)
        streamhandler = logging.StreamHandler()
        fmt = logging.Formatter('%(asctime)s, %(funcName)s, %(message)s')
        # BUG FIX: the original created `fmt` but never attached it, so
        # records were written without the timestamp/function prefix.
        filehandler.setFormatter(fmt)
        streamhandler.setFormatter(fmt)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)
        return [logger, filehandler]


class BaseTySpider:
    """Crawl one community: category info, topic titles, posts and replies.

    An instance handles a single community id `X`.  Rows are spread over
    five table shards (baname<mod>, Title<mod>, Content<mod>) by X % 5.
    """

    def __init__(self, X, log_switch):
        # Database connection (host/credentials are deployment-specific).
        self.conn = MySQLdb.connect(db='dbname', host='192.168.0.10',
                                    user='dbuser',
                                    passwd='SDFlkj934y5jsdgfjh435',
                                    charset='utf8')
        # Category & title listing page (Community).
        self.CLASS_URL = 'http://test.abc.com/aa/CommTopicsPage?'
        # Post/reply page.
        self.Content_URL = 'http://test.bac.com/aa/CommMsgsPage?'
        # Starting comm id.
        self.X = X
        # Shard index: comm id modulo 5, spreads rows evenly across tables.
        self.mod = self.X % 5
        # Raw HTML of the community page and its BeautifulSoup object.
        self.body = ""
        self.soup = None
        # Raw HTML of the post/reply page and its BeautifulSoup object.
        self.Contentbody = ""
        self.Contentsoup = None
        # "on"/"off": whether fetch/parse errors get logged.
        self.log_switch = log_switch
        # BUG FIX: define the cursor up front so __del__ cannot hit an
        # AttributeError when _SpiderClass was never reached.
        self.cursor = None

    def _log_error(self, message):
        """Write one error line through a throwaway handler (honours log_switch)."""
        if self.log_switch == "on":
            logapp = Pubclilog()
            logger, hdlr = logapp.iniLog()
            logger.info(message)
            hdlr.flush()
            logger.removeHandler(hdlr)

    #====================== fetch name and category ==========================
    def _SpiderClass(self, nextpage=None):
        """Download one community listing page, store name/category rows,
        then collect its titles and recurse over "next page" links."""
        if nextpage is None:
            FIXED_QUERY = 'cmm=' + str(self.X)
        else:
            # `nextpage` is the href of the "next" anchor; drop the leading '?'.
            FIXED_QUERY = nextpage[1:]
        try:
            rd = mechanize.Browser()
            rd.addheaders = [("User-agent",
                              "Tianya/2010 (compatible; MSIE 6.0;Windows NT 5.1)")]
            rd.open(self.CLASS_URL + FIXED_QUERY)
            self.body = rd.response().read()
        except Exception as e:
            self._log_error(self.CLASS_URL + FIXED_QUERY + str(e))
            return
        self.soup = BeautifulSoup(self.body)
        # "Next page" anchors; used at the end to continue pagination.
        NextPageObj = self.soup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        self.cursor = self.conn.cursor()
        if nextpage is None:
            try:
                Ttag = str(self.soup.table)
                # Expected structure:
                #   <table><tr>
                #     <td><h1 title="...">community name</h1></td>
                #     <td><a href="TopByCategoryPage?cid=..">big class</a> »
                #         <a href="TopByCategoryPage?cid=..&subcid=..">sub class</a></td>
                #   </tr></table>
                soupTable = BeautifulSoup(Ttag)
                # First <h1> holds the community name.
                tableh1 = soupTable("h1")
                # Communities without a category: the TopByCategory anchors
                # are absent and indexing `tablea[0]` raises.
                try:
                    tablea = soupTable("a", {'href': re.compile("^TopByCategory")})
                    if tablea[0].string.strip() == "":
                        pass
                except Exception as e:
                    self._log_error("[noClassInfo]" + str(self.X) + str(e))
                    # BUG/SECURITY FIX: DB-API parameter binding instead of
                    # %-interpolation (names containing quotes broke the SQL).
                    self.cursor.execute(
                        "insert into baname" + str(self.mod) + " values(%s,%s,%s)",
                        (self.X, -1, tableh1[0].string.strip().encode('utf-8')))
                    self.conn.commit()
                    self._SpiderTitle()
                    if NextPageObj:
                        NextPageURL = NextPageObj[0]['href']
                        self._SpiderClass(NextPageURL)
                    return
                # href of the second anchor carries cid and subcid.
                classlink = tablea[1]['href']
                par_dict = cgi.parse_qs(urlparse.urlparse(classlink).query)
                # Store category, sub-category and community name (parameterized).
                self.cursor.execute(
                    "insert into class values(%s,%s)",
                    (int(par_dict["cid"][0]),
                     tablea[0].string.strip().encode('utf-8')))
                self.cursor.execute(
                    "insert into subclass values(%s,%s,%s)",
                    (int(par_dict["subcid"][0]), int(par_dict["cid"][0]),
                     tablea[1].string.strip().encode('utf-8')))
                self.cursor.execute(
                    "insert into baname" + str(self.mod) + " values(%s,%s,%s)",
                    (self.X, int(par_dict["subcid"][0]),
                     tableh1[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderTitle()
                if NextPageObj:
                    NextPageURL = NextPageObj[0]['href']
                    self._SpiderClass(NextPageURL)
                # Drop the large parse artefacts early to release memory.
                self.body = None
                self.soup = None
                Ttag = None
                soupTable = None
                classlink = None
                par_dict = None
            except Exception as e:
                self._log_error("[ClassInfo]" + str(self.X) + str(e))
        else:
            # Pagination page: category rows were stored on page one,
            # only the titles remain to be collected here.
            self._SpiderTitle()
            if NextPageObj:
                NextPageURL = NextPageObj[0]['href']
                self._SpiderClass(NextPageURL)

    #========================= fetch topic titles ============================
    def _SpiderTitle(self):
        """Parse the topic-list table of the current page and store each row."""
        soupTitleTable = self.soup("table", {'class': "fs-topic-list"})
        # Topic rows are the <tr>s carrying the hover handler.
        TitleTr = soupTitleTable[0]("tr",
            {'onmouseover': re.compile("^this\.className='fs-row-hover'")})
        # Expected row layout (abridged):
        #   td.fs-hot-topic-dots-ctn |
        #   td.fs-topic-name  <a href="CommMsgsPage?cmm=..&tid=..">title</a>
        #                     <span class="fs-meta">replies / views</span> |
        #   avatar td | author td (style padding-left:4px) |
        #   td.fs-topic-last-mdfy (last-modified date)
        for CurrTr in TitleTr:
            # Default tid so the except-logger below never references an
            # unbound/stale value from a previous row (original bug).
            par_dict = {"tid": ["-"]}
            try:
                # Sticky ("top") / starred ("digest") flags default to N.
                Title_starred = 'N'
                Title_sticky = 'N'
                soupCurrTr = BeautifulSoup(str(CurrTr))
                # BeautifulSoup mis-parses this HTML, so post status is
                # inferred from the count of <span title=""> markers; a row
                # that is only "starred" is miscounted as sticky (known gap).
                TitleStatus = soupCurrTr("span", {'title': ""})
                TitlePhotoViewer = soupCurrTr("a", {'href': re.compile("^PhotoViewer")})
                if len(TitlePhotoViewer) == 1:
                    TitlePhotoViewerBool = 0
                else:
                    TitlePhotoViewerBool = 1
                if len(TitleStatus) == 3 - TitlePhotoViewerBool:
                    Title_starred = 'Y'
                    Title_sticky = 'Y'
                elif len(TitleStatus) == 2 - TitlePhotoViewerBool:
                    Title_sticky = 'Y'
                # Topic title; tid comes from the link's query string.
                Title = soupCurrTr.a.next.strip()
                par_dict = cgi.parse_qs(urlparse.urlparse(soupCurrTr.a['href']).query)
                # Reply/view counts are scraped positionally from the raw
                # cell text — fragile, depends on the exact markup layout.
                TitleNum = soupCurrTr("td", {'class': "fs-topic-name"})
                TitleArray = string.split(str(TitleNum[0]), '\n')
                Title_ReplyNum = string.split(TitleArray[len(TitleArray) - 4], '>')[2]
                Title_ViewNum = string.split(TitleArray[len(TitleArray) - 2], '>')[2][:-6]
                # Author cell.
                TitleAuthorObj = soupCurrTr("td", {'style': "padding-left:4px"})
                Title_Author = TitleAuthorObj[0].next.next.next.string.strip().encode('utf-8')
                # Last-reply time cell.
                TitleTime = soupCurrTr("td",
                    {'class': re.compile("^fs-topic-last-mdfy fs-meta")})
                # Parameterized insert (was vulnerable %-interpolation).
                self.cursor.execute(
                    "insert into Title" + str(self.mod) +
                    " values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (par_dict["tid"][0], self.X, Title,
                     int(Title_ReplyNum), int(Title_ViewNum),
                     Title_starred, Title_sticky,
                     Title_Author.decode('utf-8'),
                     TitleTime[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderContent(par_dict["tid"][0])
            except Exception as e:
                self._log_error("[Title]" + str(self.X) + '-' +
                                par_dict["tid"][0] + '-' + str(e))

    #====================== fetch posts and replies ==========================
    def _SpiderContent(self, ID, nextpage=None):
        """Download the post/reply page of topic `ID`, store every post
        (first one flagged Ctype='Y'), and recurse over pagination."""
        if nextpage is None:
            FIXED_QUERY = 'cmm=' + str(self.X) + '&tid=' + ID + '&ref=regulartopics'
        else:
            # Strip the leading path fragment of the "next" href.
            FIXED_QUERY = nextpage[9:]
        rd = mechanize.Browser()
        rd.addheaders = [("User-agent",
                          "Tianya/2010 (compatible; MSIE 6.0;Windows NT 5.1)")]
        rd.open(self.Content_URL + FIXED_QUERY)
        self.Contentbody = rd.response().read()
        self.Contentsoup = BeautifulSoup(self.Contentbody)
        NextPageObj = self.Contentsoup("a",
            {'class': re.compile("fs-paging-item fs-paging-next")})
        try:
            Tdiv = self.Contentsoup("div", {'class': "fs-user-action"})
            i = 0
            for Currdiv in Tdiv:
                # First block on page one is the original post, rest are replies.
                Ctype = 'Y' if i == 0 else 'N'
                soupCurrdiv = BeautifulSoup(str(Currdiv))
                # Post time: strip the leading char and trailing 3 chars of
                # the first fs-meta span's text.
                PosttimeObj = soupCurrdiv("span", {'class': "fs-meta"})
                Posttime = PosttimeObj[0].next[1:]
                Posttime = Posttime[0:-3]
                # Poster IP is optional.
                IPObj = soupCurrdiv("a", {'href': re.compile("CommMsgAddress")})
                if IPObj:
                    IP = IPObj[0].next.strip()
                else:
                    IP = ''
                # Raw HTML of the post/reply body.
                ContentObj = soupCurrdiv("div", {'class': "fs-user-action-body"})
                Content = ContentObj[0].renderContents().strip()
                # Parameterized insert (was vulnerable %-interpolation —
                # post bodies routinely contain quotes).
                self.cursor.execute(
                    "insert into Content" + str(self.mod) +
                    " values(%s,%s,%s,%s,%s,%s)",
                    (ID, self.X, Ctype, Posttime, IP, Content.decode('utf-8')))
                self.conn.commit()
                i += 1
        except Exception as e:
            self._log_error("[Content]" + str(self.X) + '-' + ID + '-' + str(e))
        # Follow the "next page" link if present.
        if NextPageObj:
            NextPageURL = NextPageObj[0]['href']
            self._SpiderContent(ID, NextPageURL)

    def __del__(self):
        """Best-effort cleanup of cursor and connection."""
        try:
            if self.cursor is not None:
                self.cursor.close()
            self.conn.close()
        except Exception:
            pass


def initapp(StartValue, EndValue, log_switch):
    """Crawl every comm id in [StartValue, EndValue), one spider per id."""
    for x in range(StartValue, EndValue):
        app = BaseTySpider(x, log_switch)
        app._SpiderClass()
        app = None


if __name__ == "__main__":
    # Command-line interface.
    MSG_USAGE = "TySpider.py [ -s StartNumber EndNumber ] -l [on|off] [-v][-h]"
    parser = OptionParser(MSG_USAGE)
    parser.add_option("-s", "--set",
                      nargs=2, action="store", dest="comm_value",
                      type="int", default=False,
                      help="配置名稱ID值范圍。".decode('utf-8'))
    parser.add_option("-l", "--log",
                      action="store", dest="log_switch",
                      type="string", default="on",
                      help="錯(cuò)誤日志開關(guān)".decode('utf-8'))
    parser.add_option("-v", "--version",
                      action="store_true", dest="verbose",
                      help="顯示版本信息".decode('utf-8'))
    opts, args = parser.parse_args()
    if opts.comm_value:
        if opts.comm_value[0] > opts.comm_value[1]:
            print("終止值比起始值還???")
            # BUG FIX: use sys.exit — bare `exit` relied on the site module
            # and the original's final `exit;` never even called it.
            sys.exit(1)
        if opts.log_switch == "on":
            log_switch = "on"
        else:
            log_switch = "off"
        initapp(opts.comm_value[0], opts.comm_value[1], log_switch)
        sys.exit()
    if opts.verbose:
        # BUG FIX: "Scider" -> "Spider" typo in the version banner.
        print("WebSite Spider V1.0 beta.")
        sys.exit()
- Python制作爬蟲采集小說
- Python制作簡(jiǎn)單的網(wǎng)頁爬蟲
- 簡(jiǎn)單實(shí)現(xiàn)python爬蟲功能
- 詳解Python爬蟲的基本寫法
- 使用Python編寫爬蟲的基本模塊及框架使用指南
- Python的Scrapy爬蟲框架簡(jiǎn)單學(xué)習(xí)筆記
- Python爬蟲抓取手機(jī)APP的傳輸數(shù)據(jù)
- Python爬蟲模擬登錄帶驗(yàn)證碼網(wǎng)站
- 詳解Python網(wǎng)絡(luò)爬蟲功能的基本寫法
- Python 爬蟲的工具列表大全
- python 網(wǎng)絡(luò)爬蟲初級(jí)實(shí)現(xiàn)代碼
相關(guān)文章
Python調(diào)用API接口實(shí)現(xiàn)人臉識(shí)別
本文主要介紹了Python調(diào)用API接口實(shí)現(xiàn)人臉識(shí)別,文中通過示例代碼介紹的非常詳細(xì),對(duì)大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來一起學(xué)習(xí)學(xué)習(xí)吧(2023-02-02)

Python numpy實(shí)現(xiàn)二維數(shù)組和一維數(shù)組拼接的方法
今天小編就為大家分享一篇Python numpy實(shí)現(xiàn)二維數(shù)組和一維數(shù)組拼接的方法,具有很好的參考價(jià)值,希望對(duì)大家有所幫助。一起跟隨小編過來看看吧(2018-06-06)

淺談Python 多進(jìn)程默認(rèn)不能共享全局變量的問題
今天小編就為大家分享一篇淺談Python 多進(jìn)程默認(rèn)不能共享全局變量的問題,具有很好的參考價(jià)值,希望對(duì)大家有所幫助。一起跟隨小編過來看看吧(2019-01-01)

python deque模塊簡(jiǎn)單使用代碼實(shí)例
這篇文章主要介紹了python deque模塊簡(jiǎn)單使用代碼實(shí)例,文中通過示例代碼介紹的非常詳細(xì),對(duì)大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友可以參考下(2020-03-03)

基于Python實(shí)現(xiàn)傻瓜式GIF制作工具
有沒有什么內(nèi)容形式,比小視頻更小,比普通圖片更豐富?有,GIF動(dòng)態(tài)圖就是其中一種形式。本文將為大家介紹如何通過Python實(shí)現(xiàn)一個(gè)傻瓜式的gif生成工具,感興趣的可以了解一下(2021-12-12)

10行Python代碼就能實(shí)現(xiàn)的八種有趣功能詳解
Python憑借其簡(jiǎn)潔的代碼,贏得了許多開發(fā)者的喜愛,因此也就促使了更多開發(fā)者用Python開發(fā)新的模塊。下面我們來看看,我們用不超過10行代碼能實(shí)現(xiàn)些什么有趣的功能吧(2022-03-03)