# -*- coding: utf-8 -*-


from bs4 import BeautifulSoup
import urllib2

import datetime
import time
import PyRSS2Gen
from email.Utils import formatdate
import re
import sys
import os
# Python 2 only: site.py removes sys.setdefaultencoding at interpreter
# startup, so the sys module is reloaded to get the function back. The
# default codec is then switched to UTF-8 so that implicit str/unicode
# conversions of non-ASCII (Chinese) page text do not raise UnicodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

class RssSpider():
    """Scrape recent OSChina blog posts and publish them as an RSS 2.0 feed.

    Typical use: create an instance, call ``getcontent()`` to crawl the blog
    index page and every post linked from it, then call ``SaveRssFile()`` to
    write the resulting XML to ``self.xmlpath``.
    """

    # Timestamp as printed on a post page, e.g. "2014-08-05 10:30":
    # 4-digit year, 2-digit month and day, one whitespace character, 2-digit
    # hour and minute; any single non-space character may act as separator.
    _TIMESTAMP_RE = re.compile(r'\d{4}\S\d{2}\S\d{2}\s\d{2}\S\d{2}')

    def __init__(self):
        """Create the empty feed and record the output path and index URL."""
        # PyRSS2Gen labels every datetime it serializes as GMT, so give it
        # UTC rather than local wall-clock time.
        now = datetime.datetime.utcnow()
        self.myrss = PyRSS2Gen.RSS2(
            title='OSChina',
            link='http://my.oschina.net',
            description=str(datetime.date.today()),
            pubDate=now,
            lastBuildDate=now,
            items=[],
        )
        # Destination of the generated feed; opened with mode 'w', so any
        # previous file is overwritten on each run.
        self.xmlpath = r'/var/www/myrss/oschina.xml'
        # Index page listing the recent blog posts.
        self.baseurl = "http://www.oschina.net/blog"

    @staticmethod
    def _extract_timestamp(text):
        """Return the first post timestamp found in *text*, or None.

        *text* may be a byte string (decoded as UTF-8) or a text string.
        Surrounding whitespace is stripped and newlines are removed before
        searching.
        """
        if isinstance(text, bytes):
            # Decode first so a multi-byte separator counts as one character.
            text = text.decode('utf-8')
        flattened = text.strip().replace('\n', '')
        match = RssSpider._TIMESTAMP_RE.search(flattened)
        return match.group() if match else None

    def useragent(self, url):
        """Fetch *url* with browser-like headers and return the raw body.

        Explicit User-Agent and Referer headers are sent with the request.
        Errors raised by urllib2 propagate to the caller.
        """
        i_headers = {
            "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/36.0.1985.125 Safari/537.36"),
            "Referer": 'http://baidu.com/',
        }
        req = urllib2.Request(url, headers=i_headers)
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    def enterpage(self, url):
        """Download one blog post and return it as a ``PyRSS2Gen.RSSItem``.

        The item's title is the page ``<title>`` (or *url* if the page has
        none), its description is the markup of the ``BlogContent`` div, and
        its pubDate is the timestamp text found in the ``BlogStat`` div,
        falling back to today's date when no timestamp is present.
        """
        soup = BeautifulSoup(self.useragent(url))

        stat_div = soup.find('div', {'class': 'BlogStat'})
        timestr = self._extract_timestamp(str(stat_div))
        if timestr is None:
            timestr = str(datetime.date.today())

        title_tag = soup.title
        ititle = title_tag.string if title_tag is not None else url

        content_div = soup.find('div', {'class': 'BlogContent'})
        # NOTE(review): pubDate is passed as the page's own date text, not a
        # datetime, so it is emitted as-is and is not RFC 822 formatted.
        return PyRSS2Gen.RSSItem(
            title=ititle,
            link=url,
            description=str(content_div),
            pubDate=timestr,
        )

    def getcontent(self):
        """Crawl the index page and append one feed item per linked post.

        Every ``<li>`` inside the ``RecentBlogs`` div is inspected; entries
        without a ``<div>``, an ``<a>`` or an ``href`` are skipped. Each link
        is printed before the post is fetched.

        Raises:
            AttributeError: if the index page has no ``RecentBlogs`` div.
        """
        soup = BeautifulSoup(self.useragent(self.baseurl))
        ul = soup.find('div', {'id': 'RecentBlogs'})
        for li in ul.findAll('li'):
            div = li.find('div')
            if div is None:
                continue
            alink = div.find('a')
            if alink is None:
                continue
            link = alink.get('href')
            if not link:
                # An anchor without a target cannot be fetched.
                continue
            print(link)
            self.myrss.items.append(self.enterpage(link))

    def SaveRssFile(self, filename):
        """Serialize the feed as UTF-8 XML and write it to ``self.xmlpath``.

        *filename* is accepted for backward compatibility but is not used;
        the output always goes to ``self.xmlpath``.
        """
        finallxml = self.myrss.to_xml(encoding='utf-8')
        with open(self.xmlpath, 'w') as out:
            out.write(finallxml)



if __name__ == '__main__':
    # Crawl the blog index and its posts, then write the feed to disk.
    rssSpider = RssSpider()
    rssSpider.getcontent()
    rssSpider.SaveRssFile('oschina.xml')

以上所述就是本文的全部內容了，希望大家能夠喜歡。

您可能感興趣的文章:

相關文章

Python 鍵盤事件詳解
這篇文章主要為大家詳細介紹了Python的鍵盤事件，文中示例代碼介紹的非常詳細，具有一定的參考價值，感興趣的小伙伴們可以參考一下
2021-11-11
將python字符串轉化成長表達式的函數eval實例
這篇文章主要介紹了將python字符串轉化成長表達式的函數eval實例，具有很好的參考價值，希望對大家有所幫助。一起跟隨小編過來看看吧
2020-05-05
簡單實現python數獨游戲
這篇文章主要為大家詳細介紹了如何簡單實現python數獨游戲，具有一定的參考價值，感興趣的小伙伴們可以參考一下
2018-03-03
Python中判斷input()輸入的數據的類型
在python中，經常會用到input()語句，但是input()語句輸入的內容只能是字符串類型，而我們經常要輸入int類型的數據等，這個時候就需要用到int()方法給輸入的內容強制轉換，今天小編給大家介紹下Python中判斷input()輸入的數據的類型，感興趣的朋友跟隨小編一起看看吧
2022-11-11
Python reques接口測試框架實現代碼
這篇文章主要介紹了Python reques接口測試框架實現代碼,文中通過示例代碼介紹的非常詳細，對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下
2020-07-07
python FTP編程基礎入門
這篇文章主要介紹了python FTP編程基礎入門的相關資料，幫助大家更好的理解和學習使用python，感興趣的朋友可以了解下
2021-02-02
Numpy 數組操作之元素添加、刪除和修改的實現
本文主要介紹了Numpy 數組操作之元素添加、刪除和修改的實現，文中通過示例代碼介紹的非常詳細，對大家的學習或者工作具有一定的參考學習價值，需要的朋友們下面隨著小編來一起學習學習吧
2023-03-03
Python開發之利用re模塊去除代碼塊注釋
Python的re模塊主要是正則表達式的操作函數,下面這篇文章主要給大家介紹了關于Python開發之利用re模塊去除代碼塊注釋的相關資料,文中通過實例代碼介紹的非常詳細,需要的朋友可以參考下
2022-11-11
一文帶你掌握Python內置reversed函數的使用
Python作為一門強大的編程語言,提供了許多內置函數來處理各種數據結構和對象,本文將詳細探討reversed函數的用法、示例代碼以及實際應用場景,需要的可以參考下
2024-01-01
Python接口自動化淺析logging日志原理及模塊操作流程
這篇文章主要為大家介紹了Python接口自動化系列文章淺析logging日志原理及模塊操作流程，文中詳細說明了為什么需要日志？日志是什么？以及日志用途等基本的原理
2021-08-08