快捷導航
python 解析html之BeautifulSoup

更新時間：2009年07月07日 17:07:23 作者：
項目里需要解析html，采用python語言實現，發現了BeautifulSoup這個好用的東西，寫了一個程序。可能大家不知道它是干什么用的，目的是讓大家知道如何使用 BeautifulSoup。當然我這個使用都是很初級的，高級的使用，偶也沒有學會呢，太高深了。
復制代碼 代碼如下:
# coding=utf-8 
from BeautifulSoup import BeautifulSoup, Tag, NavigableString 
from SentenceSpliter import SentenceSpliter 
from os.path import basename,dirname,isdir,isfile 
from os import makedirs 
from shutil import copyfile 
import io 
import time 
import re 

class build_tpl:
    """Build a sentence-level template from an HTML file.

    The constructor parses the HTML file, collects every non-blank text
    node under ``<body>`` and every ``<img>`` tag, and runs each text node
    through ``SentenceSpliter``.  ``builder()`` then rewrites the parsed
    document (text nodes wrapped in ``<span>`` markers, image paths
    redirected, two script tags added) and writes it out;
    ``save_data_file()`` writes the split sentences next to it.
    """

    def __init__(self, parse_file, build_tpl_name, cp_pic_dir, show_pic_dir,
                 js_path, set_lang=2052):
        """Parse ``parse_file`` and precompute everything ``builder`` needs.

        Args:
            parse_file: path of the HTML file to parse.
            build_tpl_name: path of the template file to generate.
            cp_pic_dir: directory prefix images are copied into.
            show_pic_dir: path prefix written into the ``src`` attributes.
            js_path: path prefix of the script files added to ``<head>``.
            set_lang: language id handed to ``SentenceSpliter.SetLang``
                (NOTE(review): 2052 looks like a locale id - confirm
                against SentenceSpliter).

        Raises:
            IOError: if ``parse_file`` could not be read.
        """
        # Directory of the parsed file; image sources are resolved
        # relative to it.
        self.cur_dir = self._source_dir(parse_file)

        self.build_tpl_name = build_tpl_name
        self.cp_pic_dir = cp_pic_dir
        self.show_pic_dir = show_pic_dir
        self.js_path = js_path

        # Text nodes and <img> tags found in the document.
        self.get_text_arr = []
        self.cur_pic_arr = []

        self.soup = self.get_soup(parse_file)
        if self.soup is False:
            # get_soup() signals a read failure with False; stop here with
            # a meaningful error instead of failing on ``False.body``.
            raise IOError(
                "cannot build template: %s could not be read" % parse_file)

        # Every text node under <body> that is not pure whitespace.
        self.get_text_arr = self.soup.body.findAll(
            text=lambda x: len(x.strip()) > 0)
        # Flat list of sentences, in document order.
        self.get_sentence_arr = self.parse_text(self.get_text_arr, set_lang)
        # One replacement HTML string per text node.
        self.replace_list = self.get_replace_list(self.get_text_arr, set_lang)
        self.cur_pic_arr = self.soup.findAll('img')

    def _source_dir(self, parse_file):
        """Return the directory of ``parse_file`` with a trailing slash.

        Returns ``"./"`` when the path has no directory component.
        """
        parent = dirname(parse_file)
        if not parent:
            return "./"
        if parent.endswith("/"):
            # e.g. "/" for a file directly under the root.
            return parent
        return parent + "/"

    def save_data_file(self):
        """Write the split sentences to ``<build_tpl_name>.data``."""
        file_name = self.build_tpl_name + ".data"
        self.write_file_by_list(file_name, self.get_data())

    def get_data(self):
        """Return the list of split sentences."""
        return self.get_sentence_arr

    def write_file_by_list(self, file_name, write_arr):
        """Write ``write_arr`` to ``file_name``, one item per line, as UTF-8.

        Items are joined with ``"\\n"``; no trailing newline is written.
        """
        # Encode before opening so a failed encode does not truncate
        # an existing file.
        data = '\n'.join(write_arr).encode('utf-8')
        with io.FileIO(file_name, "w") as fh:
            fh.write(data)

    def write_file(self, file_name, file_contents):
        """Write ``file_contents`` to ``file_name`` encoded as UTF-8.

        ``file_contents`` must provide ``encode('utf-8')``.
        """
        data = file_contents.encode('utf-8')
        with io.FileIO(file_name, "w") as fh:
            fh.write(data)

    def get_pic_hash(self):
        """Return today's date as a ``"YYYY/MM/DD/"`` sub-directory path."""
        return time.strftime("%Y/%m/%d/")

    def _script_tag(self, file_name):
        """Create a ``<script>`` tag loading ``js_path + file_name``."""
        tag = Tag(self.soup, "script")
        tag['type'] = "text/javascript"
        tag['src'] = self.js_path + file_name
        return tag

    def builder(self):
        """Rewrite the parsed document and write the template file.

        Replaces each collected text node with its prepared replacement,
        points every image at ``show_pic_dir`` and copies the image file
        into ``cp_pic_dir`` (both under a dated sub-directory), inserts
        the ``jquery.js`` and ``init.js`` script tags into ``<head>`` and
        writes the result to ``build_tpl_name``.

        Returns:
            The list of text nodes that could not be replaced.
        """
        unreplaced = []
        for original, replacement in zip(self.get_text_arr, self.replace_list):
            try:
                self.soup.body.find(text=original).replaceWith(replacement)
            except AttributeError:
                # find() returned None: the text is no longer in the tree.
                # Best effort - remember it and carry on.
                unreplaced.append(original)

        hash_dir = self.get_pic_hash()
        show_pic_dir = self.show_pic_dir + hash_dir
        cp_pic_dir = self.cp_pic_dir + hash_dir

        if not isdir(cp_pic_dir):
            makedirs(cp_pic_dir)

        for img in self.cur_pic_arr:
            old_pic_src = img.get('src')
            if not old_pic_src:
                # An <img> without a src has nothing to rewrite or copy.
                continue
            img['src'] = show_pic_dir + old_pic_src

            cp_dis_file = cp_pic_dir + old_pic_src
            # The src may itself contain sub-directories ("img/a.png");
            # make sure the destination directory exists before copying.
            dest_dir = dirname(cp_dis_file)
            if dest_dir and not isdir(dest_dir):
                makedirs(dest_dir)
            copyfile(self.cur_dir + old_pic_src, cp_dis_file)

        # Both tags are inserted at index 2, init.js first, so jquery.js
        # ends up in front of init.js.
        init_tag = self._script_tag("init.js")
        jquery_tag = self._script_tag("jquery.js")
        self.soup.head.insert(2, init_tag)
        self.soup.head.insert(2, jquery_tag)

        # NOTE(review): this relies on the soup object exposing
        # encode('utf-8') - confirm with the BeautifulSoup version in use.
        self.write_file(self.build_tpl_name, self.soup)
        return unreplaced

    def get_replace_html(self, rep_id, rep_data=""):
        """Return the ``<span>`` marker for replacement slot ``rep_id``.

        Args:
            rep_id: numeric id, used in the span's ``id="rep_<n>"``.
            rep_data: span content; when empty the template placeholder
                ``$rep_arr[<rep_id>]`` is used instead.
        """
        if len(rep_data) > 0:
            rep_str = rep_data
        else:
            rep_str = "$rep_arr[{0}]".format(rep_id)
        return "<span sty=\"data\" id=\"rep_" + str(rep_id) + "\">" + rep_str + "</span>"

    def _make_splitter(self, set_lang):
        """Create a ``SentenceSpliter`` configured for ``set_lang``."""
        splitter = SentenceSpliter()
        splitter.SetLang(set_lang)
        return splitter

    def _split_sentences(self, splitter, text):
        """Split ``text``; fall back to ``[text]`` when Split returns None."""
        pieces = splitter.Split(text)
        if pieces is None:
            return [text]
        return pieces

    def get_replace_list(self, text_arr, set_lang):
        """Build one replacement HTML string per entry of ``text_arr``.

        Each sentence of a text is wrapped with ``get_replace_html``; the
        span ids are numbered consecutively across the whole list, so they
        line up with the indexes of ``parse_text``'s result.
        """
        splitter = self._make_splitter(set_lang)
        replacements = []
        next_id = 0
        for text in text_arr:
            spans = []
            for piece in self._split_sentences(splitter, text):
                spans.append(self.get_replace_html(next_id, piece))
                next_id += 1
            replacements.append("".join(spans))
        return replacements

    def parse_text(self, text_arr, set_lang):
        """Return the flat list of sentences of every text in ``text_arr``.

        A text for which the splitter returns ``None`` is kept whole.
        """
        splitter = self._make_splitter(set_lang)
        sentences = []
        for text in text_arr:
            sentences.extend(self._split_sentences(splitter, text))
        return sentences

    def get_soup(self, parse_file):
        """Read ``parse_file`` and return it parsed by ``BeautifulSoup``.

        Returns ``False`` (after printing an error line) when the file
        cannot be opened or read.
        """
        try:
            with io.FileIO(parse_file, "r") as fh:
                doc = fh.readall()
        except IOError:
            print('ERROR: %s file not found!' % parse_file)
            return False
        return BeautifulSoup(doc)

# Command-line entry point:
#   python <script> <input-file> <output-file> <save-pic-dir> <show-pic-dir> <js-path>
if __name__ == "__main__":
    from sys import argv, exit

    # Five positional arguments are read below (argv[1]..argv[5]), so a
    # shorter command line must be rejected here rather than failing later
    # with an IndexError.
    if len(argv) < 6:
        print("USAGE: python %s <input-file> <output-file> "
              "<save-pic-dir> <show-pic-dir> <js-path>" % argv[0])
        exit(255)

    if not isfile(argv[1]):
        print("no such input file: %s" % argv[1])
        exit(1)

    parse_file = argv[1]  # e.g. "html/testpic.html"
    tpl_file = argv[2]
    save_pic_path = argv[3]
    show_pic_path = argv[4]
    load_js_path = argv[5]

    # Parse the input file and prepare the replacements.
    so = build_tpl(parse_file, tpl_file, save_pic_path, show_pic_path,
                   load_js_path)
    # Write the template file.
    so.builder()
    # Write the split sentences to "<output-file>.data".
    so.save_data_file()