# coding:utf-8

from docx import Document

class ReadDoc(object):
    """Read the paragraph and table text of a Word (.docx) document.

    Both texts are extracted as soon as the object is constructed and are
    kept on the instance as ``p_text`` and ``table_text``. This variant also
    prints each text once it has been collected.
    """

    def __init__(self, path):
        """Open the Word file at *path* and extract its text immediately.

        Args:
            path: Location of the Word file; passed straight to ``Document``.
        """
        self.doc = Document(path)
        self.p_text = ''        # paragraph text, one paragraph per line
        self.table_text = ''    # table text, one table row per line

        self.get_para()
        self.get_table()

    def get_para(self):
        """Append every paragraph's text to ``p_text``, then print it."""
        lines = [para.text + '\n' for para in self.doc.paragraphs]
        self.p_text += ''.join(lines)
        print(self.p_text)

    def get_table(self):
        """Append every table row to ``table_text``, then print it.

        Each cell's text is followed by a comma (so a row ends with a
        trailing comma) and each row is terminated by a newline.
        """
        rendered_rows = []
        for table in self.doc.tables:
            for row in table.rows:
                cells = ''.join(cell.text + ',' for cell in row.cells)
                rendered_rows.append(cells + '\n')
        self.table_text += ''.join(rendered_rows)
        print(self.table_text)

if __name__ == '__main__':
    # This script has no top-level ``glob``/``os`` import, so the previous
    # ``glob.os.path.join(...)`` lookup raised NameError; import ``os`` here.
    import os

    # Sample resume expected under ./test_file relative to the working dir.
    path = os.path.join(os.getcwd(), 'test_file/簡歷1.docx')
    doc = ReadDoc(path)     # construction already prints the extracted text
    print(doc)              # ReadDoc defines no __repr__, so this shows the default one

看一下 ReadDoc 類的運行結果：

定義 search_word 函數，用以篩選 word 文件內容符合要求的簡歷

OK，上文已經成功讀取了簡歷的 word 文檔，接下來我們要將讀取到的內容通過篩選關鍵字信息的方式，過濾出包含有關鍵字的簡歷。

實操案例腳本如下：

# coding:utf-8

import glob

from docx import Document

class ReadDoc(object):
    """Read the paragraph and table text of a Word (.docx) document.

    Both texts are extracted as soon as the object is constructed and are
    kept on the instance as ``p_text`` and ``table_text``. Nothing is
    printed; callers read the two attributes directly.
    """

    def __init__(self, path):
        """Open the Word file at *path* and extract its text immediately.

        Args:
            path: Location of the Word file; passed straight to ``Document``.
        """
        self.doc = Document(path)
        self.p_text = ''        # paragraph text, one paragraph per line
        self.table_text = ''    # table text, one table row per line

        self.get_para()
        self.get_table()

    def get_para(self):
        """Append every paragraph's text, newline-terminated, to ``p_text``."""
        self.p_text += ''.join(
            para.text + '\n' for para in self.doc.paragraphs
        )

    def get_table(self):
        """Append every table row to ``table_text``.

        Each cell's text is followed by a comma (so a row ends with a
        trailing comma) and each row is terminated by a newline.
        """
        all_rows = (row for table in self.doc.tables for row in table.rows)
        self.table_text += ''.join(
            ''.join(cell.text + ',' for cell in row.cells) + '\n'
            for row in all_rows
        )

def search_word(path, targets):
    """Return the .docx files matched by *path* that contain every target.

    Args:
        path: Glob pattern (for example ``/some/dir/*``) selecting the
            candidate entries to inspect.
        targets: Keyword strings. A document qualifies only when each one
            occurs in its combined paragraph and table text; the check is a
            plain, case-sensitive substring test. An empty collection
            accepts every readable document.

    Returns:
        A list with the paths of the qualifying files, in the order
        ``glob.glob`` produced them.
    """
    import os   # local import: do not reach ``os`` through ``glob.os``

    final_result = []
    for candidate in glob.glob(path):
        # Only regular files with a ".docx" suffix are worth opening.
        if not os.path.isfile(candidate) or not candidate.endswith('.docx'):
            continue
        # While a document is open, Word keeps a "~$<name>.docx" owner/lock
        # file beside it. It is not a valid .docx package, so skip it rather
        # than let the reader fail and abort the whole scan.
        if os.path.basename(candidate).startswith('~$'):
            continue

        doc = ReadDoc(candidate)
        all_text = doc.p_text + doc.table_text
        if all(target in all_text for target in targets):
            final_result.append(candidate)
    return final_result

if __name__ == '__main__':
    # Inspect every entry in the current working directory.
    pattern = glob.os.path.join(glob.os.getcwd(), '*')
    # '埋點' was added to the sample resume on purpose, for demonstration.
    keywords = ['python', 'golang', 'react', '埋點']
    print(search_word(pattern, keywords))

運行結果如下：