快捷導航

Python讀取pdf文件的簡單代碼示例

更新時間：2024年02月18日 10:28:22 作者：JSON_L

PDF文件的數據主要由文本、圖片、表格這三部分組成，但是也會穿插流程圖、各種柱狀圖等。這篇文章主要給大家介紹了關於Python讀取pdf文件的簡單代碼示例，需要的朋友可以參考下。

安裝命令

需要安裝操作pdf的三方類庫，命令如下：

pip install pdfminer3K

安裝過程如下：

引入類庫

需要引入很多的類庫。

示例如下：

import sys
import importlib
importlib.reload(sys)

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import  PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

讀取pdf實現

實現步驟為：先通過二進制方式打開測試pdf文檔，創建pdf文檔分析器解析測試文檔內容，

最後讀取文件內容，保存到另一個文件中。

示例如下：

import sys
import importlib

importlib.reload(sys)

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os

def read_pdf(path, toPath):
    """Extract the text of a PDF file and append it to a text file.

    The text of every ``LTTextBoxHorizontal`` layout element on every page
    is printed to stdout and appended to ``toPath``, each followed by a
    newline.

    Args:
        path: Path of the PDF file to read (opened in binary mode).
        toPath: Path of the UTF-8 text file to append to. It is opened in
            append mode, so it is created if missing and existing content
            is kept.

    Raises:
        PDFTextExtractionNotAllowed: If the document's ``is_extractable``
            flag is false.
    """
    # The PDF stream has to stay open while pages are parsed; the ``with``
    # block guarantees it is closed afterwards, even when an error occurs.
    with open(path, 'rb') as pdf_stream:
        # Create the parser and the document object, then link them to
        # each other in both directions.
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise the document; no password argument is supplied.
        pdfFile.initialize()

        # Refuse to continue when the document does not allow extraction.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-analysis parameters, aggregating device
        # and the interpreter that feeds pages into that device.
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Open the output once instead of once per text box, and use a
        # name that does not shadow the input stream.
        with open(toPath, 'a', encoding='utf-8') as out:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        text = x.get_text()
                        print(text)
                        out.write(text + "\n")

# Resolve the sample input PDF and the output text file relative to the
# current working directory, then run the extraction.
path, toPath = (
    os.path.join(os.getcwd(), file_name)
    for file_name in ('test_1.pdf', 'test_2.txt')
)
read_pdf(path, toPath)

注意：無法讀取中文，貌似需要加載中文字體。還有就是在寫入pdf文件時，格式不對無法打開，暫時沒找到原因。

附：python讀取PDF文件并做詞云可視化

import pdfplumber  # 導(dǎo)入庫
import jieba
from wordcloud import WordCloud
import numpy as np
import matplotlib.pyplot as plt
# Configure matplotlib in one call: select the SimHei sans-serif font
# (presumably so the Chinese title and tick labels render) and turn off
# the Unicode minus sign for axis values.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Extract the text of every page of the PDF and append it to a UTF-8 text
# file. The output file is opened once and closed by the ``with`` block, so
# all text is flushed to disk before the script reads the file back.
# Append mode is kept from the original, so repeated runs accumulate text.
with pdfplumber.open('中華文化.pdf') as f:
    with open(r'中華文化.txt', mode='a', encoding='utf-8') as txt_f:
        for page in f.pages:
            text = page.extract_text()
            # A page without extractable text may give None or an empty
            # string (depends on the pdfplumber version); skip it instead
            # of passing a non-string to write().
            if text:
                txt_f.write(text)
 
# Read the extracted text back; the ``with`` block closes the handle
# instead of leaking it. ``file`` ends up holding the text, as before.
with open('中華文化.txt', encoding='utf-8') as txt_in:
    file = txt_in.read()

# Segment the text into words with jieba and build the space-separated
# string that the word cloud is generated from.
txtlist = jieba.lcut(file)
string = " ".join(txtlist)

# Single-character tokens are tallied separately as stop words; every
# other token is counted as a candidate hot word.
stop_words = {}
counts = {}
for txt in txtlist:
    if len(txt) == 1:
        stop_words[txt] = stop_words.get(txt, 0) + 1
    else:
        counts[txt] = counts.get(txt, 0) + 1

# (word, count) pairs ordered from most to least frequent.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Select the entries for the bar chart. Slicing never raises, so a document
# with fewer than ten distinct words yields a shorter chart instead of an
# IndexError.
# NOTE(review): index 0 (the most frequent word) is skipped, exactly as the
# original ``range(1, 10)`` loop did - confirm that this is intended.
top_items = items[1:10]
y1 = [count for _, count in top_items]
labels = [word for word, _ in top_items]

width = 0.3
x = np.arange(len(y1))
# Tick positions follow the number of bars so they always match ``labels``.
a = list(range(len(y1)))
plt.xticks(a, labels, rotation=30)
plt.bar(x=x, height=y1, width=width)
# The title, file name and message below had stray pinyin annotations
# embedded in the Chinese text; they are restored to the plain characters.
plt.title('PDF文件中熱詞統計分析')
plt.savefig("熱詞統計分析.png")
plt.show()
print("-------熱詞統計分析完成！-------")
# Every single-character token collected in ``stop_words`` is excluded
# from the word cloud; iterating the dict yields its keys in insertion order.
stoplist = list(stop_words)
setlist = set(stoplist)

# Build the cloud from the space-joined token string and export it.
wcd = WordCloud(
    width=1000,
    height=700,
    background_color='white',
    font_path='msyh.ttc',
    scale=15,
    stopwords=setlist,
)
wcd.generate(string)
wcd.to_image()
print("-------熱詞詞云生成完成！-------")
wcd.to_file('詞云.png')  # export the image