
python利用pdfplumber進(jìn)行pdf文檔解析提取

 更新時(shí)間:2025年05月14日 09:59:37   作者:去追風(fēng),去看海  
pdfplumber是一個(gè)純 python 第三方庫,適合 python 3.x 版本,可以用來查看pdf各類信息,下面小編就來和大家詳細(xì)講講如何使用pdfplumber進(jìn)行pdf文檔解析提取吧

pdfplumber 的特點(diǎn)

1、它是一個(gè)純 python 第三方庫,適合 python 3.x 版本

2、它用來查看pdf各類信息,能有效提取文本、表格

3、它不支持修改或生成pdf,也不支持對(duì)pdf掃描件的處理

import glob
import json
import os
import re
from collections import defaultdict

import pdfplumber

class PDFProcessor:
    """Extract ordered text lines and table rows from a PDF document.

    Results accumulate in ``self.all_text``: a mapping from a running row
    index to ``{'page', 'allrow', 'type', 'inside'}`` where ``type`` is
    ``'text'``, ``'excel'`` (a stringified table row), ``'頁眉'`` (page
    header) or ``'頁腳'`` (page footer).
    """

    def __init__(self, filepath):
        # Path of the PDF being processed.
        self.filepath = filepath
        # NOTE(review): the handle is opened here and never explicitly
        # closed; batch callers may want a close()/context-manager wrapper.
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)  # row index -> extracted item
        self.allrow = 0       # running row counter across all pages
        self.last_num = 0     # last row index of the previously processed page

    def check_lines(self, page, top, buttom):
        """Merge the words of *page* into logical text lines.

        Words whose ``top`` coordinates differ by at most 2pt are treated
        as the same visual line.  A visual line that runs to the right
        margin and does not end with a terminator matched by ``check_re``
        is soft-wrapped, i.e. joined with the following line.

        ``top``/``buttom`` bound the vertical region to scan; either may
        be ``''`` meaning "unbounded" on that side.  Returns the region's
        text with newline characters separating logical lines.
        """
        lines = page.extract_words()
        text = ''
        last_top = 0
        last_check = 0
        # Tokens that legitimately terminate a line, so no soft-wrap join.
        # (Hoisted out of the loop; raw string avoids the invalid \d escape.)
        check_re = r'(?:。|;|單位:元|單位:萬元|幣種:人民幣|\d|報告(?:全文)?(?:(修訂版)|(修訂稿)|(更正后))?)$'
        for l in range(len(lines)):
            each_line = lines[l]
            if top == '' and buttom == '':
                if abs(last_top - each_line['top']) <= 2:
                    # Same visual line: concatenate directly.
                    text = text + each_line['text']
                elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 and not re.search(check_re, text):
                    # Previous line reached the right margin and is not
                    # terminated: soft wrap, keep joining.  (This branch
                    # deliberately uses 0.9 where the bounded branches
                    # below use 0.85.)
                    text = text + each_line['text']
                else:
                    text = text + '\n' + each_line['text']
            elif top == '':
                # Only keep words below *buttom* (e.g. after a table).
                if each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            else:
                # Keep only words strictly between *buttom* and *top*.
                if each_line['top'] < top and each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            last_top = each_line['top']
            # > 0 when the word ends past 85% of the page width, i.e. the
            # line runs to the right margin (candidate for soft-wrapping).
            last_check = each_line['x1'] - page.width * 0.85

        return text

    def drop_empty_cols(self, data):
        """Remove columns whose cells are all empty strings."""
        transposed_data = list(map(list, zip(*data)))  # rows -> columns
        # The original used `cell is ''`, which relies on string interning
        # and can miss non-interned empty strings; compare by equality.
        filtered_data = [col for col in transposed_data if not all(cell == '' for cell in col)]
        result = list(map(list, zip(*filtered_data)))  # columns -> rows
        return result

    @staticmethod
    def keep_visible_lines(obj):
        """pdfplumber object filter: keep only visible objects.

        A ``rect`` is kept when it has a fill colour
        (``non_stroking_color`` not None) and is at least 1pt in one
        dimension; a ``char`` is kept when both its stroke and fill
        colours are set.  Any other object type passes unchanged.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return obj['stroking_color'] is not None and obj['non_stroking_color'] is not None
        return True

    def extract_text_and_tables(self, page):
        """Extract text and tables from *page* in top-to-bottom order.

        Text above each table is emitted first (type 'text'), then the
        table's rows (type 'excel'), then any text below the last table.
        Finally the page's first and last rows are re-tagged as header or
        footer when they match the heuristics.
        """
        buttom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if len(tables) >= 1:
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    # Table lies entirely above the region already consumed.
                    pass
                else:
                    count -= 1
                    top = table.bbox[1]
                    # Text between the previous table bottom and this table.
                    text = self.check_lines(page, top, buttom)
                    text_list = text.split('\n')
                    for _t in range(len(text_list)):
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'text', 'inside': text_list[_t]}
                        self.allrow += 1

                    buttom = table.bbox[3]
                    new_table = table.extract()
                    # Merge continuation rows (first cell None) upward into
                    # the last row that had a first-column value.
                    r_count = 0
                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] is None:
                            r_count += 1
                            for c in range(len(row)):
                                if row[c] is not None and row[c] not in ['', ' ']:
                                    if new_table[r - r_count][c] is None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] += row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0

                    # Keep only non-empty merged rows, normalising cells.
                    end_table = []
                    for row in new_table:
                        if row[0] is not None:
                            cell_list = []
                            cell_check = False
                            for cell in row:
                                if cell is not None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                if cell != '':
                                    cell_check = True
                                cell_list.append(cell)
                            if cell_check:
                                end_table.append(cell_list)
                    end_table = self.drop_empty_cols(end_table)

                    for row in end_table:
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'excel', 'inside': str(row)}
                        self.allrow += 1

                    if count == 0:
                        # Remaining text below the last table on the page.
                        text = self.check_lines(page, '', buttom)
                        text_list = text.split('\n')
                        for _t in range(len(text_list)):
                            self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                          'type': 'text', 'inside': text_list[_t]}
                            self.allrow += 1

        else:
            # Page without tables: plain text only.
            text = self.check_lines(page, '', '')
            text_list = text.split('\n')
            for _t in range(len(text_list)):
                self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                              'type': 'text', 'inside': text_list[_t]}
                self.allrow += 1

        # Header/footer tagging.  A header line ends like "…報告"; a footer
        # is made of digits/page markers.  A row containing '[' is a
        # stringified table row and is never re-tagged.
        first_re = r'[^計](?:報告(?:全文)?(?:(修訂版)|(修訂稿)|(更正后))?)$'
        end_re = r'^(?:\d|\\|\/|第|共|頁|-|_| ){1,}'
        if self.last_num == 0:
            # First page: its first content row sits at index 1.
            try:
                first_text = str(self.all_text[1]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[1]['type'] = '頁眉'
                # Footer check made independent of the header match, for
                # consistency with the later-pages branch below (the
                # original nested it inside the header condition).
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '頁腳'
            except Exception:
                # Best effort: malformed/empty page — report and continue.
                print(page.page_number)
        else:
            try:
                first_text = str(self.all_text[self.last_num + 2]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[self.last_num + 2]['type'] = '頁眉'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '頁腳'
            except Exception:
                print(page.page_number)

        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Run extraction over every page of the document in order."""
        for i in range(len(self.pdf.pages)):
            self.extract_text_and_tables(self.pdf.pages[i])

    def save_all_text(self, path):
        """Write all extracted rows to *path*, one JSON object per line."""
        with open(path, 'w', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')


def process_all_pdfs_in_folder(folder_path, output_folder='RAG_ASMPLE_DATAS_TXTS'):
    """Parse every file in *folder_path* and write one .txt per PDF.

    Files are processed in reverse lexicographic order.  A failure on one
    file is reported and does not stop the batch.

    Args:
        folder_path: directory whose files are fed to PDFProcessor.
        output_folder: directory receiving the parsed .txt files; created
            if missing.  The default preserves the original output path.
    """
    os.makedirs(output_folder, exist_ok=True)
    file_paths = sorted(glob.glob(f'{folder_path}/*'), reverse=True)

    for file_path in file_paths:
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            processor.process_pdf()
            # os.path.basename is portable, unlike splitting on '/'.
            txt_name = os.path.basename(file_path).replace('.pdf', '.txt')
            processor.save_all_text(os.path.join(output_folder, txt_name))
        except Exception as exc:
            # The original bare `except: print('check')` swallowed every
            # error without context; report which file failed and why.
            print(f'check: failed on {file_path}: {exc}')

if __name__ == '__main__':
    # 需要解析的pdf文件路徑
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海愛旭新能源股份有限公司__600732__愛旭股份__2019年__年度報(bào)告.pdf'
    # pdf解析后的txt內(nèi)容文件
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海愛旭新能源股份有限公司__600732__愛旭股份__2019年__年度報(bào)告.txt'
    processor = PDFProcessor(pdf_path)
    processor.process_pdf()
    processor.save_all_text(out_path)

提取PDF中的圖片

提取PDF中的圖片并保存到本地

import pdfplumber
import os

# 定義函數(shù)用于提取PDF中的圖片并保存
def extract_images_from_pdf(pdf_file, output_folder):
    # 創(chuàng)建輸出文件夾,如果不存在的話
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    with pdfplumber.open(pdf_file) as pdf:
        # 遍歷每一頁
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f'頁碼:{page.page_number}')
            print(f'頁面寬度:{page.width}')
            print(f'頁面高度:{page.height}')
            
            # 獲取該頁的所有圖片
            images = page.images
            
            # 遍歷該頁的所有圖片
            for idx, image in enumerate(images, start=1):
                # 獲取圖片的二進(jìn)制數(shù)據(jù)
                image_data = image['stream'].get_data()
                
                # 構(gòu)建圖片文件名
                image_filename = os.path.join(output_folder, f'image_{page_number}_{idx}.png')
                
                # 保存圖片到文件
                with open(image_filename, 'wb') as f:
                    f.write(image_data)
                    print(f'圖片已保存至:{image_filename}')

# Example usage
pdf_file = 'example.pdf'
output_folder = 'extracted_images'
extract_images_from_pdf(pdf_file, output_folder)

提取pdf 表格文本

保存為excel文件

import pdfplumber
from openpyxl import Workbook

# 定義函數(shù)用于提取PDF中的表格并保存為Excel文件
def extract_tables_to_excel(pdf_file, excel_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        workbook = Workbook()
        sheet = workbook.active
        
        # 遍歷每一頁
        for page in pdf.pages:
            # 提取該頁的表格
            table = page.extract_table()
            
            # 如果表格存在,則將其寫入Excel文件
            if table:
                for row in table:
                    sheet.append(row)
        
        # 保存Excel文件
        workbook.save(excel_output_file)

# Example usage
pdf_file = 'example.pdf'
excel_output_file = 'tables.xlsx'
extract_tables_to_excel(pdf_file, excel_output_file)

保存為文本文件

import pdfplumber

# 定義函數(shù)用于提取PDF中的表格并保存為文本文件
def extract_tables_to_text(pdf_file, text_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            # 遍歷每一頁
            for page in pdf.pages:
                # 提取該頁的表格
                table = page.extract_table()
                
                # 如果表格存在,則將其寫入文本文件
                if table:
                    for row in table:
                        output.write('\t'.join(str(cell) for cell in row) + '\n')

# Example usage
pdf_file = 'example.pdf'
text_output_file = 'tables.txt'
extract_tables_to_text(pdf_file, text_output_file)

提取PDF純文本

import pdfplumber

# 定義函數(shù)用于提取PDF中的純文本并保存為文本文件
def extract_text_to_file(pdf_file, text_output_file):
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            # 遍歷每一頁
            for page in pdf.pages:
                # 提取該頁的文本
                text = page.extract_text()
                
                # 如果文本存在,則將其寫入文本文件
                if text:
                    output.write(text)

# Example usage
pdf_file = 'example.pdf'
text_output_file = 'text.txt'
extract_text_to_file(pdf_file, text_output_file)

讀取富文本txt

python 讀取文件函數(shù)有三種 read()、readline()、readlines()

  • read() 一次性讀取所有文本
  • readline() 讀取第一行的內(nèi)容
  • readlines() 讀取全部?jī)?nèi)容,以數(shù)列的格式返回
# Read the whole file at once
with open('story.txt', 'r', encoding='utf-8') as f:
    data = f.read()
    print(data)

# Read only the first line
with open('story.txt', 'r', encoding='utf-8') as f:
    data = f.readline()
    print(data)

# Read all lines, stripping the trailing newline from each
with open('story.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip('\n')
        print(line)

以上就是python利用pdfplumber進(jìn)行pdf文檔解析提取的詳細(xì)內(nèi)容,更多關(guān)于python pdfplumber解析pdf的資料請(qǐng)關(guān)注腳本之家其它相關(guān)文章!

相關(guān)文章

最新評(píng)論