詳解如何通過Python實現批量數據提取

更新時間：2023年03月21日 09:58:41 作者：ncq的小舔狗

每天面對成堆的發票，無論是發票還是承兌單據，抑或是其他各類公司數據要從照片、PDF等不同格式的內容中提取，我們都有必要進行快速辦公的能力提升。本文就教你如何利用Python實現批量數據提取吧。

配置需求

1.ImageMagick

2.tesseract-OCR

3.Python3.7

4.from PIL import Image as PI

5.import io

6.import os

7.import pyocr.builders

8.from cnocr import CnOcr

9.import xlwt

分析上圖發現票據金額為“貳拾萬元整”，數據金額為大寫中文，因此在導入Excel之前我們需要將金額票據的數據轉換成數字的格式，基于此，我們需要首先完成大寫漢字和數字的轉換。

def chineseNumber2Int(strNum: str):
    """Convert an uppercase (financial) Chinese numeral string to an integer.

    The text is scanned left to right.  Digits (壹..玖) combine with the
    in-section units 拾/佰/仟, and every 萬 / 億 scales the whole group that
    precedes it, so "壹佰貳拾萬" is 1200000.  (The earlier implementation
    multiplied only the last digit group and returned 200100 for it.)

    Traditional and simplified character forms are both accepted.  A unit
    with no digit in front of it counts as one of that unit
    ("拾萬" -> 100000).  Parsing stops at 元/圓: 角 and 分 cannot be
    represented in the integer result and are dropped.  Every other
    character (零, 整, punctuation, OCR noise) is ignored.

    Args:
        strNum: Text such as "貳拾萬元整".

    Returns:
        The integer value, or 0 when the text contains no numeral.
    """
    digit_values = {
        '壹': 1, '貳': 2, '贰': 2, '叁': 3, '叄': 3, '參': 3, '肆': 4,
        '伍': 5, '陸': 6, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
    }
    section_units = {'拾': 10, '佰': 100, '仟': 1000}
    wan_chars = ('萬', '万')
    yi_chars = ('億', '亿')
    yuan_chars = ('元', '圓', '圆')

    total = 0    # part of the value already scaled by 萬 / 億
    section = 0  # value of the current group below 萬
    digit = 0    # digit still waiting for its unit

    for ch in strNum:
        if ch in digit_values:
            digit = digit_values[ch]
        elif ch in section_units:
            # A bare unit ("拾") is read as one of that unit.
            section += (digit or 1) * section_units[ch]
            digit = 0
        elif ch in wan_chars:
            # 萬 scales everything gathered since the previous big unit.
            total += ((section + digit) or 1) * 10000
            section = digit = 0
        elif ch in yi_chars:
            # 億 scales the complete value so far (handles "壹萬億").
            total = ((total + section + digit) or 1) * 100000000
            section = digit = 0
        elif ch in yuan_chars:
            # Anything after the currency unit is a fraction of a yuan.
            break
    return total + section + digit

通過上述代碼即可實現大寫漢字與數字的轉換，例如輸入“貳拾萬元整”即可導出“200000”，再將其轉換成數字后即可極大地簡化表格的操作，也可以在完成表格操作的同時有利于數據歸檔。

接下來，我們需要分析發票的內部內容，分析下圖可知，我們需要獲取以下幾個數據內容：“出票日期”、“匯票到賬日期”、“票據號碼”、“收款人”、“票據金額”、“出票人”，可以通過畫圖軟件獲取精準定位。

如圖，小黑點即鼠標所在位置，畫圖軟件左下角顯示的即它的坐標。

提取出票日期

def text1(new_img):
    """Extract the issue date from a bill image.

    Crops the fixed issue-date rectangle out of ``new_img`` and passes the
    crop to the module-level pyocr ``tool``.  The recognized text is
    printed and returned as ``str``.
    """
    # Pixel box of the issue-date field: (left, top, right, bottom).
    date_box = (80, 143, 162, 162)
    recognized = tool.image_to_string(new_img.crop(date_box))
    print(recognized)
    return str(recognized)

提取金額

def text2(new_img):
    """Extract the bill amount from a bill image as a number.

    The amount rectangle is cropped, written to "img/tmp.png" (replacing
    any earlier crop) and recognized with the module-level cnocr instance
    ``ocr``.  The first element of the OCR result (presumably the first
    text line; confirm against the installed cnocr version) is joined into
    one string and converted with ``chineseNumber2Int``.  The converted
    value is printed and returned.
    """
    # Pixel box of the amount field: (left, top, right, bottom).
    amount_box = (224, 355, 585, 380)
    new_img.crop(amount_box).save("img/tmp.png")
    ocr_lines = ocr.ocr("img/tmp.png")
    amount_text = "".join(ocr_lines[0])
    amount = chineseNumber2Int(amount_text)
    print(amount)
    return amount

提取出票人

def text3(new_img):
    """Extract the drawer (issuer) name from a bill image.

    The drawer rectangle is cropped, written to "img/tmp.png" and
    recognized with the module-level cnocr instance ``ocr``.  The first
    element of the OCR result is joined into one string, printed and
    returned.
    """
    # Pixel box of the drawer field: (left, top, right, bottom).
    drawer_box = (177, 207, 506, 231)
    new_img.crop(drawer_box).save("img/tmp.png")
    ocr_lines = ocr.ocr("img/tmp.png")
    drawer = "".join(ocr_lines[0])
    print(drawer)
    return drawer

提取付款行

def text4(new_img):
    """Extract the paying bank name from a bill image.

    The paying-bank rectangle is cropped, written to "img/tmp.png" and
    recognized with the module-level cnocr instance ``ocr``.  The first
    element of the OCR result is joined into one string, printed and
    returned.
    """
    # Pixel box of the paying-bank field: (left, top, right, bottom).
    bank_box = (177, 274, 492, 311)
    new_img.crop(bank_box).save("img/tmp.png")
    ocr_lines = ocr.ocr("img/tmp.png")
    bank_name = "".join(ocr_lines[0])
    print(bank_name)
    return bank_name

提取匯票到賬日期

def text5(new_img):
    """Extract the bill due date from a bill image.

    Crops the fixed due-date rectangle out of ``new_img`` and passes the
    crop to the module-level pyocr ``tool``.  The recognized value is
    printed and returned unchanged.
    """
    # Pixel box of the due-date field: (left, top, right, bottom).
    due_box = (92, 166, 176, 184)
    recognized = tool.image_to_string(new_img.crop(due_box))
    print(recognized)
    return recognized

提取票據號碼

def text6(new_img):
    """Extract the bill number from a bill image.

    Crops the fixed bill-number rectangle out of ``new_img`` and passes
    the crop to the module-level pyocr ``tool``.  The recognized value is
    printed and returned unchanged.
    """
    # Pixel box of the bill-number field: (left, top, right, bottom).
    number_box = (598, 166, 870, 182)
    recognized = tool.image_to_string(new_img.crop(number_box))
    print(recognized)
    return recognized

在將數據全部提取完成之后，即進入設置環節，我們需要首先將所有賬單文件進行提取，獲取它們的文件名和路徑。

# cnocr recognizer used by text2/text3/text4.
ocr = CnOcr()
# First tool reported by pyocr, used by text1/text5/text6.  Indexing [0]
# fails with IndexError when pyocr reports no tool at all.
tool = pyocr.get_available_tools()[0]
filePath = 'img'
# Only the files directly inside filePath are wanted, because each one is
# later opened as filePath + "/" + name.  The previous loop kept the file
# list of whichever directory os.walk visited last, which is wrong as soon
# as filePath contains sub-folders.  A missing folder yields an empty list.
img_name = next(os.walk(filePath), (filePath, [], []))[2]
# text2/text3/text4 write their scratch crop to img/tmp.png; skip it so a
# second run does not process that crop as if it were a bill.
img_name = [name for name in img_name if name != 'tmp.png']

在獲取完整后，即可進行數據導入Excel的操作。

# Next worksheet row to fill; row 0 holds the header.
count = 1
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)

# The header is written once, not once per bill.
# NOTE(review): the last column is headed '備注' but is filled with the
# result of text6 (the bill number); confirm this is intended.
col = ('年份','出票日期','金額','出票人','付款行全稱','匯票到日期','備注')
for j, title in enumerate(col):
    sheet.write(0, j, title)

for i in img_name:
    img_url = filePath+"/"+i
    with open(img_url, 'rb') as f:
        a = f.read()
    new_img = PI.open(io.BytesIO(a))
    shijian = text1(new_img)
    # The first four characters are written as the year, and everything
    # after the fifth character as the rest of the issue date.
    sheet.write(count, 0, shijian[0:4])
    sheet.write(count, 1, shijian[5:])
    sheet.write(count, 2, text2(new_img))
    sheet.write(count, 3, text3(new_img))
    sheet.write(count, 4, text4(new_img))
    sheet.write(count, 5, text5(new_img))
    sheet.write(count, 6, text6(new_img))
    # Save after the row is complete.  Saving before the row was written
    # (as before) left the last bill out of the file on disk.
    # NOTE(review): xlwt produces an .xls workbook, so the '.csv' extension
    # is misleading; the name is kept so existing consumers are unaffected.
    book.save('1.csv')
    count = count + 1

至此，完整流程結(jié)束。

附上源碼全部

from  wand.image import  Image
from PIL import Image as PI
import pyocr
import io
import re
import os
import shutil
import pyocr.builders
from cnocr import CnOcr
import requests
import xlrd
import xlwt
from openpyxl import load_workbook
 
def chineseNumber2Int(strNum: str):
    """Convert an uppercase (financial) Chinese numeral string to an integer.

    The text is scanned left to right.  Digits (壹..玖) combine with the
    in-section units 拾/佰/仟, and every 萬 / 億 scales the whole group that
    precedes it, so "壹佰貳拾萬" is 1200000.  (The earlier implementation
    multiplied only the last digit group and returned 200100 for it.)

    Traditional and simplified character forms are both accepted.  A unit
    with no digit in front of it counts as one of that unit
    ("拾萬" -> 100000).  Parsing stops at 元/圓: 角 and 分 cannot be
    represented in the integer result and are dropped.  Every other
    character (零, 整, punctuation, OCR noise) is ignored.

    Args:
        strNum: Text such as "貳拾萬元整".

    Returns:
        The integer value, or 0 when the text contains no numeral.
    """
    digit_values = {
        '壹': 1, '貳': 2, '贰': 2, '叁': 3, '叄': 3, '參': 3, '肆': 4,
        '伍': 5, '陸': 6, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
    }
    section_units = {'拾': 10, '佰': 100, '仟': 1000}
    wan_chars = ('萬', '万')
    yi_chars = ('億', '亿')
    yuan_chars = ('元', '圓', '圆')

    total = 0    # part of the value already scaled by 萬 / 億
    section = 0  # value of the current group below 萬
    digit = 0    # digit still waiting for its unit

    for ch in strNum:
        if ch in digit_values:
            digit = digit_values[ch]
        elif ch in section_units:
            # A bare unit ("拾") is read as one of that unit.
            section += (digit or 1) * section_units[ch]
            digit = 0
        elif ch in wan_chars:
            # 萬 scales everything gathered since the previous big unit.
            total += ((section + digit) or 1) * 10000
            section = digit = 0
        elif ch in yi_chars:
            # 億 scales the complete value so far (handles "壹萬億").
            total = ((total + section + digit) or 1) * 100000000
            section = digit = 0
        elif ch in yuan_chars:
            # Anything after the currency unit is a fraction of a yuan.
            break
    return total + section + digit
 
 
def text1(new_img):
    """Extract the issue date from a bill image.

    Crops the fixed issue-date rectangle out of ``new_img`` and passes the
    crop to the module-level pyocr ``tool``.  The recognized text is
    printed and returned as ``str``.
    """
    # Pixel box of the issue-date field: (left, top, right, bottom).
    date_box = (80, 143, 162, 162)
    recognized = tool.image_to_string(new_img.crop(date_box))
    print(recognized)
    return str(recognized)
def text2(new_img):
    """Extract the bill amount from a bill image as a number.

    The amount rectangle is cropped, written to "img/tmp.png" (replacing
    any earlier crop) and recognized with the module-level cnocr instance
    ``ocr``.  The first element of the OCR result (presumably the first
    text line; confirm against the installed cnocr version) is joined into
    one string and converted with ``chineseNumber2Int``.  The converted
    value is printed and returned.
    """
    # Pixel box of the amount field: (left, top, right, bottom).
    amount_box = (224, 355, 585, 380)
    new_img.crop(amount_box).save("img/tmp.png")
    ocr_lines = ocr.ocr("img/tmp.png")
    amount_text = "".join(ocr_lines[0])
    amount = chineseNumber2Int(amount_text)
    print(amount)
    return amount
 
def text3(new_img):
    """Extract the drawer (issuer) name from a bill image.

    The drawer rectangle is cropped, written to "img/tmp.png" and
    recognized with the module-level cnocr instance ``ocr``.  The first
    element of the OCR result is joined into one string, printed and
    returned.
    """
    # Pixel box of the drawer field: (left, top, right, bottom).
    drawer_box = (177, 207, 506, 231)
    new_img.crop(drawer_box).save("img/tmp.png")
    ocr_lines = ocr.ocr("img/tmp.png")
    drawer = "".join(ocr_lines[0])
    print(drawer)
    return drawer
def text4(new_img):
    """Extract the paying bank name from a bill image.

    The paying-bank rectangle is cropped, written to "img/tmp.png" and
    recognized with the module-level cnocr instance ``ocr``.  The first
    element of the OCR result is joined into one string, printed and
    returned.
    """
    # Pixel box of the paying-bank field: (left, top, right, bottom).
    bank_box = (177, 274, 492, 311)
    new_img.crop(bank_box).save("img/tmp.png")
    ocr_lines = ocr.ocr("img/tmp.png")
    bank_name = "".join(ocr_lines[0])
    print(bank_name)
    return bank_name
def text5(new_img):
    """Extract the bill due date from a bill image.

    Crops the fixed due-date rectangle out of ``new_img`` and passes the
    crop to the module-level pyocr ``tool``.  The recognized value is
    printed and returned unchanged.
    """
    # Pixel box of the due-date field: (left, top, right, bottom).
    due_box = (92, 166, 176, 184)
    recognized = tool.image_to_string(new_img.crop(due_box))
    print(recognized)
    return recognized
def text6(new_img):
    """Extract the bill number from a bill image.

    Crops the fixed bill-number rectangle out of ``new_img`` and passes
    the crop to the module-level pyocr ``tool``.  The recognized value is
    printed and returned unchanged.
    """
    # Pixel box of the bill-number field: (left, top, right, bottom).
    number_box = (598, 166, 870, 182)
    recognized = tool.image_to_string(new_img.crop(number_box))
    print(recognized)
    return recognized
 
 
 
# cnocr recognizer used by text2/text3/text4.
ocr = CnOcr()

# First tool reported by pyocr, used by text1/text5/text6.  Indexing [0]
# fails with IndexError when pyocr reports no tool at all.
tool = pyocr.get_available_tools()[0]

filePath = 'img'
# Only the files directly inside filePath are wanted, because each one is
# opened below as filePath + "/" + name.  The previous loop kept the file
# list of whichever directory os.walk visited last, which is wrong as soon
# as filePath contains sub-folders.  A missing folder yields an empty list.
img_name = next(os.walk(filePath), (filePath, [], []))[2]
# text2/text3/text4 write their scratch crop to img/tmp.png; skip it so a
# second run does not process that crop as if it were a bill.
img_name = [name for name in img_name if name != 'tmp.png']

# Next worksheet row to fill; row 0 holds the header.
count = 1

book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)

# The header is written once, not once per bill.
# NOTE(review): the last column is headed '備注' but is filled with the
# result of text6 (the bill number); confirm this is intended.
col = ('年份','出票日期','金額','出票人','付款行全稱','匯票到日期','備注')
for j, title in enumerate(col):
    sheet.write(0, j, title)

for i in img_name:
    img_url = filePath+"/"+i
    with open(img_url, 'rb') as f:
        a = f.read()
    new_img = PI.open(io.BytesIO(a))
    shijian = text1(new_img)
    # The first four characters are written as the year, and everything
    # after the fifth character as the rest of the issue date.
    sheet.write(count, 0, shijian[0:4])
    sheet.write(count, 1, shijian[5:])
    sheet.write(count, 2, text2(new_img))
    sheet.write(count, 3, text3(new_img))
    sheet.write(count, 4, text4(new_img))
    sheet.write(count, 5, text5(new_img))
    sheet.write(count, 6, text6(new_img))
    # Save after the row is complete.  Saving before the row was written
    # (as before) left the last bill out of the file on disk.
    # NOTE(review): xlwt produces an .xls workbook, so the '.csv' extension
    # is misleading; the name is kept so existing consumers are unaffected.
    book.save('1.csv')
    count = count + 1

以上就是詳解如何通過Python實現批量數據提取的詳細內容，更多關于Python批量數據提取的資料請關注腳本之家其它相關文章！

您可能感興趣的文章: