詳解如何通過Python實現批量數據提取
每天面對成堆的發票,無論是發票還是承兌單據,抑或是其他各類公司數據,都需要從照片、PDF等不同格式的內容中提取,因此我們有必要提升快速辦公的能力。
因此,我們的目標要求就十分明顯了:首先要從圖片中獲取數據,其次將數據統一導入到EXCEL中。
配置需求
1.ImageMagick
2.tesseract-OCR
3.Python3.7
4.from PIL import Image as PI
5.import io
6.import os
7.import pyocr.builders
8.from cnocr import CnOcr
9.import xlwt
分析上圖發現票據金額為“貳拾萬元整”,數據金額為大寫中文,因此在導入Excel之前我們需要將票據金額的數據轉換成數字的格式。基于此,我們需要首先完成大寫漢字和數字的轉換。
def chineseNumber2Int(strNum: str):
    """Convert an uppercase (financial) Chinese numeral string to an int.

    The string is read left to right. Digits (壹..玖) are multiplied by the
    small unit that follows them (拾/佰/仟) and summed into the current
    section; 萬 multiplies the current section and 億 multiplies everything
    accumulated so far. This keeps compound groups such as "貳佰叁拾萬"
    together (2,300,000) instead of scaling only the last digit.

    Both traditional and simplified glyphs are accepted, because OCR engines
    may return either. Any other character (for example "零" or the trailing
    "元整") is ignored.

    Args:
        strNum: Text containing the numeral, e.g. "貳拾萬元整".

    Returns:
        The integer value, or 0 when the text contains no numeral at all.
        A unit with no preceding digit counts as one of that unit, so
        "拾萬" is 100000.
    """
    digit_values = {
        '壹': 1, '貳': 2, '贰': 2, '叁': 3, '叄': 3, '肆': 4, '伍': 5,
        '陸': 6, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
    }
    small_units = {'拾': 10, '佰': 100, '仟': 1000}
    wan_chars = ('萬', '万')
    yi_chars = ('億', '亿')

    total = 0    # value of the sections already closed by 萬 / 億
    section = 0  # value accumulated since the last 萬 / 億
    digit = 0    # digit waiting for its unit

    for ch in strNum:
        if ch in digit_values:
            digit = digit_values[ch]
        elif ch in small_units:
            # "拾" without a leading digit means "one ten".
            section += (digit or 1) * small_units[ch]
            digit = 0
        elif ch in wan_chars:
            total += ((section + digit) or 1) * 10 ** 4
            section = 0
            digit = 0
        elif ch in yi_chars:
            # 億 scales everything read so far, e.g. "壹萬億" is 10 ** 12.
            total = ((total + section + digit) or 1) * 10 ** 8
            section = 0
            digit = 0

    return total + section + digit
通過上述代碼即可實現大寫漢字與數字的轉換,例如輸入“貳拾萬元整”即可導出“200000”,將其轉換成數字后即可極大地簡化表格的操作,同時也有利于數據歸檔。
接下來,我們需要分析發票的內部內容,分析下圖可知,我們需要獲取以下幾個數據內容:“出票日期”、“匯票到賬日期”、“票據號碼”、“收款人”、“票據金額”、“出票人”,可以通過畫圖軟件獲取精準定位。
如圖,小黑點即鼠標所在地,畫圖軟件左下角即它的坐標。
提取出票日期
def text1(new_img):
    """Extract the issue date from an invoice image.

    Crops the fixed pixel box (80, 143)-(162, 162) out of ``new_img`` and
    runs the module-level pyocr ``tool`` on the cropped region.

    Args:
        new_img: Image object providing ``crop``.

    Returns:
        The recognized text, converted with ``str``.
    """
    date_box = (80, 143, 162, 162)  # (left, top, right, bottom)
    date_region = new_img.crop(date_box)
    # Uncomment to preview the cropped region while tuning coordinates:
    # date_region.show()
    recognized = tool.image_to_string(date_region)
    print(recognized)
    return str(recognized)
提取金額
def text2(new_img):
    """Extract the amount from an invoice image as an integer.

    Crops the fixed pixel box (224, 355)-(585, 380), saves it to
    "img/tmp.png", runs the module-level ``ocr`` engine on that file and
    converts the joined first element of the result with
    ``chineseNumber2Int``.

    Args:
        new_img: Image object providing ``crop``.

    Returns:
        The value returned by ``chineseNumber2Int`` for the recognized text.
    """
    amount_box = (224, 355, 585, 380)  # (left, top, right, bottom)
    amount_region = new_img.crop(amount_box)
    # Uncomment to preview the cropped region while tuning coordinates:
    # amount_region.show()
    amount_region.save("img/tmp.png")
    ocr_result = ocr.ocr("img/tmp.png")
    # Presumably the first recognized line, as a sequence of characters.
    amount_text = "".join(ocr_result[0])
    amount = chineseNumber2Int(amount_text)
    print(amount)
    return amount
提取出票人
def text3(new_img):
    """Extract the drawer (issuer) name from an invoice image.

    Crops the fixed pixel box (177, 207)-(506, 231), saves it to
    "img/tmp.png" and runs the module-level ``ocr`` engine on that file.

    Args:
        new_img: Image object providing ``crop``.

    Returns:
        The joined first element of the OCR result.
    """
    drawer_box = (177, 207, 506, 231)  # (left, top, right, bottom)
    drawer_region = new_img.crop(drawer_box)
    # Uncomment to preview the cropped region while tuning coordinates:
    # drawer_region.show()
    drawer_region.save("img/tmp.png")
    ocr_result = ocr.ocr("img/tmp.png")
    # Presumably the first recognized line, as a sequence of characters.
    drawer = "".join(ocr_result[0])
    print(drawer)
    return drawer
提取付款行
def text4(new_img):
    """Extract the paying bank name from an invoice image.

    Crops the fixed pixel box (177, 274)-(492, 311), saves it to
    "img/tmp.png" and runs the module-level ``ocr`` engine on that file.

    Args:
        new_img: Image object providing ``crop``.

    Returns:
        The joined first element of the OCR result.
    """
    bank_box = (177, 274, 492, 311)  # (left, top, right, bottom)
    bank_region = new_img.crop(bank_box)
    # Uncomment to preview the cropped region while tuning coordinates:
    # bank_region.show()
    bank_region.save("img/tmp.png")
    ocr_result = ocr.ocr("img/tmp.png")
    # Presumably the first recognized line, as a sequence of characters.
    bank_name = "".join(ocr_result[0])
    print(bank_name)
    return bank_name
提取匯票到賬日期
def text5(new_img):
    """Extract the draft maturity date from an invoice image.

    Crops the fixed pixel box (92, 166)-(176, 184) out of ``new_img`` and
    runs the module-level pyocr ``tool`` on the cropped region.

    Args:
        new_img: Image object providing ``crop``.

    Returns:
        Whatever ``tool.image_to_string`` returns for the cropped region.
    """
    maturity_box = (92, 166, 176, 184)  # (left, top, right, bottom)
    maturity_region = new_img.crop(maturity_box)
    # Uncomment to preview the cropped region while tuning coordinates:
    # maturity_region.show()
    recognized = tool.image_to_string(maturity_region)
    print(recognized)
    return recognized
提取票據號碼
def text6(new_img):
    """Extract the bill number from an invoice image.

    Crops the fixed pixel box (598, 166)-(870, 182) out of ``new_img`` and
    runs the module-level pyocr ``tool`` on the cropped region.

    Args:
        new_img: Image object providing ``crop``.

    Returns:
        Whatever ``tool.image_to_string`` returns for the cropped region.
    """
    number_box = (598, 166, 870, 182)  # (left, top, right, bottom)
    number_region = new_img.crop(number_box)
    # Uncomment to preview the cropped region while tuning coordinates:
    # number_region.show()
    recognized = tool.image_to_string(number_region)
    print(recognized)
    return recognized
在將數據全部提取完成之后,即進入設置環節,我們需要首先將所有賬單文件進行提取,獲取它們的文件名和路徑。
# OCR engines shared by the extraction helpers: CnOcr for the Chinese text
# fields, the first pyocr backend for the date / number fields.
ocr = CnOcr()
available_tools = pyocr.get_available_tools()
if not available_tools:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError("pyocr found no OCR tool; install tesseract-OCR first")
tool = available_tools[0]

filePath = 'img'
# os.walk yields the top directory first. Only its files are used, because
# every name is later joined directly onto filePath; walking on would
# replace the list with the files of the last subdirectory visited.
_, _, top_level_files = next(os.walk(filePath), (filePath, [], []))
# "tmp.png" is the scratch file the extraction helpers write into this same
# directory, so it must not be treated as an invoice on a later run.
img_name = [name for name in top_level_files if name != 'tmp.png']
在獲取完整后,即可進行數據導入Excel的操作。
# Build one worksheet with a header row followed by one row per invoice.
count = 1  # index of the next data row; row 0 holds the header
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)

# The header is written once, before any data row.
col = ('年份', '出票日期', '金額', '出票人', '付款行全稱', '匯票到日期', '備注')
for j, title in enumerate(col):
    sheet.write(0, j, title)

for i in img_name:
    img_url = filePath + "/" + i
    with open(img_url, 'rb') as f:
        a = f.read()
    new_img = PI.open(io.BytesIO(a))

    shijian = text1(new_img)
    # The first four characters go to the year column, everything after the
    # fifth character to the date column.
    # Assumes the OCR'd date looks like "YYYY?..." - TODO confirm.
    sheet.write(count, 0, shijian[0:4])
    sheet.write(count, 1, shijian[5:])
    sheet.write(count, 2, text2(new_img))
    sheet.write(count, 3, text3(new_img))
    sheet.write(count, 4, text4(new_img))
    sheet.write(count, 5, text5(new_img))
    sheet.write(count, 6, text6(new_img))
    count = count + 1

# Save once, after every row is in the sheet. Saving inside the loop before
# the row was written left the last invoice out of the file.
# NOTE(review): xlwt writes the binary .xls format; the '1.csv' name is kept
# so existing consumers still find the file - consider renaming it.
book.save('1.csv')
至此,完整流程結束。
附上全部源碼
from wand.image import Image
from PIL import Image as PI
import pyocr
import io
import re
import os
import shutil
import pyocr.builders
from cnocr import CnOcr
import requests
import xlrd
import xlwt
from openpyxl import load_workbook


def chineseNumber2Int(strNum: str):
    """Convert an uppercase (financial) Chinese numeral string to an int.

    Digits (壹..玖) are multiplied by the small unit that follows them
    (拾/佰/仟) and summed into the current section; 萬 multiplies the current
    section and 億 multiplies everything accumulated so far, so compound
    groups such as "貳佰叁拾萬" evaluate to 2,300,000.

    Both traditional and simplified glyphs are accepted. Any other character
    (for example "零" or the trailing "元整") is ignored.

    Args:
        strNum: Text containing the numeral, e.g. "貳拾萬元整".

    Returns:
        The integer value, or 0 when the text contains no numeral at all.
        A unit with no preceding digit counts as one of that unit.
    """
    digit_values = {
        '壹': 1, '貳': 2, '贰': 2, '叁': 3, '叄': 3, '肆': 4, '伍': 5,
        '陸': 6, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
    }
    small_units = {'拾': 10, '佰': 100, '仟': 1000}
    wan_chars = ('萬', '万')
    yi_chars = ('億', '亿')

    total = 0    # value of the sections already closed by 萬 / 億
    section = 0  # value accumulated since the last 萬 / 億
    digit = 0    # digit waiting for its unit

    for ch in strNum:
        if ch in digit_values:
            digit = digit_values[ch]
        elif ch in small_units:
            # "拾" without a leading digit means "one ten".
            section += (digit or 1) * small_units[ch]
            digit = 0
        elif ch in wan_chars:
            total += ((section + digit) or 1) * 10 ** 4
            section = 0
            digit = 0
        elif ch in yi_chars:
            # 億 scales everything read so far.
            total = ((total + section + digit) or 1) * 10 ** 8
            section = 0
            digit = 0

    return total + section + digit


def text1(new_img):
    """Extract the issue date: OCR the box (80, 143)-(162, 162) with pyocr."""
    image_text1 = new_img.crop((80, 143, 162, 162))
    # Uncomment to preview the cropped region:
    # image_text1.show()
    txt1 = tool.image_to_string(image_text1)
    print(txt1)
    return str(txt1)


def text2(new_img):
    """Extract the amount: OCR the box (224, 355)-(585, 380) with CnOcr.

    The recognized uppercase numeral is converted with chineseNumber2Int.
    """
    image_text2 = new_img.crop((224, 355, 585, 380))
    # Uncomment to preview the cropped region:
    # image_text2.show()
    image_text2.save("img/tmp.png")
    temp = ocr.ocr("img/tmp.png")
    # Presumably the first recognized line, as a sequence of characters.
    temp = "".join(temp[0])
    txt2 = chineseNumber2Int(temp)
    print(txt2)
    return txt2


def text3(new_img):
    """Extract the drawer: OCR the box (177, 207)-(506, 231) with CnOcr."""
    image_text3 = new_img.crop((177, 207, 506, 231))
    # Uncomment to preview the cropped region:
    # image_text3.show()
    image_text3.save("img/tmp.png")
    temp = ocr.ocr("img/tmp.png")
    txt3 = "".join(temp[0])
    print(txt3)
    return txt3


def text4(new_img):
    """Extract the paying bank: OCR the box (177, 274)-(492, 311) with CnOcr."""
    image_text4 = new_img.crop((177, 274, 492, 311))
    # Uncomment to preview the cropped region:
    # image_text4.show()
    image_text4.save("img/tmp.png")
    temp = ocr.ocr("img/tmp.png")
    txt4 = "".join(temp[0])
    print(txt4)
    return txt4


def text5(new_img):
    """Extract the maturity date: OCR the box (92, 166)-(176, 184) with pyocr."""
    image_text5 = new_img.crop((92, 166, 176, 184))
    # Uncomment to preview the cropped region:
    # image_text5.show()
    txt5 = tool.image_to_string(image_text5)
    print(txt5)
    return txt5


def text6(new_img):
    """Extract the bill number: OCR the box (598, 166)-(870, 182) with pyocr."""
    image_text6 = new_img.crop((598, 166, 870, 182))
    # Uncomment to preview the cropped region:
    # image_text6.show()
    txt6 = tool.image_to_string(image_text6)
    print(txt6)
    return txt6


# OCR engines shared by the helpers above.
ocr = CnOcr()
available_tools = pyocr.get_available_tools()
if not available_tools:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError("pyocr found no OCR tool; install tesseract-OCR first")
tool = available_tools[0]

filePath = 'img'
# Only the top-level files are used, because every name is joined directly
# onto filePath below; walking on would replace the list with the files of
# the last subdirectory visited.
_, _, top_level_files = next(os.walk(filePath), (filePath, [], []))
# "tmp.png" is the scratch file text2..text4 write into this directory, so
# it must not be treated as an invoice on a later run.
img_name = [name for name in top_level_files if name != 'tmp.png']

count = 1  # index of the next data row; row 0 holds the header
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)

# The header is written once, before any data row.
col = ('年份', '出票日期', '金額', '出票人', '付款行全稱', '匯票到日期', '備注')
for j, title in enumerate(col):
    sheet.write(0, j, title)

for i in img_name:
    img_url = filePath + "/" + i
    with open(img_url, 'rb') as f:
        a = f.read()
    new_img = PI.open(io.BytesIO(a))

    shijian = text1(new_img)
    # The first four characters go to the year column, everything after the
    # fifth character to the date column.
    # Assumes the OCR'd date looks like "YYYY?..." - TODO confirm.
    sheet.write(count, 0, shijian[0:4])
    sheet.write(count, 1, shijian[5:])
    sheet.write(count, 2, text2(new_img))
    sheet.write(count, 3, text3(new_img))
    sheet.write(count, 4, text4(new_img))
    sheet.write(count, 5, text5(new_img))
    sheet.write(count, 6, text6(new_img))
    count = count + 1

# Save once, after every row is in the sheet. Saving inside the loop before
# the row was written left the last invoice out of the file.
# NOTE(review): xlwt writes the binary .xls format; the '1.csv' name is kept
# so existing consumers still find the file - consider renaming it.
book.save('1.csv')
以上就是詳解如何通過Python實現批量數據提取的詳細內容,更多關于Python批量數據提取的資料請關注腳本之家其它相關文章!
相關文章
python備份文件以及mysql數據庫的腳本代碼
最近正在學習python,看了幾天了,所以寫個小腳本練習練習,沒什么含金量,只當練手 2013-06-06 Keras搭建Mask R-CNN實例分割平臺實現源碼
這篇文章主要為大家介紹了Keras搭建Mask R-CNN實例分割平臺實現源碼,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪 2022-05-05 淺析python 中__name__ = '__main__' 的作用
這篇文章主要介紹了python 中__name__ = '__main__' 的作用,對于初學者來說很有幫助,需要的朋友可以參考下 2014-07-07 詳解Python 爬取13個旅游城市,告訴你五一大家最愛去哪玩?
這篇文章主要介紹了Python 爬取13個旅游城市,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧 2019-05-05 Python使用struct處理二進制(pack和unpack用法)
這篇文章主要介紹了Python使用struct處理二進制(pack和unpack用法),幫助大家更好的理解和使用python,感興趣的朋友可以了解下 2020-11-11 Python區塊鏈Creating Miners教程
這篇文章主要為大家介紹了Python區塊鏈Creating Miners教程,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪 2022-05-05