将有大量表格图片的Word/PDF识别成文本Word(Python调用阿里云读光接口)
将含有大量表格图片的Word文档转换成文本格式(输出依然是Word文档)。暂且记录一下,一时间写得比较乱,有空了再好好梳理和调整。
主要是通过“阿里云读光PDF识别”(官方网址)的API实现的,流程是:先把包含大量表格图片的Word转换成PDF,然后将PDF拆分(因为API每次调用最多识别20页),再逐个请求识别,得到每个拆分文件对应的Word,最后将这些Word合并起来。
#!/usr/bin/env python
# coding=utf-8
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import urllib.request
import urllib.parse
import json
import time
import base64
from os.path import abspath
from win32com import client
# Endpoint of the Aliyun "DuGuang" PDF recognition API.
url_request = "https://generalpdf.market.alicloudapi.com/ocrservice/pdf"

# AppCode credential for the API (has to be purchased with a real-name
# verified personal account; the first 500 calls are free).
AppCode = ""

# HTTP headers sent with every recognition request.
headers = {
    'Authorization': f'APPCODE {AppCode}',
    'Content-Type': 'application/json; charset=UTF-8',
}
def split_pdf(read_file, pagelenth, out_dir=None):
    """Split a PDF into consecutive pieces of at most ``pagelenth`` pages.

    The pieces are named ``1.pdf``, ``2.pdf``, ... in page order.  Splitting
    is needed because the recognition API only accepts a limited number of
    pages per call (the script uses 20-page pieces).

    Args:
        read_file: Path of the PDF to split.
        pagelenth: Maximum number of pages per piece; must be >= 1.
        out_dir: Directory the pieces are written to.  ``None`` (default)
            writes them into the current working directory.

    Returns:
        The paths of the pieces that were written, in page order.  The list
        is empty (or partial) when the source could not be read or written;
        in that case the error is printed rather than raised.

    Raises:
        ValueError: If ``pagelenth`` is smaller than 1 (a non-positive chunk
            size would never advance through the document).
    """
    if pagelenth < 1:
        raise ValueError(f'pagelenth must be >= 1, got {pagelenth!r}')

    written = []
    try:
        # ``with`` guarantees the source is closed and avoids touching an
        # unbound file handle when open() itself fails.
        with open(read_file, 'rb') as fp_read_file:
            pdf_input = PdfFileReader(fp_read_file)  # parse the source PDF
            page_count = pdf_input.getNumPages()
            print(page_count)  # report the total page count

            chunk_starts = range(0, page_count, pagelenth)
            for write_file, first in enumerate(chunk_starts, start=1):
                start_page = first + 1  # 1-based, used for messages only
                end_page = min(first + pagelenth, page_count)  # clamp last chunk
                pdf_file = f'{write_file}.pdf'
                if out_dir is not None:
                    pdf_file = os.path.join(out_dir, pdf_file)
                try:
                    print(f'开始分割{start_page}页-{end_page}页,保存为{pdf_file}......')
                    pdf_output = PdfFileWriter()
                    for i in range(first, end_page):
                        pdf_output.addPage(pdf_input.getPage(i))
                    with open(pdf_file, 'wb') as sub_fp:
                        pdf_output.write(sub_fp)
                    written.append(pdf_file)
                    print(f'完成分割{start_page}页-{end_page}页,保存为{pdf_file}!')
                except IndexError:
                    # Only possible if the reported page count is larger
                    # than the pages that can actually be fetched.
                    print(f'分割页数超过了PDF的页数')
    except Exception as e:
        # Best-effort, as in the original script: report the problem and
        # hand back whatever pieces were completed.
        print(e)
    return written
def posturl(url, data=None):
    """POST ``data`` as JSON to the Aliyun PDF recognition API.

    Args:
        url: Endpoint to post to.
        data: JSON-serialisable request body (see the official API
            documentation for its fields).  ``None`` sends an empty object.

    Returns:
        The response body decoded as UTF-8 (a JSON string), or ``None`` when
        the server answered with an HTTP error status.  In the error case the
        status code and the error body are printed.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    payload = {} if data is None else data
    params = json.dumps(payload).encode(encoding='UTF8')
    req = urllib.request.Request(url, params, headers)
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req) as r:
            return r.read().decode("utf8")
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.read().decode("utf8"))
        # Short pause kept from the original; presumably meant to space out
        # the caller's next request after a failure.
        time.sleep(1)
        return None
def docx_merge(files, final_docx):
    """Merge several Word documents into one by driving Word through COM.

    Each document in ``files`` is opened, its whole content is copied to the
    clipboard and pasted into a newly created document, which is finally
    saved as ``final_docx``.

    Args:
        files: Paths of the documents to merge, in the desired order.
        final_docx: Path of the merged document to write.

    Note:
        Merging may fail with ``com_error: (-2147417848, ...)`` ("the object
        invoked has disconnected from its clients").  Opening the Word
        client manually once and running the script again works around it.
    """
    word = client.gencache.EnsureDispatch('Word.Application')
    word.Visible = True
    try:
        new_document = word.Documents.Add()  # target document
        try:
            for fn in files:
                fn = abspath(fn)
                temp_document = word.Documents.Open(fn)
                word.Selection.WholeStory()
                word.Selection.Copy()  # whole source document -> clipboard
                temp_document.Close()
                # With the source closed the selection is back in the target
                # document; paste the clipboard content there.
                word.Selection.Delete()
                word.Selection.Paste()
            # Resolve the target like the inputs, so a relative path is
            # interpreted relative to this script's working directory.
            new_document.SaveAs(abspath(final_docx))
        finally:
            # 0 == wdDoNotSaveChanges: the document is either already saved
            # or the merge failed, so never prompt the user on close.
            new_document.Close(0)
    finally:
        # Always shut Word down, even when the merge failed half-way.
        word.Quit()
if __name__ == "__main__":
    # Script entry point: split the source PDF, send every piece to the
    # recognition API, store each returned Word document and merge them.

    # Parameters
    path = "../Untitled Folder/"  # folder holding the split pieces (and nothing else)
    final_docx = r'C:/Users/XXX/Desktop/result.docx'  # final merged document
    pdf_source = 'C:/Users/XXX/Desktop/XXX.pdf'  # source PDF

    # NOTE: the pieces are written relative to the current working
    # directory, so `path` has to refer to that same directory.
    split_pdf(pdf_source, pagelenth=20)  # 20 pages per piece

    # os.listdir() returns names in arbitrary order; sort the numbered pieces
    # numerically so that e.g. 10.pdf comes after 2.pdf and the merged
    # document keeps the original page order.
    filenames = sorted(
        (name for name in os.listdir(path) if name.lower().endswith('.pdf')),
        key=lambda name: (0, int(name[:-4]), name)
        if name[:-4].isdecimal() else (1, 0, name),
    )
    print(filenames)  # names of the split pieces
    if not filenames:
        raise SystemExit(f'No split PDF files found in {path!r}')

    docx_list = []
    for index, body in enumerate(filenames):
        with open(os.path.join(path, body), 'rb') as f:
            # The API expects the file content as a base64 *string*.
            encodestr = base64.b64encode(f.read()).decode('utf-8')
        request_body = {'fileBase64': encodestr, "table": True, 'fileType': 'word'}

        html = posturl(url_request, data=request_body)
        if html is None:
            raise RuntimeError(f'Recognition request failed for {body}')

        # Parse the response as JSON; eval() would execute arbitrary text
        # received over the network and cannot handle true/false/null.
        a = json.loads(html)
        result = a.get('fileBase64') if isinstance(a, dict) else None
        if not result:
            raise RuntimeError(
                f'No document returned for {body}: {html[:200]}')

        tmp = f"{index}.docx"  # intermediate document for this piece
        with open(tmp, 'wb') as f:
            f.write(base64.b64decode(result))  # returned Word file is base64 encoded
        docx_list.append(tmp)

    print(docx_list)  # names of the generated docx files
    docx_merge(docx_list, final_docx)
赞 (0)