Recognizing a Word/PDF full of table images into a text Word document (Python + the Alibaba Cloud 读光 OCR API)

This post records how to turn a Word document made up largely of table images into plain text (still in Word format). It was written in a hurry and is somewhat messy; I will tidy it up when I have time.

Everything is done through the API of Alibaba Cloud's 读光 PDF recognition service (official site). The workflow is: first convert the image-heavy Word document to PDF; then split the PDF, because the API recognizes at most 20 pages per call; then send each chunk for recognition, which returns one Word document per chunk; and finally merge those Word documents back into a single file.
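
The Word-to-PDF conversion itself is not shown in the script below. Here is a minimal sketch of that step using the same win32com approach as the rest of the code, assuming Microsoft Word is installed (docx_to_pdf and the paths are placeholder names of mine; 17 is Word's wdFormatPDF constant):

# Sketch of the Word -> PDF step (not in the original script).
# Assumes Microsoft Word is installed; 17 is the wdFormatPDF constant.
from os.path import abspath
from win32com import client

def docx_to_pdf(docx_path, pdf_path):
    word = client.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(abspath(docx_path))
    doc.SaveAs(abspath(pdf_path), FileFormat=17)  # export as PDF
    doc.Close()
    word.Quit()

# e.g. docx_to_pdf('C:/Users/XXX/Desktop/XXX.docx', 'C:/Users/XXX/Desktop/XXX.pdf')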

#!/usr/bin/env python
# coding=utf-8
from PyPDF2 import PdfFileReader, PdfFileWriter  # PyPDF2 1.x/2.x API (removed in PyPDF2 3.0)
import os
import urllib.request
import urllib.error
import urllib.parse
import json
import time
import base64
from os.path import abspath
from win32com import client

url_request = "https://generalpdf.market.alicloudapi.com/ocrservice/pdf"  # API endpoint
AppCode = ""  # AppCode (requires a real-name purchase; 500 free calls)
headers = {
    'Authorization': 'APPCODE ' + AppCode,
    'Content-Type': 'application/json; charset=UTF-8'
}

# Split the PDF, because the API recognizes at most 20 pages per call
def split_pdf(read_file, pagelenth, out_dir='.'):  # pagelenth: pages per chunk
    try:
        fp_read_file = open(read_file, 'rb')
        pdf_input = PdfFileReader(fp_read_file)  # parse the PDF to be split
        page_count = pdf_input.getNumPages()     # total number of pages
        print(page_count)
        start_page = 1
        write_file = 0
        while start_page <= page_count:
            end_page = start_page + pagelenth - 1
            if end_page > page_count:
                end_page = page_count  # clamp to the last page
            write_file += 1
            pdf_file = os.path.join(out_dir, f'{write_file}.pdf')
            try:
                print(f'Splitting pages {start_page}-{end_page} into {pdf_file}...')
                pdf_output = PdfFileWriter()  # writer for this chunk
                for i in range(start_page - 1, end_page):
                    pdf_output.addPage(pdf_input.getPage(i))
                with open(pdf_file, 'wb') as sub_fp:
                    pdf_output.write(sub_fp)
                print(f'Finished pages {start_page}-{end_page}, saved as {pdf_file}!')
            except IndexError:
                print('Requested pages exceed the number of pages in the PDF')
            start_page = end_page + 1
    except Exception as e:
        print(e)
    finally:
        fp_read_file.close()

# Send a request to the Aliyun 读光 PDF recognition service
def posturl(url, data={}):
    try:
        params = json.dumps(data).encode(encoding='UTF8')  # request body, per the official format
        req = urllib.request.Request(url, params, headers)
        r = urllib.request.urlopen(req)
        html = r.read()
        r.close()
        return html.decode("utf8")  # JSON string
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.read().decode("utf8"))
        time.sleep(1)

# Merge the per-chunk Word documents into one
def docx_merge(files, final_docx):
    word = client.gencache.EnsureDispatch('Word.Application')
    word.Visible = True
    new_document = word.Documents.Add()  # the merged document
    for fn in files:
        fn = abspath(fn)
        temp_document = word.Documents.Open(fn)
        word.Selection.WholeStory()
        word.Selection.Copy()  # copy the whole chunk to the clipboard
        temp_document.Close()
        new_document.Range()
        word.Selection.Delete()
        word.Selection.Paste()  # paste the chunk into the merged document
    new_document.SaveAs(final_docx)
    new_document.Close()
    word.Quit()
# The merge may raise com_error: (-2147417848, 'The object invoked has disconnected from its clients.', None, None);
# opening the Word client manually and running again fixes it (see the retry sketch after the listing).

if __name__ == "__main__":
    # parameters
    path = "../Untitled Folder/"  # working folder that holds only the split PDFs
    final_docx = r'C:/Users/XXX/Desktop/result.docx'  # final merged document
    pdf_source = 'C:/Users/XXX/Desktop/XXX.pdf'       # source PDF
    split_pdf(pdf_source, pagelenth=20, out_dir=path)  # 20 pages per chunk
    files = os.listdir(path)
    # print(files)
    filenames = [file for file in files if file[-3:] == 'pdf']
    filenames.sort(key=lambda x: int(x[:-4]))  # keep the chunks in page order (1.pdf, 2.pdf, ...)
    print(filenames)  # names of the split PDFs
    dicts = []
    for body in filenames:
        with open(os.path.join(path, body), 'rb') as f:  # read each split PDF
            data = f.read()
        encodestr = str(base64.b64encode(data), 'utf-8')  # base64-encode, then convert to str
        dicts.append({'fileBase64': encodestr, "table": True, 'fileType': 'word'})
    docx_list = []
    for idx, d in enumerate(dicts):
        html = posturl(url_request, data=d)
        a = json.loads(html)
        result = a['fileBase64']  # the returned Word content (base64-encoded)
        ans = base64.b64decode(result)  # decode it
        tmp = f"{idx}.docx"  # per-chunk document
        docx_list.append(tmp)
        with open(tmp, 'wb') as f:
            f.write(ans)
    print(docx_list)  # names of the generated docx files
    docx_merge(docx_list, final_docx)
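
For the COM disconnect error mentioned in the comment above, besides opening the Word client manually, one possible workaround (just a sketch of my own, not part of the original post) is to retry the merge a few times, re-dispatching Word on each attempt:

# Sketch: retry docx_merge when Word's COM server has disconnected.
# pywintypes.com_error is the exception shown in the comment above (pywin32).
import time
import pywintypes

def docx_merge_with_retry(files, final_docx, attempts=3):
    for attempt in range(attempts):
        try:
            docx_merge(files, final_docx)  # re-dispatches Word on every call
            return
        except pywintypes.com_error as e:
            print(f'COM error on attempt {attempt + 1}: {e}')
            time.sleep(3)  # give Word a moment before trying again
    raise RuntimeError('docx_merge still failing; open the Word client manually and re-run')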
