GitHub 又一 OCR 神器面世!让你快速告别「复制 粘贴」!

对很多人来说,将 PDF 转换为可编辑的文本是个刚需,却苦于没有简单方法。在本文介绍的项目中,来自 K1 Digital 的高级机器学习工程师 Lucas Soares,尝试使用 OCR(光学字符识别)自动转录 PDF 幻灯片,转录效果还不错。

整个流程分为三步:
1. 将 PDF 转换为图片;
2. 检测和识别图像中的文本;
3. 展示示例输出。

from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
# Step 1: rasterize every page of the PDF and save each page as a PNG.
# Replace this placeholder with the path to a real PDF file.
pdf_path = 'path/to/file/intro_RL_Lecture1.pdf'
images = convert_from_path(pdf_path)
# Pages are written to the current working directory as image0.png,
# image1.png, ... in the order returned by convert_from_path.
for i, image in enumerate(images):
    fname = 'image' + str(i) + '.png'
    image.save(fname, 'PNG')

# adapted from this source: https://github.com/courao/ocr.pytorch
# The two lines below are IPython/Jupyter magics from the original notebook;
# they are not valid in a plain .py file, so uncomment them only in a notebook.
# %load_ext autoreload
# %autoreload 2
import os
import pathlib
import shutil
import time
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pytesseract
import seaborn as sns
from PIL import Image

from ocr import ocr

sns.set()
def single_pic_proc(image_file):
    """Run text detection and recognition on a single image file.

    Args:
        image_file: Path of the image to process; it is opened with PIL
            and converted to RGB before being handed to ``ocr``.

    Returns:
        The ``(result, image_framed)`` pair returned by ``ocr``, unchanged.
        NOTE(review): the script that calls this function treats ``result``
        as a mapping whose values hold the recognized text at index 1, and
        ``image_framed`` as an array accepted by ``Image.fromarray`` --
        confirm against the ``ocr`` module.
    """
    image = np.array(Image.open(image_file).convert('RGB'))
    result, image_framed = ocr(image)
    return result, image_framed
# Step 2: run OCR on every input image; for each one, save the annotated
# image and a .txt file with the recognized text into result_dir.
image_files = glob('./input_images/*.*')
result_dir = './output_images_with_boxes/'

# If the output folder exists we will remove it and redo it.
if os.path.exists(result_dir):
    shutil.rmtree(result_dir)
os.mkdir(result_dir)

for image_file in sorted(image_files):
    # detecting and recognizing the text
    result, image_framed = single_pic_proc(image_file)

    # pathlib handles any path separator and keeps dotted stems intact
    # (e.g. "slide.v2.png" -> "slide.v2.txt").
    source = pathlib.Path(image_file)
    output_file = os.path.join(result_dir, source.name)
    txt_file = os.path.join(result_dir, source.stem + '.txt')

    Image.fromarray(image_framed).save(output_file)

    # One recognized text entry per line; `with` closes the file even if a
    # write fails.
    with open(txt_file, 'w') as txt_f:
        for key in result:
            txt_f.write(result[key][1] + '\n')
import cv2 as cv
# Step 3: display one of the annotated output images in an OpenCV window.
output_dir = pathlib.Path('./output_images_with_boxes')
# Alternative: pick a random output file instead of a fixed one.
# image = cv.imread(str(np.random.choice(list(output_dir.iterdir()),1)[0]))
image_path = f'{output_dir}/image7.png'
image = cv.imread(image_path)
# cv.imread does not raise on a missing/unreadable file; it returns None,
# so fail here with a clear message instead of an AttributeError below.
if image is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')
# (width, height) equal to the current size, so this resize keeps the
# dimensions unchanged; scale these values to shrink or enlarge the preview.
size_reshaped = (int(image.shape[1]), int(image.shape[0]))
image = cv.resize(image, size_reshaped)
cv.imshow('image', image)
# Block until a key is pressed, then close the window.
cv.waitKey(0)
cv.destroyAllWindows()

# Print the recognized text saved for the same image, one line at a time.
filename = f'{output_dir}/image7.txt'
with open(filename, 'r') as text:
    for line in text:
        print(line.strip('\n'))
赞 (0)