python读取pdf转文字和提取目录代码
import pyocr
import importlib
import sys
import time
from io import StringIO
# Start the wall-clock timer before the PDF libraries are imported so the
# total reported at the end of the script includes their import time.
time1 = time.time()
# print("初始时间为:",time1)
import os.path
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument,PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from PyPDF2 import PdfFileReader as pdf_read
# Accumulated bookmark titles, one per line; bookmark_listhandler() appends to it.
directory_str = ''


def bookmark_listhandler(list):
    """Append every bookmark title in a nested outline to ``directory_str``.

    The outline is walked depth-first, in order. Each ``dict`` entry
    contributes its ``'/Title'`` value followed by a newline; any other
    entry is treated as a nested outline and walked recursively.

    Args:
        list: Iterable of outline entries (dicts or nested iterables of
            them). The name shadows the builtin but is kept so existing
            keyword callers keep working.

    Raises:
        KeyError: If a dict entry has no ``'/Title'`` key.
    """
    global directory_str
    for message in list:
        if isinstance(message, dict):
            directory_str += message['/Title'] + '\n'
        else:
            # Not a dict: a nested group of child bookmarks.
            bookmark_listhandler(message)
# text_path = r'photo-words.pdf'
def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Each returned name is also printed, one per line.

    Args:
        file_dir: Directory to list.

    Returns:
        A list of bare file names (no directory part). The list is empty
        when ``file_dir`` does not exist or cannot be listed.
    """
    # os.walk yields the top directory first. Only that entry is used:
    # callers join the returned names onto file_dir itself, so names taken
    # from sub-directories would point at files that do not exist there.
    top_level = next(os.walk(file_dir), None)
    if top_level is None:
        return []
    files = top_level[2]
    for file in files:
        print(file)
    return files
def _parse_toc(doc):
    """Collect the table of contents of an open PDF document.

    Each outline entry is printed as it is read.

    Args:
        doc: An object exposing ``get_outlines()`` that yields
            ``(level, title, dest, a, se)`` tuples (presumably a pdfminer
            ``PDFDocument`` - confirm against the caller).

    Returns:
        A list of ``(level, title)`` tuples in outline order. The list is
        empty when the document raises ``PDFNoOutlines``.
    """
    toc = []
    try:
        outlines = doc.get_outlines()
        for (level, title, dest, a, se) in outlines:
            print(level, title)
            toc.append((level, title))
    except PDFNoOutlines:
        # A document without bookmarks simply has an empty table of contents.
        pass
    return toc
def parse(pathtxt, text_path):
    """Extract the text of a PDF and append it to a UTF-8 text file.

    The PDF path is printed first. The text of every horizontal text box
    on every page is then written to ``pathtxt``, each followed by a
    newline.

    Args:
        pathtxt: Path of the output text file. It is opened in append
            mode, so existing content is kept.
        text_path: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not permit text
            extraction. The output file is not touched in that case.
    """
    print(text_path)
    # The context manager closes the PDF even when parsing fails part-way.
    with open(text_path, 'rb') as fp:
        # Build a parser on the file object and a document on the parser.
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Link the parser back to the document object.
        parser.set_document(doc)
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % (text_path,))
        # The resource manager shares resources between pages.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The aggregator device collects the layout of each processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Open the output once per document instead of once per text box.
        with open(pathtxt, 'a', encoding='utf-8') as f:
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # The page layout holds the objects parsed from the page;
                # only horizontal text boxes are written out.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text() + "\n")
if __name__ == '__main__':
    # Directory holding the PDFs to process.
    path = 'C:\\Users\\chenqi\\Desktop\\test\\QA-CivilAviationKG-master\\raw'
    files = file_name(path)

    # Pass 1: write the text of each PDF to "<stem>text.txt" in the
    # current working directory.
    for file in files:
        # splitext keeps dots inside the stem, so "a.b.pdf" and "a.c.pdf"
        # no longer map to the same output file.
        stem = os.path.splitext(file)[0]
        print(stem + ".pdf")
        pathtxt = stem + 'text' + '.txt'
        print(pathtxt)
        parse(pathtxt, os.path.join(path, file))

    # Pass 2: write the bookmark titles of each PDF to "<stem>title.txt".
    for i, file in enumerate(files):
        print(i, file)
        # Start from an empty buffer so this file's titles are not mixed
        # with those collected from the previous PDFs.
        directory_str = ''
        with open(os.path.join(path, file), 'rb') as f:
            pdf = pdf_read(f)
            # The document outline is returned as a nested list.
            text_outline_list = pdf.getOutlines()
            bookmark_listhandler(text_outline_list)
        stem = os.path.splitext(file)[0]
        with open(stem + 'title' + '.txt', 'w', encoding='utf-8') as f:
            f.write(directory_str)

    time2 = time.time()
    print("总共消耗时间为:", time2 - time1)
效果

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。