python读取pdf转文字和提取目录--688IT编程网

python读取pdf转文字和提取目录代码

import importlib
import os.path
import sys
import time
from io import StringIO

import pyocr

# Start timestamp; taken before the PDF library imports below, so their load
# time is included in the total printed at the end of the script.
time1 = time.time()
# print("初始时间为：",time1)

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from PyPDF2 import PdfFileReader as pdf_read

# Module-level accumulator: bookmark titles, one per line, in traversal order.
directory_str = ''


def bookmark_listhandler(list):
    """Append every bookmark title in a nested outline to ``directory_str``.

    Args:
        list: A possibly nested sequence describing a PDF outline. Each item
            is either a dict-like bookmark with a ``'/Title'`` key or a
            sub-sequence of child bookmarks. The parameter name shadows the
            builtin; it is kept so existing keyword callers keep working.

    Side effects:
        Extends the module-level ``directory_str`` with one title per line,
        depth-first. The accumulator is never cleared here, so callers that
        process several documents must reset it between them.
    """
    global directory_str
    for message in list:
        if isinstance(message, dict):
            directory_str += message['/Title'] + '\n'
        else:
            # Anything that is not a bookmark is treated as a nested list of
            # child bookmarks and walked recursively.
            bookmark_listhandler(message)

# text_path = r'photo-words.pdf'

def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Each returned name is also printed, one per line.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        A list of bare file names (no directory part). Files in
        subdirectories are not included, because callers join the names to
        ``file_dir`` itself. An empty list is returned when the directory
        does not exist or cannot be read.
    """
    # os.walk is top-down by default, so its first triple describes file_dir
    # itself. Taking only that triple keeps names from nested directories out
    # of the result; the default covers a walk that yields nothing.
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    for file in files:
        print(file)
    return files

def _parse_toc(doc):
    """Collect the table of contents (toc) of an open PDF document.

    Args:
        doc: An open ``PDFDocument`` (any object exposing ``get_outlines()``
            that yields 5-item outline entries whose first two items are the
            nesting level and the title).

    Returns:
        A list of ``(level, title)`` tuples in the order the outlines are
        yielded. The list is empty when the document has no outlines.

    Side effects:
        Prints each ``level`` and ``title`` as it is collected.
    """
    toc = []
    try:
        # Iterate inside the try block: the lookup may only raise once the
        # outlines are actually consumed.
        for level, title, _dest, _action, _se in doc.get_outlines():
            print(level, title)
            toc.append((level, title))
    except PDFNoOutlines:
        # A document without outlines is not an error here; return what was
        # collected (normally an empty list).
        pass
    return toc

def parse(pathtxt, text_path):
    """Extract the text of a PDF and append it to a UTF-8 text file.

    Args:
        pathtxt: Path of the text file to append to. It is opened once in
            append mode, so it is created even if the PDF yields no text.
        text_path: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.

    Side effects:
        Prints ``text_path`` and writes the text of every horizontal text box
        of every page to ``pathtxt``, each followed by a newline.
    """
    print(text_path)
    # The with block guarantees the PDF handle is released even when parsing
    # or extraction fails part-way through.
    with open(text_path, 'rb') as fp:
        # Build a parser on the file object and a document on the parser,
        # then connect the two.
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: ' + str(text_path))

        # Resource manager shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The aggregator device collects the layout of each processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(pathtxt, 'a', encoding='utf-8') as f:
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # Layout of the page just processed; it contains objects such
                # as LTTextBox, LTFigure, LTImage and LTTextBoxHorizontal.
                layout = device.get_result()
                for x in layout:
                    # Only horizontal text boxes are written out.
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text() + "\n")

if __name__ == '__main__':
    # Script entry point: convert every file found in `path` to text, then
    # dump each file's bookmark titles, and finally report the elapsed time.
    path = 'C:\\Users\\chenqi\\Desktop\\test\\QA-CivilAviationKG-master\\raw'
    files = file_name(path)

    # Pass 1: write the text of each PDF to "<stem>text.txt" in the current
    # working directory.
    for file in files:
        # splitext keeps dots inside the base name ("a.b.pdf" -> "a.b"), so
        # differently named inputs cannot collide on the same output file.
        stem = os.path.splitext(file)[0]
        print(stem + ".pdf")
        pathtxt = stem + 'text' + '.txt'
        print(pathtxt)
        parse(pathtxt, os.path.join(path, file))

    # Pass 2: write each PDF's bookmark titles to "<stem>title.txt".
    for i, file in enumerate(files):
        print(i, file)
        # Reset the shared accumulator so this file's titles are not mixed
        # with those collected from earlier files.
        directory_str = ''
        with open(os.path.join(path, file), 'rb') as f:
            pdf = pdf_read(f)
            # The document outline is returned as a nested list of bookmarks.
            text_outline_list = pdf.getOutlines()
            bookmark_listhandler(text_outline_list)
        stem = os.path.splitext(file)[0]
        with open(stem + 'title' + '.txt', 'w', encoding='utf-8') as f:
            f.write(directory_str)

    time2 = time.time()
    print("总共消耗时间为:", time2 - time1)

效果

688IT编程网

python读取pdf转文字和提取目录

发表评论

推荐文章

应用程序的安全检测方法、装置、电子设备和存储介质

nginx map用法正则

VBA之正则表达式(1)--基础篇

Prometheus监控学习笔记之初识PromQL

关于PHP中的webshell

热门文章

m函数数字提取

jest断言方法大全

中兴ZXSEC US 管理员手册

keras系列(一):参数设置

Qt从QString中提取出数字

element input 金额千分位格式化

freemaker 参数解析正则

C#正则验证数字

form表单验证正则

scanf正则表达式用法

grafana value的正则表达式

Android平台浮点数运算应用

js-(JS正则表达式验证数字)

判断Python输入是否是整数,字符,或浮点数

c语言 sscanf 正则规则

从文本中提取数值技巧

js将整数转换成两位浮点数的方法

vue正则限制浮点数

8到20的结尾的正则

shell 正则表达式最后一行

最新文章

应用程序的安全检测方法、装置、电子设备和存储介质

VBA之正则表达式(1)--基础篇

代码编辑的辅助方法、装置及电子设备

SHELL查字符串中包含字符的命令

String方法中replace和replaceAll的区别详解(源码分析)

双字节符号正则

标签列表

688IT编程网

python读取pdf转文字和提取目录

发表评论

推荐文章

应用程序的安全检测方法、装置、电子设备和存储介质

nginx map用法 正则

VBA之正则表达式(1)--基础篇

Prometheus监控学习笔记之初识PromQL

关于PHP中的webshell

热门文章

m函数数字提取

jest断言方法大全

中兴ZXSEC US 管理员手册

keras系列(一):参数设置

Qt从QString中提取出数字

element input 金额千分位格式化

freemaker 参数解析正则

C#正则验证数字

form表单验证正则

scanf正则表达式用法

grafana value的正则表达式

Android平台浮点数运算应用

js-(JS正则表达式验证数字)

判断Python输入是否是整数,字符,或浮点数

c语言 sscanf 正则规则

从文本中提取数值技巧

js将整数转换成两位浮点数的方法

vue正则限制浮点数

8到20的结尾的正则

shell 正则表达式 最后一行

最新文章

应用程序的安全检测方法、装置、电子设备和存储介质

VBA之正则表达式(1)--基础篇

代码编辑的辅助方法、装置及电子设备

SHELL查字符串中包含字符的命令

String方法中replace和replaceAll的区别详解(源码分析)

双字节符号正则

标签列表

nginx map用法正则

shell 正则表达式最后一行