利用 pdfminer3k 使用 python 语言提取 PDF 中的文本
毕业设计需要用到自然语言处理,需要将 PDF 转化为文本以便提取信息。
首先安装 pdfminer3k(在 Python3 下进行安装;python2.7 请改用 pdfminer),使用 pip 安装:
pip install pdfminer3k
如果同时安装了 python3.6 和 python 2.7,使用
pip3 install pdfminer3k
或者
py -3 -m pip install pdfminer3k
提取本地 PDF 中的文字:
# encoding: utf-8
"""
@file: prase_pdf.py
@time: 2018/4/19
"""
import sys
import importlib
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
# Parse the text of a PDF and save it to a .txt file.
path = "C:\\Users\\admin\\Desktop\\t.pdf"
# Text file the extracted content is appended to. The original output file
# name was lost from the source, so the file is placed next to the PDF.
txt_path = "C:\\Users\\admin\\Desktop\\t.txt"


def parse(pdf_path=None, out_path=None):
    """Extract the text of a PDF and append it to a UTF-8 text file.

    Every horizontal text box found on every page is printed to stdout and
    appended, followed by a newline, to the output file.

    Args:
        pdf_path: PDF file to read. Defaults to the module-level ``path``,
            looked up at call time.
        out_path: Text file to append to. Defaults to the module-level
            ``txt_path``, looked up at call time.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not permit text
            extraction.
    """
    if pdf_path is None:
        pdf_path = path
    if out_path is None:
        out_path = txt_path

    # Open in binary mode; the context manager closes the handle even when
    # parsing fails part-way through.
    with open(pdf_path, 'rb') as fp:
        # Build the parser from the file object and tie it to a document.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise without a password (i.e. with the empty default).
        doc.initialize()

        # Stop early if the document forbids text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # The resource manager holds resources shared between pages.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box; an explicit
        # encoding is required so non-ASCII text is written correctly.
        with open(out_path, 'a', encoding='utf-8') as f:
            # Process one page at a time.
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The aggregated LTPage holds the layout objects parsed from
                # this page (LTTextBox, LTFigure, LTImage, ...); only the
                # horizontal text boxes carry the text wanted here.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
if __name__ == '__main__':
    # Run the extraction only when executed as a script, not on import.
    parse()
打印结果(桌面【或者你存储的位置】上会有一个 .txt 文档):
如果要直接处理在线的文档,可以这样做:
在以上代码的 import 部分中加入:
import random
from urllib.request import urlopen
from urllib.request import Request
声明全局的用户代理:
# Pool of browser User-Agent strings; one is picked at random per request.
# NOTE(review): two entries were truncated in the source this list was
# recovered from. Their tails (marked below) are reconstructed and should be
# checked against a known-good User-Agent list. The truncated MSIE 10 entry
# appeared twice with its distinguishing tail lost, so it is kept once.
user_agent = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
    # Reconstructed tail: everything after "Media Center PC".
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
    # Reconstructed tail: everything after ".NET CLR 3.0.3".
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729)',
    # The comma after this entry was missing, which would have silently
    # concatenated it with the following string.
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
]
# Fragment meant to replace the local `open(path, 'rb')` call when the PDF is online.
# NOTE(review): `_path` is not defined in this excerpt — presumably the URL of
# the online PDF; `random` is not imported in the visible code either. Confirm both.
request = Request(url=_path, headers={'User-Agent': random.choice(user_agent)})  # pick one element of user_agent at random
fp = urlopen(request) # open the online PDF document
主函数:添加 PDF 的链接

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。