python识别文字颜色_python读取word文档识别字段颜色,解析字段
python版本3.7.3,读取的文档格式为.docx
文中带有简单注释
看不懂的可以从百度网盘下载后直接查看,更改并运行里面的py文件
提取码:nngw
import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor #这个是docx的颜⾊类
# Length (in characters) of the longest question paragraph seen so far;
# updated by readDocx.
maxLength = 0
# Id assigned to the next completed question record; incremented by readDocx
# and reset to 1 by writeDocx. NOTE: this name shadows the builtin `id`.
id = 1
# Question records accumulated by readDocx and serialized by writeDocx.
convert_list = []
# Reset by writeDocx; nothing in the visible code appends to it.
type_list = []
# Directory containing this script; input .docx paths and the output folder
# are resolved relative to it.
curPath = os.path.dirname(os.path.abspath(__file__))
# NOTE: Python only honors an encoding declaration on the first or second line
# of a file (PEP 263), so at this position the line below is a plain comment.
# coding=utf-8
#获取⽂档对象
def _strip_answer_marker(text):
    """Cut *text* at each "correct answer" marker that it contains."""
    for marker in ("(Direito)", "(Correcta)", "(Right)", "(Correct)"):
        marker_at = text.find(marker)
        if marker_at != -1:
            text = text[0:marker_at]
    return text


def readDocx(fileName,type):
    """Parse one .docx question file and append its questions to convert_list.

    The document is read paragraph by paragraph. The first paragraph and any
    blank paragraph are skipped. The remaining paragraphs are consumed in
    groups of five: one question line followed by four option lines. An
    option that contains a run coloured 00FF00 is recorded as the right
    answer.

    Each finished group is appended to the module-level ``convert_list`` as a
    dict with the keys ``question``, ``subType``, ``option1`` .. ``option4``,
    ``rightIndex`` (only when a green run was found) and ``_id``. The
    module-level ``id`` counter and ``maxLength`` are updated as a side
    effect. A trailing incomplete group is discarded.

    Args:
        fileName: Path of the document relative to ``curPath``, without the
            ``.docx`` extension.
        type: Category label stored under ``subType`` in every record.

    Raises:
        ValueError: If a question line contains none of "-", "." or " ".
    """
    global id
    global maxLength

    # The path is built by plain concatenation with a backslash, as in the
    # original script; this assumes a Windows-style layout.
    xlsFile = curPath + '\\'+fileName+'.docx'
    print("xlsFile: "+xlsFile)
    document = docx.Document(xlsFile)

    index = 0  # position inside the current group: 0 = question, 1-4 = options
    data = {}
    for line_no, p in enumerate(document.paragraphs, start=1):
        if line_no <= 1:  # skip the first line of the document
            continue
        text = p.text
        # NOTE(review): the published condition was garbled; it is restored
        # here as "skip empty or whitespace-only paragraphs" - confirm.
        if text == "" or not text.strip():
            continue

        if index == 0:  # question line
            # Drop a leading "Número" label (found at offset 0 or 1) together
            # with the single character that follows it.
            label_at = text.find("Número")
            if label_at != -1 and label_at < 2:
                text = text[label_at + len("Número") + 1:]

            # The question number is separated from the question text by the
            # first of "-", "." or " " that occurs in the line.
            if "-" in text:
                separator = "-"
            elif "." in text:
                separator = "."
            else:
                separator = " "
            start = text.index(separator) + len(separator)

            length = len(text)
            if length > maxLength:
                maxLength = length

            # Anything after "|" is an optional sub-type tag and is ignored;
            # the caller-supplied category is stored instead.
            data["question"] = text[start:length].split("|")[0]
            data["subType"] = type
        else:  # option line
            # str() of a python-docx RGBColor is its upper-case hex value;
            # runs without an explicit colour yield "None" and never match.
            for run in p.runs:
                if str(run.font.color.rgb) == "00FF00":
                    data["rightIndex"] = index
            data["option"+str(index)] = _strip_answer_marker(text)

        index = index + 1
        if index >= 5:  # question + four options collected: flush the record
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}
def writeDocx(fileList,name):
    """Parse every document in *fileList* and dump the questions as JSON.

    The module-level accumulators (``id``, ``convert_list``, ``type_list``)
    are reset first, so each call produces an independent question bank.
    The result is written to ``<curPath>/topic/<name>.txt`` as UTF-8 JSON,
    indented by four spaces with sorted keys.

    Args:
        fileList: Iterable of dicts with a ``"path"`` entry (document path
            without extension, as expected by readDocx) and a ``"type"``
            entry (category label).
        name: Base name of the output file, without extension.
    """
    global id
    global convert_list
    global type_list

    # Start from a clean state so earlier calls do not leak into this bank.
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    jsonPath = os.path.join(curPath, "topic", name+".txt")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)
    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))
def main():
en_fileList = [{"path":"en_us_topic\\地理(英)Respueda G .es.en", "type":"World"}, {"path":"en_us_topic\\科学与技术(英)", "type":"Technology"},
{"path":"en_us_topic\\历史(英)Resupeda ", "type":"History"},
{"path":"en_us_topic\\艺术和⽂学(英)Respueda A&", "type":"ArtAndLiterature"}, {"path":"en_us_topic\\娱乐(英)Respueda ", "type":"Fashion"},
{"path":"en_us_topic\\运动(英)Respueda ", "type":"Sports"}]
en_name = "en_us_topic"
es_fileList = [{"path":"es_es_topic\\地理(西)Respueda G ", "type":"World"}, {"path":"es_es_topic\\科学与技术(西)Respueda C&T", "type":"Technology"}, {"path":"es_es_topic\\历史(西)Resupeda H", "type":"History"},
{"path":"es_es_topic\\艺术和⽂学(西)Respueda A&L", "type":"ArtAndLiterature"}, {"path":"es_es_topic\\娱乐(西)Respueda E", "type":"Fashion"},
{"path":"es_es_topic\\运动(西)Respueda D", "type":"Sports"}]
es_name = "es_es_topic"
pt_fileList = [{"path":"pt_br_topic\\地理(葡)Respueda G .es.pt", "type":"World"},

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。