[python]利用Python生成xml文件--688IT编程网

[python]利用Python生成xml文件

写在前面：

github上的代码是在场景文字识别上面的扩展，基本只是把数据集和分类类别进行了改变。在样例中，将原有的pascal_voc数据集换成了coco_text。而如果想要在代码中不进行大的改动，最好的方式就是统一两个数据集的格式。

在github中，提供了format data的方式：

# format the raw image and label into the type of pascal_voc

# follow the code in $Text-Detection-with-FRCN/datasets/script/format_annotation.py

cd $Text-Detection-with-FRCN/datasets/script

./format_annotation.py --dataset coco-text

我们来看一下format_annotation.py的倒数第二行： os.system('./ann2voc2007.sh'+ args.dataset)

'./ann2voc2007.sh'的内容为：matlab -nodisplay -nodesktop -r "ann2voc2007('$1'); quit"

也就是用MATLAB来运行。下载MATLAB要花费的时间很长，如果不想安装，怎么办呢？可以利用Python来对MATLAB文件进行改写。

MATLAB脚本的内容是生成xml文件。那么相应的，也可以利用Python生成xml文件。

原MATLAB代码：

function ann2voc2007(input_dir)

curpath = mfilename('fullpath');

[pathstr,~,~] = fileparts(curpath)

if input_dir(end) == '/'

input_dir = input_dir(1:end-1);

end

[~,input_dir,~] = fileparts(input_dir);

input_dir = [pathstr '/../' input_dir '/formatted_dataset']

imgpath = [input_dir '/JPEGImages/']

txtpath = [input_dir '/images.annotations']

xmlpath_new = [input_dir '/Annotations/'];

foldername = 'VOC2007';

coco = containers.Map();

fidin = fopen(txtpath, 'r');

cnt = 0;

while ~feof(fidin)

tline = fgetl(fidin);

str = regexp(tline, ' ', 'split');

xmlname = strrep(str{1},'.jpg','.xml');

info = imfinfo([imgpath '/' str{1}]);

str{3} = max(str2double(str{3}), 1);

str{4} = max(str2double(str{4}), 1);

str{5} = min(str2double(str{5}), info.Width);

str{6} = min(str2double(str{6}), info.Height);

if str{3} >= str{5} || str{4} >= str{6} || str{3} <= 0 || str{4} <= 0 || str{5} >

str{6} > info.Height

continue;

end

cnt = cnt + 1

if exist([imgpath '/' str{1}])

if isKey(coco,xmlname)

Createnode = coco(xmlname);

object_node = ateElement('object');

Root = DocumentElement;

Root.appendChild(object_node);

ateElement('name');

node.ateTextNode(str{2}));

ateElement('pose');

node.ateTextNode('Unspecified'));

object_node.appendChild(node);

ateElement('truncated');

node.ateTextNode('0'));

object_node.appendChild(node);

ateElement('difficult');

node.ateTextNode('0'));

object_node.appendChild(node);

bndbox_ateElement('bndbox');

object_node.appendChild(bndbox_node);

ateElement('xmin');

node.ateTextNode(num2str(str{3})));

bndbox_node.appendChild(node);

ateElement('ymin');

node.ateTextNode(num2str(str{4})));

bndbox_node.appendChild(node);

ateElement('xmax');

node.ateTextNode(num2str(str{5})));

bndbox_node.appendChild(node);

ateElement('ymax');

node.ateTextNode(num2str(str{6})));

bndbox_node.appendChild(node);

else

Createnode = ateDocument('annotation'); Root = DocumentElement;

node = ateElement('folder');

node.ateTextNode(foldername));

Root.appendChild(node);

node = ateElement('filename');

node.ateTextNode(str{1}));

Root.appendChild(node);

source_node = ateElement('source');

Root.appendChild(source_node);

node = ateElement('database');

node.ateTextNode('MS COCO-Text'));

source_node.appendChild(node);

node = ateElement('annotation');

node.ateTextNode('MS COCO-Text 2014'));

source_node.appendChild(node);

ateElement('image');

node.ateTextNode('NULL'));

source_node.appendChild(node);

ateElement('flickrid');

node.ateTextNode('NULL'));

source_node.appendChild(node);

owner_ateElement('owner');

Root.appendChild(owner_node);

ateElement('flickrid');

node.ateTextNode('NULL'));

owner_node.appendChild(node);

ateElement('name');

node.ateTextNode('ligen'));

owner_node.appendChild(node);

size_ateElement('size');

Root.appendChild(size_node);

ateElement('width');

node.ateTextNode(num2str(info.Width)));

size_node.appendChild(node);

ateElement('height');

node.ateTextNode(num2str(info.Height)));

size_node.appendChild(node);

ateElement('depth');

node.ateTextNode(num2str(info.BitDepth / 8)));

ateElement('segmented');

node.ateTextNode('0'));

Root.appendChild(node);

object_ateElement('object');

Root.appendChild(object_node);

ateElement('name');

node.ateTextNode(str{2}));

object_node.appendChild(node);

ateElement('pose');

node.ateTextNode('Unspecified')); object_node.appendChild(node);

ateElement('truncated');

node.ateTextNode('0'));

object_node.appendChild(node);

ateElement('difficult');

node.ateTextNode('0'));

object_node.appendChild(node);

bndbox_ateElement('bndbox');

object_node.appendChild(bndbox_node);

ateElement('xmin');

node.ateTextNode(num2str(str{3}))); bndbox_node.appendChild(node);

ateElement('ymin');

node.ateTextNode(num2str(str{4}))); bndbox_node.appendChild(node);

ateElement('xmax');

node.ateTextNode(num2str(str{5}))); bndbox_node.appendChild(node);

ateElement('ymax');

node.ateTextNode(num2str(str{6}))); bndbox_node.appendChild(node);

coco(xmlname) = Createnode;

end

fclose(fidin);

keyss = keys(coco);

for i = 1:length(keyss)

xmlwrite([xmlpath_new '/' keyss{i}], coco(keyss{i}));

end

改写后的python代码：

#coding:utf-8

from PIL import Image

from xml.dom.minidom import Document

import os

def main():

imgpath = 'JPEGImages/'

txtpath = 'images.annotations'

xmlpath_new = 'Annotations/'

coco = {}

#　得到图像的标注信息

file_object = open(txtpath,'rU')

try:

for line in file_object:

strs = line.split(' ')

print strs[0]

foldername = 'VOC2007'

#　⽤xml替换jpg，得到同名⽂件

xmlname = strs[0].replace('.jpg','.xml')

info = Image.open(imgpath + strs[0])

# read image size

(width,height) = info.size

strs[2] = max(float(strs[2]), 1)

strs[3] = max(float(strs[3]), 1)

strs[4] = min(float(strs[4]), width);

strs[5] = min(float(strs[5]), height);

# 过滤异常

if strs[2] >= strs[4] or strs[3] >= strs[5] or strs[2] <=0 or strs[3] <= 0 or strs[4] > width or strs[5] > height: continue

if ists(imgpath + strs[0]):

if xmlname in coco:

Createnode = coco[xmlname]

object_node = ateElement('object')

Root = ElementsByTagName('annotation')[0]

Root.appendChild(object_node)

ateElement('name')

node.ateTextNode(strs[1]))

object_node.appendChild(node)

ateElement('pose')

node.ateTextNode('Unspecified'))

object_node.appendChild(node)

ateElement('truncated')

node.ateTextNode('0'))

object_node.appendChild(node)

ateElement('difficult')

node.ateTextNode('0'))

object_node.appendChild(node)

bndbox_ateElement('bndbox')

object_node.appendChild(bndbox_node)

ateElement('xmin')

node.ateTextNode(str(strs[2])))

bndbox_node.appendChild(node)

ateElement('ymin')

node.ateTextNode(str(strs[3])))

bndbox_node.appendChild(node)

ateElement('xmax')

node.ateTextNode(str(strs[4])))

bndbox_node.appendChild(node)

ateElement('ymax')

node.ateTextNode(str(strs[5])))

bndbox_node.appendChild(node)

else:

Createnode=Document() #创建DOM⽂档对象

ateElement('annotation') #创建根元素

# folder

ateElement('folder')

folder.ateTextNode(foldername))

Root.appendChild(folder)

# filename

filename = ateElement('filename')

filename.ateTextNode(strs[0]))

Root.appendChild(filename)

# source

source_node = ateElement('source')

Root.appendChild(source_node)

node = ateElement('database')

node.ateTextNode('MS COCO-Text'))

source_node.appendChild(node)

node = ateElement('annotation')

node.ateTextNode('MS COCO-Text 2014')) source_node.appendChild(node)

ateElement('image')

node.ateTextNode('NULL'))

source_node.appendChild(node)

ateElement('flickrid');

node.ateTextNode('NULL'));

source_node.appendChild(node);

# owner

owner_ateElement('owner')

Root.appendChild(owner_node)

ateElement('flickrid')

node.ateTextNode('NULL'))

owner_node.appendChild(node)

ateElement('name')

node.ateTextNode('ligen'))

owner_node.appendChild(node)

# size

size_ateElement('size')

Root.appendChild(size_node)

ateElement('width')

node.ateTextNode(str(width)))

size_node.appendChild(node)

ateElement('height');

node.ateTextNode(str(height)))

size_node.appendChild(node)

ateElement('depth')

node.ateTextNode('3'))

size_node.appendChild(node)

# segmented

ateElement('segmented')

node.ateTextNode('0'))

Root.appendChild(node)

688IT编程网

[python]利用Python生成xml文件

发表评论

推荐文章

java正则表达式选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符回溯引用和前后查匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式选择题

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

688IT编程网

[python]利用Python生成xml文件

发表评论

推荐文章

java正则表达式 选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符 回溯引用和前后查 匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式 选择题

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

java正则表达式选择题

非零金额正则表达式

基本的元字符回溯引用和前后查匹配模式

java正则表达式选择题

非零金额正则表达式