Python之ElementTree模块的使用
基于事件和基于文档的API来解析XML,可以使用XPath表达式搜索已解析的文件,具有对文档的增删改查的功能,该方式需要注意大xml文件,因为是一次性加载到内存,所以如果是大xml文件,不推荐使用该模块解析,应该使用sax方式。
测试解析的内容
<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank>4</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Panama">
<rank>68</rank>
<year>2011</year>
<gdppc>13600</gdppc>
<neighbor name="Costa Rica" direction="W"/>
<neighbor name="Colombia" direction="E"/>
</country>
</data>
1、解析xml⽂档
from xml.etree import ElementTree

# NOTE(review): the path 'l' looks truncated in this copy of the article;
# point it at the sample XML document shown above.
# parse() accepts an open file object and loads the whole document at once.
with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# Prints the ElementTree wrapper object itself, not the XML content.
print(tree)
ElementTree_parse_xml.py
运⾏结果
#返回ElementTree对象
<xml.etree.ElementTree.ElementTree object at 0x0000020A1F090088>
2、遍历解析XML树,获取节点名字
from xml.etree import ElementTree

with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# iter() with no argument walks every element in document order,
# starting with the root element.
for node in tree.iter():
    print(node.tag)
ElementTree_dump_xml.py
运⾏结果
#打印所有节点名字
data
country
rank
year
gdppc
neighbor
neighbor
country
rank
year
gdppc
neighbor
country
rank
year
gdppc
neighbor
neighbor
3、遍历解析XML树,获取属性值
from xml.etree import ElementTree

with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# Passing a tag name to iter() restricts the walk to matching elements.
for node in tree.iter('neighbor'):
    # attrib.get() returns None instead of raising when an attribute is absent.
    attr_name = node.attrib.get('name')
    attr_direction = node.attrib.get('direction')
    # Print both columns when both attributes are set, otherwise only the name.
    if attr_name and attr_direction:
        print('{:<25}{:<25}'.format(attr_name, attr_direction))
    else:
        # None cannot be formatted with an alignment spec, so fall back to ''.
        print('{:<25}'.format(attr_name or ''))
ElementTree_show_name_direction.py
运⾏结果
Austria E
Switzerland W
Malaysia N
Costa Rica W
Colombia E
4、利⽤XPath在XML⽂档中查节点
from xml.etree import ElementTree

with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# './/neighbor' matches <neighbor> elements at any depth below the root.
for node in tree.findall('.//neighbor'):
    name = node.attrib.get('name')
    if name:
        print(name)
ElementTree_find_feeds_by_tag.py
运⾏结果
Austria
Switzerland
Malaysia
Costa Rica
Colombia
5、利⽤XPath在XML⽂档中查更深⼀层的节点
from xml.etree import ElementTree

with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# Only <neighbor> elements that are direct children of another <neighbor>
# match this path.
for node in tree.findall('.//neighbor/neighbor'):
    name = node.attrib.get('name')
    if name:
        print(name)
ElementTree_find_feeds_by_structure.py
运⾏结果
Malaysia
6、利⽤XPath表达式,查询节点的属性名和值
from xml.etree import ElementTree

with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

# find() returns only the first element matching the path.
node = tree.find('./country')
print('标签名:', node.tag)
# attrib is a dict-like mapping of attribute name to attribute value.
for name, value in node.attrib.items():
    print('属性名:{name},属性值:{value}'.format(name=name, value=value))
# File: ElementTree_node_attributes.py
运⾏结果
标签名: country
属性名:name,属性值:Liechtenstein
7、利⽤XPath表达式,查询多个路径的⽂本即text
from xml.etree import ElementTree

with open('l', 'tr', encoding='utf-8') as rf:
    tree = ElementTree.parse(rf)

for path in ['./country/year', './country/gdppc']:
    node = tree.find(path)
    print('节点名字', node.tag)
    # text is the content between the start tag and the first child element.
    print(node.text)
    # tail is whatever follows the element's end tag (for the sample
    # document this appears to be whitespace only).
    print(node.tail)
ElementTree_node_text.py
运⾏结果
节点名字 year
2008
节点名字 gdppc
141100
8、解析监听标签的事件
from xml.etree.ElementTree import iterparse

# Current nesting depth of the element being reported.
depth = 0
# Total width reserved for the dotted prefix column.
prefix_width = 8
# Run of dots that the visible prefix is cut from.
prefix_dots = '.' * prefix_width
# Row template: the precision field truncates the prefix to prefix_len
# characters, and the suffix pads the row back out to prefix_width.
line_template = ''.join([
    '{prefix:<0.{prefix_len}}',
    '{event:<8}',
    '{suffix:<{suffix_len}} ',
    '{node.tag:<12} ',
    '{node_id}',
])
EVENT_NAMES = ['start', 'end', 'start-ns', 'end-ns']
for (event, node) in iterparse('l', EVENT_NAMES):
    # An 'end' event closes the element, so step back out before printing.
    if event == 'end':
        depth -= 1
    # Two dots per nesting level.
    prefix_len = depth * 2
    print(line_template.format(
        prefix=prefix_dots,  # text the prefix is taken from
        prefix_len=prefix_len,  # number of prefix characters shown
        suffix='',  # the suffix is padding only
        # Remaining width after the prefix; clamped because a negative
        # width is not a valid format specifier.
        suffix_len=max(prefix_width - prefix_len, 0),
        event=event,  # name of the current event
        node_id=id(node),  # identity of the node object
        node=node,  # the element reported by iterparse
    ))
    # A 'start' event opens a new level for everything that follows.
    if event == 'start':
        depth += 1
ElementTree_show_all_events.py
运⾏结果
start data 3102087901736
..start country 3102087901816
....start rank 3102087901896
....end rank 3102087901896
....start year 3102087901976
....end year 3102087901976
....start gdppc 3102087902056
....end gdppc 3102087902056
....start neighbor 3102087902136
....end neighbor 3102087902136
....start neighbor 3102087902216
....end neighbor 3102087902216
..end country 3102087901816
..start country 3102087902296
....start rank 3102087902376
....end rank 3102087902376
....start year 3102087902456
....end year 3102087902456
....start gdppc 3102087902536
....end gdppc 3102087902536
....start neighbor 3102087902616
......start neighbor 3102087902776
......end neighbor 3102087902776
....end neighbor 3102087902616
..end country 3102087902296
..start country 3102087902936
....start rank 3102087903016
....end rank 3102087903016
....start year 3102087903096
....end year 3102087903096
....start gdppc 3102087903176
....end gdppc 3102087903176
....start neighbor 3102087903336
....end neighbor 3102087903336
....start neighbor 3102087903496
....end neighbor 3102087903496
..end country 3102087902936
end data 3102087901736
9、XML转为CSV的文件格式,这里只存到内存中测试,生产中是存到硬盘上
import csv
import csv
import sys
from xml.etree.ElementTree import iterparse

writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)
group_name = ''
# Only 'start' events are requested; attributes are already available then.
parsing = iterparse('l', events=['start'])
for event, node in parsing:
    # Skip the tags that carry neither a group name nor row data.
    if node.tag in ['rank', 'year', 'gdppc']:
        continue
    # An element without a 'name' attribute is treated as a parent (group);
    # an element with one is treated as a child row.
    if not node.attrib.get('name'):
        group_name = node.attrib.get('text')
    else:
        writer.writerow(
            (group_name, node.attrib.get('name'), node.attrib.get('direction'))
        )
ElementTree_write_podcast_csv.py
测试效果
"Liechtenstein","Austria","E"
"Liechtenstein","Switzerland","W"
"Singapore","Malaysia","N"
"Singapore","Malaysia","N"
"Panama","Costa Rica","W"
"Panama","Colombia","E"
10、创建⼀个定制的树的构造器
import csv
import sys
from xml.etree.ElementTree import XMLParser


class PodcastListToCSV(object):
    """XMLParser target that writes one CSV row per named element.

    Elements without a ``name`` attribute are treated as groups: their
    ``text`` attribute is remembered. Elements with a ``name`` attribute
    produce a row of (group name, tag, name, direction).
    """

    def __init__(self, output_file):
        """Create the CSV writer on *output_file* (any writable text file)."""
        self.writer = csv.writer(
            output_file,
            quoting=csv.QUOTE_NONNUMERIC
        )
        # 'text' attribute of the most recently seen group element.
        self.group_name = ''

    def start(self, tag, attrib):
        """Handle an opening tag; called by the parser with its attributes.

        Raises KeyError if a named element has no ``direction`` attribute.
        """
        # These tags carry neither a group name nor row data.
        if tag in ['rank', 'year', 'gdppc']:
            return
        if not attrib.get('name'):
            # Group element: remember its label for the rows that follow.
            self.group_name = attrib.get('text', '')
        else:
            self.writer.writerow(
                (self.group_name,
                 tag,
                 attrib['name'],
                 attrib['direction'])
            )

    def end(self, tag):
        """Ignore closing tags."""
        pass

    def data(self, data):
        """Ignore character data inside elements."""
        pass

    def close(self):
        """Nothing special to do when parsing finishes."""
        pass
target = PodcastListToCSV(sys.stdout)
parser = XMLParser(target=target)
# Feed the file one line at a time; the target's start() callback is
# invoked by the parser as opening tags are encountered.
with open('l', 'rt') as rf:
    for line in rf:
        parser.feed(line)
parser.close()
ElementTree_podcast_csv_treebuilder.py
数据源
<?xml version="1.0"?>
<country text="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
运行效果
"Liechtenstein","neighbor","Austria","E"
"Liechtenstein","neighbor","Switzerland","W"
11、利⽤递归的⽅法,解析XML
from xml.etree.ElementTree import XML


def show_node(node):
    """Recursively print the text, tail and attributes of *node*.

    Text and tail are printed only when they contain something other than
    whitespace. Attributes are printed as ``name=value`` in sorted order,
    followed by the same report for each child element.
    """
    if node.text is not None and node.text.strip():
        print('⽂本内容: %s' % node.text)
    if node.tail is not None and node.tail.strip():
        print('尾部内容: %s' % node.tail)
    for name, value in sorted(node.attrib.items()):
        print('%s=%s' % (name, value))
    for child in node:
        show_node(child)
# XML() parses a string literal and returns the root element directly.
parsed = XML("""
<root>
<group>
<child id="a">This is child "a".</child>
<child id="b">This is child "b".</child>
</group>
<group>
<child id="c">This is child "c".</child>
</group>
</root>
""")
print('parsed = ', parsed)
# Iterating an element yields its direct children.
for elem in parsed:
    show_node(elem)
ElementTree_XML.py
运⾏结果
parsed = <Element 'root' at 0x00000240F004AB38>
⽂本内容: This is child "a".
id=a
⽂本内容: This is child "b".
id=b
⽂本内容: This is child "c".
id=c
12、利用属性节点为标识,解析XML子节点
from xml.etree.ElementTree import XMLID
# XMLID() parses the string and returns two values: the root element and a
# dict mapping each "id" attribute value to the element that carries it.
# NOTE(review): the rest of this example appears to be cut off in this copy.
tree, id_map = XMLID('''
<root>
<group>
<child id="a">This is child "a".</child>
<child id="b">This is child "b".</child>
</group>
<group>
<child id="c">This is child "c".</child>
</group>
</root>
''')
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论