XPath 是一门在 HTML/XML 文档中查找信息的语言,Python 的 lxml 库提供了对 XPath 的支持。在 Python 中导入 lxml 后,就可以解析 HTML/XML 文档并用 XPath 进行查询。

安装

pip install lxml

基础应用

from lxml import etree # 导入xpath
# lxml.etree parsing entry points:
#   etree.XML(content)   parses an XML document held in a string
#   etree.HTML(content)  parses an HTML document held in a string
#   etree.parse(path)    parses a file given its path

# Sample XML document used by every query below.
xml = """
<book>
<id>1</id>
<name>野花遍地香</name>
<price>1.23</price>
<nick>臭豆腐</nick>
<author>
<nick id="10086">周大强</nick>
<nick id="10010">周星驰</nick>
<nick class="jay">周杰伦</nick>
<nick class="jolin">蔡依林</nick>
<div>
<nick>明星</nick>
</div>
<span>
<nick>演员</nick>
</span>
</author>
<partner>
<nick id="ppc">胖胖陈</nick><nick id="ppbc">胖胖不陈</nick>
</partner>
</book>
"""
tree = etree.XML(xml)  # parse the XML string

# Select nodes: an absolute path walks down from the root element.
result = tree.xpath("/book/name")  # the <name> element

# Select the text inside nodes with text().
result = tree.xpath("/book/name/text()")  # ['野花遍地香']
# <author> is a direct child of <book>, so the path goes through /book/author.
result = tree.xpath("/book/author/nick/text()")  # ['周大强', '周星驰', '周杰伦', '蔡依林']
# XPath indexes start at 1, so nick[1] is the first <nick> child: 周大强.
result = tree.xpath("/book/author/nick[1]/text()")  # ['周大强']
# Filter by attribute: text of the <nick> whose class is "jay".
result = tree.xpath("/book/author/nick[@class='jay']/text()")  # ['周杰伦']
# Select attribute values; only <nick> children that carry a class contribute.
result = tree.xpath("/book/author/nick/@class")  # ['jay', 'jolin']

# A double slash (//) matches descendants at any depth; * is a wildcard that
# matches any element name.
# Text of every <nick> anywhere below <author>, including those nested in
# <div> and <span>.
result = tree.xpath("/book/author//nick/text()")  # ['周大强', '周星驰', '周杰伦', '蔡依林', '明星', '演员']
# Text of the <nick> elements exactly one level below <author>: 明星 and 演员.
result = tree.xpath("/book/author/*/nick/text()")  # ['明星', '演员']

# Positional filtering with position().
# position() > 2 keeps the 3rd match onward. The position is counted among the
# <nick> children of each parent separately, so the <nick> inside <div> and
# <span> (each at position 1) is not selected.
result = tree.xpath("/book/author//nick[position()>2]/text()")  # ['周杰伦', '蔡依林']