-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNote_xpath.py
252 lines (211 loc) · 8.59 KB
/
Note_xpath.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env python
# encoding: utf-8
# Note_xpath.py
# product
# Created by txooo on 2018/11/19
# Copyright © 2018 txooo. All rights reserved.
"""
XPath usage notes and experiments based on lxml.

Install the dependency before running:
    pip install lxml
"""
from lxml import html
""" 示例一
page_source = '''
<div>
<ul id="side-menu">
<li class="active">
<a href="http://www.baidu.com/ws/project1/index.html">
<i>图标</i>
电子账户
<span>箭头</span>
</a>
<ul class="nav">
<li>子菜单1</li>
<li>子菜单2</li>
</ul>
</li>
</ul>
<a href="http://www.baidu.com/ws/project2/index.html">1</a>
<a href="http://www.baidu.com/ws/project2/login.html">2</a>
<a href="http://www.baidu.com/xm/project2/index.htm ">2</a>
</div>
'''
html_etree = html.etree.HTML(page_source)
# 查找所有含ws的a标签
xpath_regx_ws = 'contains(%s, "%s")' % ('@href', 'ws')
href_ws = html_etree.xpath('//a[%s]' % xpath_regx_ws)
print(href_ws)
for each in href_ws:
print(each.xpath('@href'))
# 查找所有含ws或xm且含index.htm的标签
xpath_regx_ws_index = '(contains(%s, "%s") or contains(%s, "%s")) and (contains(%s, "%s"))' % ('@href', 'ws', '@href', 'xm', '@href', 'index.htm')
href_ws_index = html_etree.xpath('//a[%s]' % xpath_regx_ws_index)
print(href_ws_index)
for each in href_ws_index:
print(each.xpath('@href')[0].strip())
"""
""" 示例二
html_str = '''
<div class="sort_nr">
<div class="sort_cont1"> <img src="http://www.78.cn/imgs/2018-11-22/201811221824235892591.jpg"></div><div class="sort_cont3">累计时间:个月<br> 信誉积分:分<em></em></div><div class="sort_cont4">投资:¥<strong>3-5万</strong></div><div class="sort_cont5"><span class="fl">仅剩名额<font color="#ec384b"><i id="num_js">18</i></font>个</span><p><em style="width: 84%;"></em></p></div>
<script src="http://msg78.tbkf.net/pc/xxpf_phone.php?gid=8695"></script><div class="sort_cont6">
<ul style="margin-top: 0px;">
<li>广东梅州 1376666****<br>
正在与该项目进行<span class="axzx">在线咨询</span></li><li>宁波北伦 1820812****<br>
正在与该项目进行<span class="mfth">免费通话</span></li><li>河北唐山 1599490****<br>
正在与该项目进行<span class="mfth">免费通话</span></li><li>河北保定 1363514****<br>
正在与该项目进行<span class="mfth">免费通话</span></li><li>山西大同 1323821****<br>
正在与该项目进行<span class="axzx">在线咨询</span></li><li>北京昌平 1377117****<br>
正在与该项目进行<span class="mfth">免费通话</span></li><li>上海松江 1363514****<br>
正在与该项目进行<span class="axzx">在线咨询</span></li><li>天津和平 1523899****<br>
正在与该项目进行<span class="mfth">免费通话</span></li><li>安徽淮北 1396787****<br>
正在与该项目进行<span class="mfth">免费通话</span></li><li>安徽铜陵 1361454****<br>
正在与该项目进行<span class="mfth">免费通话</span></li></ul>
</div>
<div class="sort_cont11"><a href="http://gb.78.cn/huodong/hongbao/showHongbao.php?gid=8695&hongbao_type=1&hongbao=700&is_ggy=1&sid=61" class="_thickbox"><span><i>¥</i>700</span></a></div>
<div class="sort_cont9"><a data-tbchatlink="true" href="http://qudao.tbkf.net/TongBao/chatadp.php?adp=pc&channel_id=31604&proj_id=40056&plat_id=1&ids=31604&vpage=http%253A%252F%252Fwww.78.cn%252Fweb%252Fzhajixingqiu%252Findex.htm&adp_ver=2&chat_ver=2.0&extra_data=custom_id=8695&user_id=0&basekw=&uckloadid=80F5nB.4a030e4&tkid=80F5nB.XKHYvF&isweb=1&pinit=1556263179&rpage=&sid=61" target="_blank"></a></div>
<div class="sort_cont10"><span>区域:</span>北京市<br>
<span>名称:</span>北京快道网络有限公司</div>
</div>
'''
xpath_str = '//div[@class="sort_cont10"]'
etree_item = html.etree.HTML(html_str)
res = etree_item.xpath(xpath_str)
print(res)
for each in res:
res_arr = each.xpath('text()')
for earh in res_arr:
print(earh, type(earh))
print(res_arr, res_arr[0], res_arr[-1])
"""
""" 示例三
html_str = '''
<div class="header-bot laout clearfix zoom">
<ul>
<li class="bot-li1"><img src="/images/icon1.jpg"> 品牌名称:<span>乐堂口手工拉茶茶饮</span></li>
<li><img src="/images/icon2.jpg"> 投资额度:<span>5-10万</span></li>
<li><img src="/images/icon3.jpg"> 招商对象:创业者</li>
<li class=" bot-li8"><img src="/images/icon4.jpg"> 证件资质:三证齐全</li>
<li class="bot-li1"><img src="/images/icon5.jpg"> 品牌发源地:上海市</li>
<li><img src="/images/icon6.jpg"> 所属行业:餐饮娱乐</li>
<li><img src="/images/icon7.jpg"> 经营模式:全国连锁</li>
<li class=" bot-li8"><img src="/images/icon8.jpg"> 公司名称:上海少权餐饮企业管理有限公司</li>
</ul>
<div>
<a href="#igbook" class="input bot-liuyan">给我留言</a>
<a href="#igbook" class="bot-zixun input">免费咨询</a>
</div>
</div>
'''
xpath_str = '//div[@class="header-bot laout clearfix zoom"]/ul'
xpath_str_1 = 'li[@class="bot-li1"]/text()'
xpath_str_2 = 'li[@class=" bot-li8"]/text()'
etree_item = html.etree.HTML(html_str)
res = etree_item.xpath(xpath_str)[0]
print(res)
res1 = res.xpath(xpath_str_1)[-1]
res2 = res.xpath(xpath_str_2)[-1]
print(res1, type(res1), str(res1).strip())
res1 = str(res1).split(':')
print(res1)
print(res2)
"""
# Example 4: select the link texts of one specific row among several <li>
# elements that share the same class.
html_str = """
<ul style="width: 873px;">
<li class="xmxq_SPX_img">
<img src="https://static.2958.cn/mypic/2017-03/20170315102702.gif" width="96" height="58" alt="食里留香小吃车">
</li>
<li class="xmxq_SPX_tit">
<h1>食里留香小吃车</h1>
<span class="xmxq_show-pro-but">
<a href="javascript:void(0);" class="show-pro-but-gbok S-click-gok">给我留言</a>
<a href="#igbook" class="show-pro-but-tel S-click-tel">免费咨询</a>
</span>
</li>
<li class="xmxq_SPX_xmbq">
<span>
<i>所属行业:</i>
<a href="javascript:;" target="_blank">餐饮娱乐</a>
</span>
<span>
<i>投资金额:</i>
<a href="javascript:;" class="red" target="_blank">1-3万</a>
</span>
</li>
<li class="xmxq_SPX_xmbq">
<span>
<i>所在区域:</i>
<a href="javascript:;" target="_blank">丰台区</a>
</span>
<span>
<i>在线咨询:</i>
<a href="javascript:;" target="_blank">108人</a>
</span>
</li>
<li class="xmxq_SPX_xmbq">
<span>
<i>创业类型:</i>
<a href="javascript:;" target="_blank">加盟开店</a>
</span>
<span>
<i>消费人群:</i>
<a href="javascript:;" target="_blank">白领工人</a>*
<a href="javascript:;" target="_blank">成人</a>
</span>
</li>
</ul>
"""
# Absolute path to the list, then a path evaluated relative to that list.
xpath_str = "//ul"
xpath_str_1 = (
    'li[@class="xmxq_SPX_xmbq"]'  # keep only the <li> children with this class
    "[position()=2]"  # of those filtered rows, take the second one
    "/span/a/text()"  # text of every <a> inside its <span> children
)
etree_item = html.etree.HTML(html_str)
# xpath() returns a list of matches; the fixture contains exactly one <ul>.
ul_matches = etree_item.xpath(xpath_str)
res = ul_matches[0]
print(res)
# Chained predicates apply in order, so position() counts within the rows
# that already passed the class filter, not among all <li> children.
res1 = res.xpath(xpath_str_1)
print(res1, type(res1))
#
# res1 = str(res1).split(':')
# print(res1)
# print(res2)
# Print each character
# for fds in xpath_str:
# print(fds)
# Example 5: iterate over list items and read the text that sits directly
# inside each item's <a> element.
html_str = """
<ul data-qhtml="true" class="data-list sty2 sp">
<li data-qhtml="true" class="active">
<a data-qhtml="true" href="/web/qswd/index.htm?id=684476&args=79.1996.799.3631.0.204279.15532.3....0&Sid=148536" title="">
<i data-qhtml="true">1</i>
<span data-qhtml="true">3-5万</span>
陕西美食 四季客满 回头客多
</a>
</li>
<li data-qhtml="true" class="active">
<a data-qhtml="true" href="/web/tdunclexc/index.htm?id=619702&args=79.1996.799.3631.0.186421.14504.3....0&Sid=148536" title="">
<i data-qhtml="true">2</i>
<span data-qhtml="true">1-3万</span>
七彩土豆 无大厨 轻松立店
</a>
</li>
</ul>
"""
etree_item = html.etree.HTML(html_str)
res = etree_item.xpath("//ul/li")
for item in res:
    # text() selects only the direct text children of <a>; the text inside
    # the nested <i> and <span> elements is not included. The per-item result
    # gets its own name so the list being iterated (`res`) is not rebound
    # from inside the loop.
    link_texts = item.xpath("a/text()")
    print(link_texts)
# Example 6: text() versus normalize-space() on an element with mixed content.
html_str = """
<div>a
<p>b</p>
<p>c</p>
d
</div>
"""
res1 = html.etree.HTML(html_str)
# text() returns a list holding each direct text child of <div> separately,
# with the surrounding newlines left in place.
res = res1.xpath("//div/text()")
print(res)
# normalize-space() takes a string argument. When it is handed a node-set,
# XPath 1.0 converts it using the string-value of the FIRST node only:
#   - //div         -> the whole text of the <div>, descendants included,
#                      collapsed to "a b c d"
#   - //div/text()  -> only the first text child is used, giving "a"
# The call does not fail on a multi-node set; the remaining nodes are simply
# ignored. ("normalize-space(.)" applies the same rule to the context node.)
for expr in ("normalize-space(//div)", "normalize-space(//div/text())"):
    res = res1.xpath(expr)
    print(res, type(res))