#!/usr/bin/env python3
# coding: utf-8
# File: Baike_extract.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-8-11
from urllib import request
from lxml import etree
from urllib import parse
import jieba.posseg as pseg
import os
'''Build the graph visualization page'''
class CreatePage:
    def __init__(self, html_name):
        self.html_name = html_name
        # vis.js page template; the placeholder tokens data_nodes/data_edges
        # are substituted with the real node and edge lists in create_html
        self.base = '''
    <html>
    <head>
      <script type="text/javascript" src="VIS/dist/vis.js"></script>
      <link href="VIS/dist/vis.css" rel="stylesheet" type="text/css">
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    </head>
    <body>
    <div id="VIS_draw"></div>
    <script type="text/javascript">
      var nodes = data_nodes;
      var edges = data_edges;
      var container = document.getElementById("VIS_draw");
      var data = {
        nodes: nodes,
        edges: edges
      };
      var options = {
          nodes: {
              shape: 'dot',
              size: 30,
              font: {
                  size: 14
              }
          },
          edges: {
              font: {
                  size: 14,
                  align: 'middle'
              },
              color: 'red',
              arrows: {
                  to: {enabled: true, scaleFactor: 1.0}
              },
              smooth: {enabled: true}
          },
          physics: {
              enabled: true
          }
      };
      var network = new vis.Network(container, data, options);
    </script>
    </body>
    </html>
    '''

    '''Generate the node and edge data'''
    def collect_data(self, nodes, edges):
        # Map each unique node label to an integer id
        node_dict = {node: index for index, node in enumerate(nodes)}
        data_nodes = []
        data_edges = []
        for node, idx in node_dict.items():
            data = {}
            data["group"] = 'Event'
            data["id"] = idx
            data["label"] = node
            data_nodes.append(data)
        for edge in edges:
            data = {}
            data['from'] = node_dict.get(edge[0])
            data['label'] = 'is-a'
            data['to'] = node_dict.get(edge[1])
            data_edges.append(data)
        return data_nodes, data_edges

    '''Generate the html file'''
    def create_html(self, data_nodes, data_edges):
        with open('{0}.html'.format(self.html_name), 'w+', encoding='utf-8') as f:
            html = self.base.replace('data_nodes', str(data_nodes)).replace('data_edges', str(data_edges))
            f.write(html)
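
# A minimal standalone sketch of how CreatePage fits together; the _demo_
# helper and its node/edge values are illustrative, not part of the pipeline:
def _demo_create_page():
    handler = CreatePage('demo_graph')
    nodes = ['苹果', '水果']
    edges = [['苹果', '水果']]
    data_nodes, data_edges = handler.collect_data(nodes, edges)
    # data_nodes -> [{'group': 'Event', 'id': 0, 'label': '苹果'},
    #                {'group': 'Event', 'id': 1, 'label': '水果'}]
    # data_edges -> [{'from': 0, 'label': 'is-a', 'to': 1}]
    handler.create_html(data_nodes, data_edges)  # writes demo_graph.html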
'''Graph display'''
class EventGraph:
    def __init__(self, relfile, html_name):
        self.relfile = relfile
        self.html_name = html_name
        self.event_dict, self.node_dict = self.collect_events(self.relfile)

    '''Count event frequencies'''
    def collect_events(self, relfile):
        event_dict = {}
        node_dict = {}
        with open(relfile, encoding='utf-8') as f:
            for line in f:
                event = line.strip()
                print(event)
                if not event:
                    continue
                nodes = event.split('->')
                for node in nodes:
                    if node not in node_dict:
                        node_dict[node] = 1
                    else:
                        node_dict[node] += 1
                if event not in event_dict:
                    event_dict[event] = 1
                else:
                    event_dict[event] += 1
        return event_dict, node_dict

    '''Filter out low-frequency events and build the event graph'''
    def filter_events(self, event_dict, node_dict):
        edges = []
        nodes = []
        # Keep only the 2000 most frequent events
        for event in sorted(event_dict.items(), key=lambda item: item[1], reverse=True)[:2000]:
            e1 = event[0].split('->')[0]
            e2 = event[0].split('->')[1]
            if e1 in node_dict and e2 in node_dict:
                nodes.append(e1)
                nodes.append(e2)
                edges.append([e1, e2])
            else:
                continue
        return edges, nodes

    '''Use the VIS plugin to display the event graph'''
    def show_graph(self):
        edges, nodes = self.filter_events(self.event_dict, self.node_dict)
        handler = CreatePage(self.html_name)
        data_nodes, data_edges = handler.collect_data(nodes, edges)
        handler.create_html(data_nodes, data_edges)
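
# The relation file consumed by EventGraph holds one 'hyponym->hypernym' pair
# per line, matching the '->'.join(...) writes in extract_main below; the
# contents here are illustrative only:
#
#   苹果->水果
#   水果->食物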
class BaiduBaike:
    def get_html(self, url):
        # Strip non-breaking-space entities from the raw page before parsing
        return request.urlopen(url).read().decode('utf-8').replace('&nbsp;', '')

    def info_extract_baidu(self, word):  # Baidu Baike
        url = "http://baike.baidu.com/item/%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        info_list = list()
        info_list.append(self.extract_baidu(selector))
        polysemantics = self.checkbaidu_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        # Keep only entries that carry more than the two default fields
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_baidu(self, selector):
        info_data = {}
        if selector.xpath('//h2/text()'):
            info_data['current_semantic'] = selector.xpath('//h2/text()')[0].replace(' ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        # '目录' ("contents") is the TOC header, not a sense name
        if info_data['current_semantic'] == '目录':
            info_data['current_semantic'] = ''
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//span[@class="taglist"]/text()')]
        # Parse the attribute/value pairs of the basic-info infobox
        if selector.xpath("//div[starts-with(@class,'basic-info')]"):
            for li_result in selector.xpath("//div[starts-with(@class,'basic-info')]")[0].xpath('./dl'):
                attributes = [attribute.xpath('string(.)').replace('\n', '') for attribute in li_result.xpath('./dt')]
                values = [value.xpath('string(.)').replace('\n', '') for value in li_result.xpath('./dd')]
                for item in zip(attributes, values):
                    info_data[item[0].replace(' ', '')] = item[1].replace(' ', '')
        return info_data

    def checkbaidu_polysemantic(self, selector):
        semantics = ['https://baike.baidu.com' + sem for sem in
                     selector.xpath("//ul[starts-with(@class,'polysemantList-wrapper')]/li/a/@href")]
        names = [name for name in selector.xpath("//ul[starts-with(@class,'polysemantList-wrapper')]/li/a/text()")]
        info_list = []
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_baidu(selector)
                info_data['current_semantic'] = item[0].replace(' ', '').replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list
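
# A hedged usage sketch (requires network access; Baike page markup changes
# over time, so the xpaths above may need adjusting):
#   baidu = BaiduBaike()
#   infos = baidu.info_extract_baidu('苹果')  # one dict per word sense
#   for info in infos:
#       print(info.get('current_semantic'), info.get('tags'))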
class HudongBaike:
    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace('&nbsp;', '')

    def info_extract_hudong(self, word):  # Hudong Baike
        url = "http://www.baike.com/wiki/%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        info_list = list()
        info_data = self.extract_hudong(selector)
        if selector.xpath('//li[@class="current"]/strong/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current"]/strong/text()')[0].replace(' ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        info_list.append(info_data)
        polysemantics = self.checkhudong_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_hudong(self, selector):
        info_data = {}
        info_data['desc'] = selector.xpath('//div[@id="content"]')[0].xpath('string(.)')
        # '编辑摘要' is the site's "edit summary" button text; drop it from the intro
        info_data['intro'] = selector.xpath('//div[@class="summary"]')[0].xpath('string(.)').replace('编辑摘要', '')
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//p[@id="openCatp"]/a/text()')]
        # Infobox pairs live in table cells: <strong> holds the label, <span> the value
        for info in selector.xpath('//td'):
            attribute = info.xpath('./strong/text()')
            val = info.xpath('./span')
            if attribute and val:
                value = val[0].xpath('string(.)')
                info_data[attribute[0].replace(':', '')] = value.replace('\n', '').replace(' ', '')
        return info_data

    def checkhudong_polysemantic(self, selector):
        semantics = [sem for sem in selector.xpath("//ul[@id='polysemyAll']/li/a/@href") if 'doc_title' not in sem]
        names = [name for name in selector.xpath("//ul[@id='polysemyAll']/li/a/text()")]
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_hudong(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list
class SougouBaike:
    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace('&nbsp;', '')

    def find_sofouid(self, word):
        # Search Sogou Baike first to resolve the word to an entry url
        url = "http://baike.sogou.com/Search.e?sp=S%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        entry_id = selector.xpath('//h2/a/@href')[0].split(';')[0]
        info_url = "http://baike.sogou.com/%s" % entry_id
        return info_url

    def info_extract_sogou(self, word):  # Sogou Baike
        info_url = self.find_sofouid(word)
        selector = etree.HTML(self.get_html(info_url))
        info_list = list()
        info_data = self.extract_sogou(selector)
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace(' ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        info_list.append(info_data)
        polysemantics = self.checksogou_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_sogou(self, selector):
        info_data = {}
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//div[@class="relevant_wrap"]/a/text()')]
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace(' ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        # Infobox pairs sit in abstract_list tables: <th> label, <td> value
        tables = selector.xpath('//table[@class="abstract_list"]')
        for table in tables:
            attributes = table.xpath('./tbody/tr/th/text()')
            values = [td.xpath('string(.)') for td in table.xpath('./tbody/tr/td')]
            for item in zip(attributes, values):
                info_data[item[0].replace(' ', '').replace('\xa0', '')] = item[1].replace(' ', '')
        return info_data

    def checksogou_polysemantic(self, selector):
        semantics = ['http://baike.sogou.com' + sem.split('?')[0] for sem in selector.xpath("//ol[@class='semantic_item_list']/li/a/@href")]
        names = [name for name in selector.xpath("//ol[@class='semantic_item_list']/li/a/text()")]
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_sogou(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list
class SemanticBaike:
    def __init__(self):
        cur = '/'.join(os.path.realpath(__file__).split('/')[:-1])
        self.tmp_file = os.path.join(cur, 'word_concept.txt')

    '''Extract the concept from the instance word itself'''
    def extract_concept(self, word):
        # Use the last noun of the segmented word as its concept
        wds = [w.word for w in pseg.cut(word) if w.flag[0] in ['n']]
        if not wds:
            return ''
        else:
            return wds[-1]
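
    # For example, if pseg.cut('苹果公司') tags '苹果'/n and '公司'/n, the
    # concept returned would be '公司'; exact segmentation can vary with the
    # jieba version and dictionary, so treat this as an illustration only.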
    '''Align the semantic concepts obtained from the three encyclopedias'''
    def extract_main(self, word):
        f = open(self.tmp_file, 'w+', encoding='utf-8')
        baidu = BaiduBaike()
        hudong = HudongBaike()
        sogou = SougouBaike()
        semantic_dict = {}
        semantics = []
        tuples = []
        concepts_all = []
        baidu_info = [[i['current_semantic'], i['tags']] for i in baidu.info_extract_baidu(word)]
        hudong_info = [[i['current_semantic'], i['tags']] for i in hudong.info_extract_hudong(word)]
        sogou_info = [[i['current_semantic'], i['tags']] for i in sogou.info_extract_sogou(word)]
        semantics += baidu_info
        semantics += hudong_info
        semantics += sogou_info
        # Merge the tag lists of senses that share the same name
        for i in semantics:
            instance = i[0]
            concept = i[1]
            if not instance:
                continue
            if instance not in semantic_dict:
                semantic_dict[instance] = concept
            else:
                semantic_dict[instance] += concept
        # Extract the hypernym-hyponym relations obtained from the encyclopedias
        for instance, concepts in semantic_dict.items():
            concepts = set([i for i in concepts if i not in ['', ' ']])
            concept_pre = self.extract_concept(instance)
            concepts_all += concepts
            concepts_all += [concept_pre]
            tuples.append([word, instance])
            tuples.append([instance, concept_pre])
            for concept in concepts:
                tuples.append([instance, concept])
        # Extract hypernym relations among the concept words themselves: if one
        # concept string contains another, treat the shorter one as the hypernym
        tmps = [[i, j] for i in concepts_all for j in concepts_all if j in i and i and j]
        tuples += tmps
        for pair in tuples:
            if pair[0] != pair[1]:
                f.write('->'.join(pair) + '\n')
        f.close()
        print(tuples)
        handler = EventGraph(self.tmp_file, word)
        handler.show_graph()
'''Render a previously extracted concept file'''
def show_graph():
    cur = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    concept_file = os.path.join(cur, 'baike_concept.txt')
    handler = EventGraph(concept_file, 'baike_concept')
    handler.show_graph()

if __name__ == '__main__':
    handler = SemanticBaike()
    handler.extract_main('苹果')
    # handler.walk_concept_chain('西瓜')
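
# Running this module crawls the three encyclopedias for '苹果' (apple),
# writes word_concept.txt, and generates 苹果.html with the is-a graph.
# To re-render an existing concept file without crawling, call show_graph()
# instead (assumes baike_concept.txt sits next to this script).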