-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikigraph.py
214 lines (190 loc) · 7.96 KB
/
wikigraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
wikigraph.py:
Implements classes for finding paths between wikipedia articles
and other related functions using the WikiMedia API.
The `WikiGraph.find_path` method is better run in a shell
session or in a batch collection as its use of memoization will
speed up searches whilst it runs, reducing requests to the Wikimedia API.
Example session:
>>> import wikigraph
>>> w = wikigraph.WikiGraph()
>>> path = w.find_path(start="Tom Hanks", end="Kevin Bacon")
>>> print(path)
<wikigraph.Path: Tom Hanks -> Kevin Bacon>
>>> print(path.info)
Path:
Path: Tom Hanks -> Kevin Bacon
Separation: 1 steps
Time Taken: 0.578131 seconds
Requests: 2
>>> path.data
{'start': 'Tom Hanks', 'end': 'Kevin Bacon', 'path': 'Tom Hanks->Kevin Bacon', 'degree': 1}
>>> print(path.json(indent=2))
{
"start": "Tom Hanks",
"end": "Kevin Bacon",
"path": "Tom Hanks->Kevin Bacon",
"degree": 1
}
TODO:
Think about memoization. Some cases make this impractical, for example
the top50 most visited articles have a gigantic amount of links that
direct to them (top50 mean=47300). Perhaps change the memoizing to be
optional and editable within the WikiGraph object instances instead of
just as a function decorator. This would make all functions that need
to make requests to the api more flexable in terms of their memory usage.
TODO:
Investigate strategies for cases where inbound links really big for
similar reasons as above. For example 'United States' has around 463000
links to it, is it a reasonable to loop through of these before checking
the other end of the search.
TODO:
Handle edge cases for when start or end articles have no links pointing
to and from them. Policy now is just to run until the users patience runs
out. Of the 500 random sample collected this did not happen or if it did
wikipedia rediriected us to another page.
'''
from __future__ import print_function
import json
from collections import deque
from datetime import datetime
from wikiapi import WikiAPI
class WikiGraph(WikiAPI):
'''
WikiGraph implements methods for connecting to the WikiMedia API
orientated to finding paths between articles and information between
nodes. Public methods are: `find_path`, `indegree`. When using with
find_path it is better use in a session or in a batch collection as
memoization means it will speed up searches and reduce requests to
the Wikimedia API.
'''
def find_path(self, start, end):
'''Find a valid path between 2 given wikipedia articles
and return a Path object.'''
self.outbound_requests = 0
self.inbound_requests = 0
t1 = datetime.now()
return Path(start=start, end=end,
path=self._bidirectional_search(start, end, self.page_links),
time=(datetime.now() - t1).total_seconds(),
requests=self.outbound_requests + self.inbound_requests)
def indegree(self, title):
'''return the number of links to a given article. This corresponds
to using the "linkshere" property of the WikiMedia api.'''
params = self.inbound_params
params["titles"] = title
# iterate through requests and get a cumsum.
total = 0
for result in self._query(params):
node = next(iter(result['pages'].values()))
if not "linkshere" in node:
print("Title Error: Page missing results? '%s'" % title)
raise StopIteration
total += len(node["linkshere"])
return total
def _bidirectional_search(self, start, end, successors):
'''
Use a unweighted bidirectional search to form paths from start to
end args. This works as Breath-First-Search from both start and end nodes
at the same time balancing traversals by requests made to the api from each
side. It Keeps track of each sides explored nodes returning the shortest
merged path when a overlap occurs.
'''
if start == end:
return [start]
found_paths = []
l_explored, l_front = set(), deque([[start]]) # outbound links
r_explored, r_front = set(), deque([[end]]) # inbound links
while l_front and r_front:
# -> Advance forwards from start.
if self.outbound_requests < self.inbound_requests and l_front:
path = l_front.popleft()
for state in successors(path[-1], False):
if state in l_explored:
# ignore the already explored
continue
l_explored.add(state)
path2 = path + [state]
if state == end:
found_paths.append(path2)
elif state in r_explored:
found_paths.append(self._merge_paths(path2, r_front))
else:
l_front.append(path2)
elif r_front:
# <- Advance backwards from end.
path = r_front.popleft()
for state in successors(path[-1], True):
if state in r_explored:
# ignore the already explored.
continue
r_explored.add(state)
path2 = path + [state]
if state == start:
found_paths.append(path2[::-1])
elif state in l_explored:
# reverse the result so its the right way round.
found_paths.append(self._merge_paths(path2, l_front)[::-1])
else:
r_front.append(path2)
# we waited till we have done a full loop through the
# results of these requests so we can find the shortest
# possible in the data we have recieved.
if found_paths:
return min(found_paths, key=len)
return [] # failed search.
def _merge_paths(self, path, frontier):
'''Connect two overlapping paths from a bidirectional search
and return the shortest one.'''
overlap = path.pop()
return min([path + opposite[::-1]
for opposite in frontier
if overlap in opposite], key=len)
class Path(object):
'''
Path is returned by the WikiGraph.find_path method. It stores
the state of the path whilst implementing methods for displaying
and returning its data.
'''
def __init__(self, start, end, path=[], time=None, requests=None):
self.start = start
self.end = end
self.path = path
self.degree = len(self.path) - 1
self.time = time
self.requests = requests
def __nonzero__(self):
"is the path empty"
return bool(self.path)
def __str__(self):
"More infomative string representation"
return "<%s.%s: %s>" % (self.__class__.__module__,
self.__class__.__name__,
" -> ".join(self.path))
__repr__ = __str__
def __len__(self):
"return the degree (the path len)"
return self.degree
@property
def info(self):
"return infomaiton about a given path"
path = " -> ".join(self.path)
name = self.__class__.__name__
return ("%s:\n"
"\tPath: %s\n"
"\tSeparation: %d steps\n"
"\tTime Taken: %f seconds\n"
"\tRequests: %d\n"
% (name, path, self.degree, self.time, self.requests))
@property
def data(self):
"return path output as a dict."
return dict(start=self.start,
end=self.end,
path="->".join(self.path),
degree=self.degree)
def json(self, **kwargs):
"return result data as json"
return json.dumps(self.data, **kwargs)