-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiscover.py
123 lines (108 loc) · 4.66 KB
/
discover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from time import gmtime, strftime
import pandas as pd
import networkx as nx
from networkx.algorithms import community
from utils.filter_edgelist import filter_out_outsiders
from utils.job_load import (
get_edgelist_job_dir_path,
get_job_num,
load_edgelist,
)
def get_in_degree_centrality(G, n=-1):
    """
    Rank the nodes of G by in-degree.

    Returns a DataFrame with columns 'id_str' (the node) and 'in_degree'
    (the value reported by G.in_degree() for that node), sorted by
    'in_degree' in descending order. Pass n to keep only the first n rows
    of that ranking; the default n=-1 returns every node.
    """
    degree_by_node = dict(G.in_degree())
    ranked = pd.DataFrame(
        {
            "id_str": list(degree_by_node.keys()),
            "in_degree": list(degree_by_node.values()),
        }
    ).sort_values(by="in_degree", ascending=False)
    # -1 is the "no limit" sentinel; any other value is handed to head() as-is.
    return ranked if n == -1 else ranked.head(n)
def initialize_graph(edgelist):
    """
    Build a directed NetworkX graph from an edge list and log its size.

    edgelist must be a DataFrame with columns 'source' and 'target'; each row
    becomes an edge from 'source' to 'target'. A UTC-timestamped line with the
    node and edge counts is printed before the graph is returned.
    """
    graph = nx.from_pandas_edgelist(
        edgelist,
        source="source",
        target="target",
        create_using=nx.DiGraph,
    )
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print(
        f"{timestamp}\tInitialized graph with {node_count} nodes and {edge_count} edges"
    )
    return graph
def get_cutoff_perc():
    """
    Prompt for the cutoff percentile `cutoff_perc` and return it as a float.

    explore_graph() multiplies this value by a community's size to obtain the
    in-degree threshold a node must exceed to be selected, so it is expected
    to be a fraction such as 0.8, 0.66 or 0.5.

    The prompt is repeated until the entry parses as a float. Only parse
    failures (ValueError) trigger a retry; KeyboardInterrupt and EOFError
    propagate so the user can still abort the script.
    """
    while True:
        answer = input("Specify cutoff percentile (e.g. 0.8, 0.66, 0.5): ")
        try:
            return float(answer)
        except ValueError:
            # Not a number: ask again instead of falling through and returning None.
            continue
def _write_id_file(path, ids):
    """Write one ID per line to path, in a stable (string-sorted) order."""
    with open(path, "w") as f:
        # f-string formatting tolerates non-str IDs (e.g. ints); sorting makes
        # the file reproducible, since set iteration order is arbitrary.
        f.writelines(f"{i}\n" for i in sorted(ids, key=str))


def explore_graph(Gf, edgelist, cutoff_perc, min_community_size=3):
    """
    Explore graph and record results.

    Gf is split into communities using the first partition yielded by the
    Girvan-Newman algorithm. For each community, the edges of `edgelist` whose
    'source' is a community member are turned into a directed graph, and every
    node whose in-degree in that graph is greater than
    `cutoff_perc * community size` is a candidate. Candidates that are
    themselves community members are dropped; the rest are collected for
    exploration, unless the community has fewer than `min_community_size`
    members, in which case they are collected as "failed" instead.

    Two files are written, one ID per line, using the module-level `job_dir`
    and `job_num`:
      - {job_dir}discover_{job_num}_{cutoff}.csv         IDs to explore
      - {job_dir}discover_failed_{job_num}_{cutoff}.csv  IDs from small communities
    where {cutoff} is str(cutoff_perc) with '.' replaced by '_'.

    Args:
        Gf: graph passed to girvan_newman for community detection.
        edgelist: DataFrame with 'source' and 'target' columns.
        cutoff_perc: fraction of the community size used as in-degree threshold.
        min_community_size: communities smaller than this do not contribute
            to the exploration set.

    Returns:
        0, always.
    """
    # Detect communities using the Girvan-Newman algorithm (based on betweenness centrality).
    # Only the first partition produced by the generator is used.
    communities_generator = community.girvan_newman(Gf)
    to_explore = set()
    failed = set()
    for comm in next(communities_generator):
        # From edgelist, filter out sources not in the community, keep outsiders.
        # isin() is vectorised and always yields a boolean mask, whereas
        # apply(lambda ...) does not guarantee a bool dtype on an empty column.
        comm_member_set = set(comm)
        comm_edgelist = edgelist[edgelist["source"].isin(comm_member_set)]
        print(f"Length of community edgelist: {len(comm_edgelist)}")
        G_comm = nx.from_pandas_edgelist(
            comm_edgelist, source="source", target="target", create_using=nx.DiGraph
        )
        in_deg_cent = get_in_degree_centrality(G_comm)
        in_deg_cutoff = cutoff_perc * len(comm_member_set)
        print(f"Community size: {len(comm_member_set)}")
        print(f"In-degree cutoff: {round(in_deg_cutoff, 2)}")
        in_deg_cent_cutoff = in_deg_cent[in_deg_cent["in_degree"] > in_deg_cutoff]
        print(
            f"Number of members before filtering out community members: {len(in_deg_cent_cutoff)}"
        )
        if in_deg_cent_cutoff.empty:
            print("There are no users to explore")
            continue

        # Compute user IDs to explore--filter out community members
        in_deg_cent_cutoff_f = in_deg_cent_cutoff[
            ~in_deg_cent_cutoff["id_str"].isin(comm_member_set)
        ]
        if len(comm) < min_community_size:
            failed.update(in_deg_cent_cutoff_f["id_str"])
            print(
                f"Excluded {len(in_deg_cent_cutoff_f['id_str'])} users from exploration because of the community's small size ({len(comm)})"
            )
        else:
            to_explore.update(in_deg_cent_cutoff_f["id_str"])
            print(f"Added {len(in_deg_cent_cutoff_f)} users for exploration from community")

    # job_dir and job_num are module-level names set by the script body.
    fname = f"{job_dir}discover_{job_num}_{str(cutoff_perc).replace('.', '_')}.csv"
    _write_id_file(fname, to_explore)
    failed_fname = (
        f"{job_dir}discover_failed_{job_num}_{str(cutoff_perc).replace('.', '_')}.csv"
    )
    _write_id_file(failed_fname, failed)
    print(f"In total, added {len(to_explore)} users for exploration from all communities")
    print(
        f"Now run load.py, select option c (IDs from file) and paste this filename\n\n{fname}\n\nThen, select 0 (initial set only)"
    )
    print(f"{strftime('%Y-%m-%d %H:%M:%S', gmtime())}\tExplored community graphs")
    return 0
# Script body: these statements are not wrapped in a function or a
# `if __name__ == "__main__"` guard.
# job_num and job_dir are module-level names that explore_graph() reads when
# it builds its output file paths, so they must be bound before it is called.
job_num = get_job_num()
job_dir = get_edgelist_job_dir_path(job_num)
# Use unfiltered edgelist for exploration
edgelist = load_edgelist(job_dir)
# Use filtered edgelist (no outsiders) for community detection
filtered = filter_out_outsiders(edgelist)
Gf = initialize_graph(filtered)
# Explore graph: ask the user for the cutoff, then run the exploration,
# which writes the discover_*.csv files.
cutoff_perc = get_cutoff_perc()
explore_graph(Gf, edgelist, cutoff_perc, min_community_size=3)