-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_processor.py
145 lines (119 loc) · 5.48 KB
/
data_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Data generator
import copy
import pickle
import csv
import json
import networkx as nx
import random
from random import randrange
def load_pickle(fname):
    """Load and return the object stored in the pickle file ``fname``.

    Args:
        fname: Path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object was pickled into the file.

    SECURITY: ``pickle.load`` can execute arbitrary code while
    deserializing. Only call this on files from a trusted source.
    """
    with open(fname, 'rb') as f:
        return pickle.load(f)
class Eth_graph_dataset:
    """Sample nodes and edges from a pickled transaction graph into CSV files.

    The graph is expected to behave like a NetworkX multigraph: ``G.nodes``
    maps each node to an attribute dict holding an ``'isp'`` label,
    iterating ``G`` yields nodes, ``G[u]`` yields the neighbours of ``u``
    and ``G[u][v]`` maps edge keys to attribute dicts holding ``'amount'``
    and ``'timestamp'``.
    """

    # Constants taken from the original hard-coded anchor list
    # ``[0] * (2973489 - 1165) + [1] * 1165``.
    # NOTE(review): 2973489 is presumably the node count of the reference
    # graph and 1165 its number of phishing nodes -- confirm.
    _REFERENCE_NODE_COUNT = 2973489
    _ANCHOR_COUNT = 1165

    def __init__(self, file_name):
        """Remember the pickle path; nothing is read until ``load()``."""
        self.file_name = file_name
        # Optional extra labels, filled by ``load_json_auxilary_label``.
        self.more_labels = None

    def load(self):
        """Read the pickled graph into ``self.G`` and print a summary."""
        self.G = self._load_pickle(self.file_name)
        # ``nx.info`` was removed in NetworkX 3.0; printing the graph
        # itself yields the equivalent one-line summary there.
        summarize = getattr(nx, 'info', None)
        print(summarize(self.G) if summarize is not None else self.G)

    def _load_pickle(self, fname):
        """Return the object stored in the pickle file ``fname``.

        SECURITY: ``pickle.load`` can execute arbitrary code while
        deserializing. Only use it on trusted files.
        """
        with open(fname, 'rb') as f:
            return pickle.load(f)

    def load_json_auxilary_label(self, path='addresses-darklist.json'):
        """Load extra labelled addresses from a JSON file.

        Args:
            path: JSON file to read. Defaults to the originally hard-coded
                ``addresses-darklist.json``. The content is used later as
                an iterable of dicts that each have an ``'address'`` key.
        """
        with open(path, 'r', encoding='utf-8') as file:
            self.more_labels = json.load(file)

    def sample_nodes_edges(self, n_nodes, nodes_file=None, edges_file=None):
        """Write a node sample and its surrounding edges to CSV files.

        Three files are produced: the nodes file, the edges file and a
        sibling ``<edges stem>_onlyEdges.csv`` holding only source/target.

        Args:
            n_nodes: Number of randomly drawn nodes to add on top of the
                labelled ones.
            nodes_file: Output path for nodes; defaults to
                ``nodes_eth_{n_nodes}.csv``.
            edges_file: Output path for edges; defaults to
                ``edges_eth_{n_nodes}.csv``.
        """
        if not nodes_file:
            nodes_file = f'nodes_eth_{n_nodes}.csv'
        if not edges_file:
            edges_file = f'edges_eth_{n_nodes}.csv'

        graph = self.G
        selected = self._write_nodes(graph, n_nodes, nodes_file)
        self._write_edges(graph, selected, edges_file)

    def _write_nodes(self, graph, n_nodes, nodes_file):
        """Write the selected nodes to ``nodes_file`` and return them as a set."""
        attrs = graph.nodes
        node_list = list(attrs)
        print(f'length: {len(node_list)}')

        # Pick the anchor positions directly instead of building and
        # shuffling a multi-million element 0/1 list. Using at least the
        # reference size keeps the original anchor density for smaller
        # graphs; using the real size avoids an IndexError on larger ones.
        population = max(len(node_list), self._REFERENCE_NODE_COUNT)
        anchor_positions = set(
            random.sample(range(population), self._ANCHOR_COUNT))

        selected = set()
        with open(nodes_file, 'w', encoding='UTF8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['node', 'address', 'isp', 'is_anchor'])

            def emit(position, node, label):
                # The anchor flag depends on the node's own position, so a
                # node always gets the same flag.
                selected.add(node)
                writer.writerow(
                    [node, node, label, int(position in anchor_positions)])

            # Add every node labelled 1 in the original dataset.
            for position, node in enumerate(node_list):
                label = attrs[node]['isp']
                if label == 1:
                    emit(position, node, label)
            print(f'{len(selected)} with label 1 added')

            # Add label 2 for nodes listed in the auxiliary labels. Without
            # auxiliary labels there are zero nodes with label 2.
            second_label_count = 0
            if self.more_labels is not None:
                extra_addresses = {
                    entry['address'] for entry in self.more_labels}
                for position, node in enumerate(node_list):
                    if attrs[node]['isp'] == 0 and node in extra_addresses:
                        # Unlike before, these nodes also join ``selected``
                        # so their edges are exported with the others.
                        emit(position, node, 2)
                        second_label_count += 1
            print(f'{second_label_count} with label 2 added')

            # Draw the random nodes without replacement and only from
            # nodes not written yet, so no node appears twice in the CSV.
            remaining = [
                position for position, node in enumerate(node_list)
                if node not in selected]
            sample_size = max(0, min(n_nodes, len(remaining)))
            for position in random.sample(remaining, sample_size):
                node = node_list[position]
                emit(position, node, attrs[node]['isp'])
            print(f'{len(selected)} nodes added in total')
        return selected

    def _write_edges(self, graph, selected, edges_file):
        """Write edges touching ``selected`` plus edges among their neighbours."""
        # Local import: ``os`` is not among the file's top-level imports.
        import os

        # splitext only strips the final extension, so dots elsewhere in
        # the path (e.g. ``./out/edges.csv``) no longer truncate the name.
        only_edges_file = f'{os.path.splitext(edges_file)[0]}_onlyEdges.csv'

        neighbors = set()
        seen_rows = set()
        with open(edges_file, 'w', encoding='UTF8', newline='') as f, open(
                only_edges_file, 'w', encoding='UTF8',
                newline='') as f_only_edges:
            writer = csv.writer(f)
            writer_only_edges = csv.writer(f_only_edges)
            writer.writerow(['source', 'target', 'amount', 'timestamp'])

            def emit(u, v):
                # graph[u][v] maps each parallel-edge key to its attributes.
                parallel_edges = graph[u][v]
                for key in parallel_edges:
                    edge = parallel_edges[key]
                    row = [u, v, edge['amount'], edge['timestamp']]
                    fingerprint = str(row)
                    # Both passes can reach the same edge; write it once.
                    if fingerprint not in seen_rows:
                        seen_rows.add(fingerprint)
                        writer.writerow(row)
                        writer_only_edges.writerow([u, v])

            # Pass 1: edges with at least one selected endpoint; remember
            # the node on the other side as a neighbour.
            for u in graph:
                u_selected = u in selected
                for v in graph[u]:
                    if u_selected:
                        neighbors.add(v)
                    elif v in selected:
                        neighbors.add(u)
                    else:
                        continue
                    emit(u, v)

            # Pass 2: edges running between two neighbours of the selection.
            for u in graph:
                if u not in neighbors:
                    continue
                for v in graph[u]:
                    if v in neighbors:
                        emit(u, v)
        print(f'{len(seen_rows)} edges added')
if __name__ == '__main__':
    # Running this script produces three CSV files in the working directory:
    #   nodes_eth_1165.csv
    #   edges_eth_1165.csv
    #   edges_eth_1165_onlyEdges.csv
    dataset = Eth_graph_dataset('./MulDiGraph.pkl')
    dataset.load()
    # edges_file=None selects the default edges file name.
    dataset.sample_nodes_edges(1165, edges_file=None)