forked from jdanceze/cg_parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_std_lib_edges.py
201 lines (172 loc) · 7.94 KB
/
remove_std_lib_edges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/env python3
'''
This script removes all edges into standard-library methods and replaces
them with edges to the application methods that are transitively reachable
through the removed edges.
'''
import configparser
import csv
import json
import os
import queue
import sys
from collections import defaultdict, deque, namedtuple
from pathlib import Path
# Method name that remove_stdlib_edges always adds to the application method
# set (presumably the synthetic call-graph root -- confirm against the data).
ROOT_METHOD = "<boot>"

# One outgoing call edge: call-site bytecode offset and callee name.
Edge = namedtuple("Edge", ["bytecodeOffset", "dest"])

# Adjacency-list entry for one method; only `edges` and `reachable_app_nodes`
# are read or written by the functions in this file.
Node = namedtuple(
    "Node",
    "edges reachable_app_nodes reachableNodes isStdLibNode",
)

# File locations are read from settings.ini, looked up relative to the
# current working directory.
config = configparser.ConfigParser()
config.read('settings.ini')
def main():
    """Write the combined edge dataset with standard-library edges removed.

    The three file locations are read from the ``Paths`` section of the
    module-level ``config``:

    * ``labeled_csv_file``      -- input edge list (CSV)
    * ``method_reference_file`` -- application methods, one per line
    * ``combined_dataset_file`` -- output CSV

    The call graph is reduced twice with ``remove_stdlib_edges``: once with
    the transitive closure and once without.  Every distinct edge of the
    closed graph is written once with ``wala`` set to "1"; ``wala-direct``
    is "1" when the same (method, offset, target) edge is also present in
    the graph built without the closure, and "0" otherwise.
    """
    analysisfile = Path(config.get('Paths', 'labeled_csv_file'))
    methodsfile = Path(config.get('Paths', 'method_reference_file'))
    outputfile = Path(config.get('Paths', 'combined_dataset_file'))

    nodes_with_closure = remove_stdlib_edges(analysisfile, methodsfile, True)
    nodes_without_closure = remove_stdlib_edges(analysisfile, methodsfile, False)

    # Edges that exist without the closure, i.e. direct application edges.
    without_closure_edges = set()
    for name, obj in nodes_without_closure.items():
        for edge in obj.edges:
            without_closure_edges.add((name, edge.bytecodeOffset, edge.dest))

    # The closure can produce the same edge several times; emit each once.
    printed = set()
    # newline="" leaves line termination to the csv writer; without it every
    # row ends in "\r\r\n" on platforms that translate "\n" on write.
    with open(outputfile, "w", newline="") as filep:
        writer = csv.writer(filep)
        writer.writerow(["method", "offset", "target", "wala", "wala-direct"])
        for name, obj in nodes_with_closure.items():
            for edge in obj.edges:
                edge_id = (name, edge.bytecodeOffset, edge.dest)
                if edge_id in printed:
                    continue
                printed.add(edge_id)
                direct = "1" if edge_id in without_closure_edges else "0"
                writer.writerow(
                    (name, edge.bytecodeOffset, edge.dest, "1", direct))
def empty_node():
    """Build a fresh Node: no edges, empty reachability sets, not std-lib.

    Each call creates new list/set objects, so nodes never share containers.
    """
    return Node(
        edges=[],
        reachable_app_nodes=set(),
        reachableNodes=set(),
        isStdLibNode=False,
    )
def remove_stdlib_nodes(nodes, stdlibnodes):
    """Drop every name yielded by ``stdlibnodes`` from ``nodes``, in place.

    Returns the same ``nodes`` mapping that was passed in.  A name that is
    not a key of ``nodes`` raises ``KeyError``.
    """
    for stdlib_name in stdlibnodes:
        # pop() without a default raises KeyError on a missing key
        nodes.pop(stdlib_name)
    return nodes
def get_std_lib_nodes_directly_called(static_analysis_nodes, std_lib_nodes):
    """Return the set of std-lib node names that are the target of an edge.

    Every edge of every node in ``static_analysis_nodes`` is inspected; a
    destination is collected when it is a member of ``std_lib_nodes``.
    Destinations that are absent from ``std_lib_nodes`` are never returned.
    """
    return {
        call.dest
        for caller in static_analysis_nodes.values()
        for call in caller.edges
        if call.dest in std_lib_nodes
    }
def compute_reachable_application_nodes(std_lib_nodes,
                                        application_method_list,
                                        static_analysis_nodes):
    """Fill in ``reachable_app_nodes`` for directly called std-lib nodes.

    For every std-lib node that is the target of an edge in
    ``static_analysis_nodes``, a breadth-first search follows outgoing edges
    through ``std_lib_nodes`` only.  Each visited name that is a member of
    ``application_method_list`` is added to that start node's
    ``reachable_app_nodes`` set.

    Args:
        std_lib_nodes: mapping of std-lib method name to node; mutated.
        application_method_list: container of application method names.
        static_analysis_nodes: mapping of method name to node, used to find
            which std-lib nodes are called directly.

    Returns:
        The same ``std_lib_nodes`` mapping.
    """
    directly_called = get_std_lib_nodes_directly_called(
        static_analysis_nodes, std_lib_nodes)

    for start in directly_called:
        seen = {start}
        # deque gives O(1) FIFO pops without the locking of queue.Queue
        frontier = deque([start])
        while frontier:
            current = frontier.popleft()
            if current not in std_lib_nodes:
                # Not expandable: names outside std_lib_nodes have no edge
                # list here, so the search stops at them.
                continue
            for edge in std_lib_nodes[current].edges:
                if edge.dest not in seen:
                    seen.add(edge.dest)
                    frontier.append(edge.dest)

        # Keep only the application methods among everything visited.
        std_lib_nodes[start].reachable_app_nodes.update(
            name for name in seen if name in application_method_list)
    return std_lib_nodes
def get_std_lib_nodes(static_analysis_nodes, application_method_list):
    """Return a new dict holding the nodes treated as standard library.

    A node counts as standard library when its name is not a member of
    ``application_method_list``.  The node objects themselves are shared
    with ``static_analysis_nodes``, not copied.
    """
    return {
        name: node
        for name, node in static_analysis_nodes.items()
        if name not in application_method_list
    }
def replace_std_lib_edges_with_app_edges(
        static_analysis_nodes, std_lib_nodes, application_method_list):
    """Rewrite each node's edge list so it only targets application methods.

    For every node in ``static_analysis_nodes``:

    * edges whose destination is in ``application_method_list`` are kept,
      in their original order;
    * every other edge is dropped, including each copy of a duplicated edge;
    * for a dropped edge whose destination is a key of ``std_lib_nodes``,
      one new ``Edge`` with the same bytecode offset is appended for each
      name in that std-lib node's ``reachable_app_nodes``.

    The ``edges`` list of each node is modified in place, so other
    references to the same list observe the change.  Nothing is returned.
    The caller is expected to have removed std-lib nodes from
    ``static_analysis_nodes`` already.
    """
    for node_object in static_analysis_nodes.values():
        kept_edges = []
        replacement_edges = []
        for edge in node_object.edges:
            if edge.dest in application_method_list:
                kept_edges.append(edge)
                continue
            # Edge into the standard library: drop it, and substitute the
            # application methods reachable through its destination, if any.
            if edge.dest in std_lib_nodes:
                for app_node in std_lib_nodes[edge.dest].reachable_app_nodes:
                    replacement_edges.append(
                        Edge(edge.bytecodeOffset, app_node))
        # Slice assignment keeps the same list object (Node is immutable) and
        # removes *all* std-lib edges, not just one copy of each.
        node_object.edges[:] = kept_edges + replacement_edges
def remove_stdlib_edges(analysisfile, methodsfile, do_transitive_closure):
    """Load the call graph and strip every edge into the standard library.

    Args:
        analysisfile: CSV file with at least the columns ``method``,
            ``offset`` and ``target``; one row per call edge.
        methodsfile: text file listing one application method per line.
            Trailing whitespace is stripped from each line.
        do_transitive_closure: when true, a removed std-lib edge is replaced
            by edges to the application methods reachable through its
            destination; when false, std-lib edges are simply dropped.

    Returns:
        A ``defaultdict`` mapping each application method that has outgoing
        edges in the input to its ``Node``.  Methods that never appear in
        the ``method`` column have no entry.
    """
    # Adjacency list keyed by caller name.
    nodes = defaultdict(empty_node)

    # newline="" is how the csv module expects its input files to be opened
    # (needed for correct handling of newlines inside quoted fields).
    with open(analysisfile, newline="") as filep:
        for row in csv.DictReader(filep):
            nodes[row["method"]].edges.append(
                Edge(row["offset"], row["target"]))

    # Anything not in this set is treated as a standard-library method.
    appmethodlist = {ROOT_METHOD}
    with open(methodsfile) as filep:
        appmethodlist.update(line.rstrip() for line in filep)

    # Split the std-lib nodes out of the graph before rewriting edges.
    stdlibnodes = get_std_lib_nodes(nodes, appmethodlist)
    remove_stdlib_nodes(nodes, stdlibnodes)

    if do_transitive_closure:
        compute_reachable_application_nodes(stdlibnodes, appmethodlist, nodes)
    # Without the closure every reachable_app_nodes set is empty, so this
    # call only deletes the std-lib edges.
    replace_std_lib_edges_with_app_edges(nodes, stdlibnodes, appmethodlist)
    return nodes
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()