dataset_formater.py
import pandas as pd
import data_loader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import issue_linker
from entities import Record, GithubIssue, GithubIssueComment, GithubCommit, GithubCommitFile, EntityEncoder
import json

sap_dataset_name = 'full_dataset_with_all_features.txt'
sap_new_dataset_name = 'sap_patch_dataset.csv'

def convert_sap_to_huawei_format():
    # Convert the SAP dataset into the Huawei CSV format with columns
    # ('commit_id', 'repo', 'partition', 'diff', 'label', 'PL').
    records = data_loader.load_records(sap_dataset_name)
    patches, labels, urls = [], [], []
    url_to_partition = {}

    # Build commit URLs relative to https://github.com/
    for record in records:
        url = record.repo + '/commit/' + record.commit_id
        url = url[len('https://github.com/'):]
        urls.append(url)

    # Split commits into train/test partitions (80/20)
    url_train, url_test, _, _ = train_test_split(urls, [0] * len(urls), test_size=0.20, random_state=109)
    print(len(url_train))
    print(len(url_test))

    for url in url_train:
        url_to_partition[url] = 'train'
    for url in url_test:
        url_to_partition[url] = 'test'

    # Item format: ('commit_id', 'repo', 'partition', 'diff', 'label', 'PL')
    items = []
    for record in tqdm(records):
        url = record.repo + '/commit/' + record.commit_id
        url = url[len('https://github.com/'):]
        label = record.label
        files = record.commit.files
        repo = record.repo[len('https://github.com/'):]
        has_file = False
        # One row per Java/Python file patch; fall back to an "empty" diff
        # when the commit touches no such file.
        for file in files:
            if file.file_name.endswith(('.java', '.py')) and file.patch is not None:
                has_file = True
                items.append((record.commit_id, repo, url_to_partition[url], file.patch, label, 'N/A'))
        if not has_file:
            items.append((record.commit_id, repo, url_to_partition[url], "empty", label, 'N/A'))

    df = pd.DataFrame(items, columns=['commit_id', 'repo', 'partition', 'diff', 'label', 'PL'])
    df.to_csv(sap_new_dataset_name, index=False)

def convert_to_sap_format():
    # Convert the TensorFlow vulnerability records into the SAP record format
    # and serialize them to JSON.
    # dataset_name = 'tf_vuln_dataset.csv'
    records = issue_linker.load_tensor_flow_records()
    issues = issue_linker.load_tensor_flow_issues()

    # Index issues by their GitHub issue number
    number_to_issue = {}
    for issue in issues:
        number = issue['number']
        number_to_issue[number] = issue

    # Map commit URL -> linked issue number
    url_to_number = {}
    df = pd.read_csv('tf_issue_linking.csv')
    for item in df.values.tolist():
        url_to_number[item[0]] = item[1]

    sap_data = []
    for record in tqdm(records):
        url = record[0]
        msg = record[1]
        diffs = record[2]
        repo, commit_id = url.split('/commit/')

        sap_record = Record()
        sap_record.commit_message = msg
        sap_record.repo = repo
        sap_record.commit_id = commit_id

        # Attach one GithubCommitFile per diff
        github_commit = GithubCommit()
        commit_files = []
        for diff in diffs:
            file = GithubCommitFile()
            file.patch = diff
            commit_files.append(file)
        github_commit.files = commit_files
        sap_record.commit = github_commit

        # Attach the linked GitHub issue (title, body, comments)
        issue_json = number_to_issue[url_to_number[url]]
        issue = GithubIssue()
        issue.title = issue_json['title']
        issue.body = issue_json['body']
        comments = []
        for comment_json in issue_json['comments']:
            comment = GithubIssueComment()
            comment.body = comment_json
            comments.append(comment)
        issue.comments = comments
        sap_record.add_github_ticket(issue)

        sap_data.append(sap_record)

    # Serialize the records with the project's EntityEncoder and write to disk
    entity_encoder = EntityEncoder()
    json_value = entity_encoder.encode(sap_data)
    with open('tf_dataset_sap_format.txt', 'w') as file:
        file.write(json_value)

    print("Finished writing")

if __name__ == '__main__':
    convert_to_sap_format()