forked from RTXteam/RTX-KG2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile-post-etl
166 lines (161 loc) · 7.09 KB
/
Snakefile-post-etl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Merge: runs the script configured as MERGE_SCRIPT over the per-source
# nodes/edges files listed below and produces three outputs: the merged
# nodes file, the merged edges file, and an orphan-edges file. The script's
# stdout and stderr are captured in the file configured as MERGE_LOG.
#
# The inputs are declared in the same source order in which they are passed
# on the command line (all nodes files after --kgNodesFiles, then all edges
# files after --kgEdgesFiles).
# NOTE(review): whether the merge script is sensitive to the order of the
# files on the command line cannot be determined from this file -- keep the
# order unchanged unless confirmed otherwise.
rule Merge:
    input:
        code = config["MERGE_SCRIPT"],
        umls_nodes = config["UMLS_OUTPUT_NODES_FILE"],
        umls_edges = config["UMLS_OUTPUT_EDGES_FILE"],
        ont_nodes = config["ONT_OUTPUT_NODES_FILE"],
        ont_edges = config["ONT_OUTPUT_EDGES_FILE"],
        semmeddb_nodes = config["SEMMEDDB_OUTPUT_NODES_FILE"],
        semmeddb_edges = config["SEMMEDDB_OUTPUT_EDGES_FILE"],
        uniprot_nodes = config["UNIPROTKB_OUTPUT_NODES_FILE"],
        uniprot_edges = config["UNIPROTKB_OUTPUT_EDGES_FILE"],
        ensembl_nodes = config["ENSEMBL_OUTPUT_NODES_FILE"],
        ensembl_edges = config["ENSEMBL_OUTPUT_EDGES_FILE"],
        unichem_nodes = config["UNICHEM_OUTPUT_NODES_FILE"],
        unichem_edges = config["UNICHEM_OUTPUT_EDGES_FILE"],
        chembl_nodes = config["CHEMBL_OUTPUT_NODES_FILE"],
        chembl_edges = config["CHEMBL_OUTPUT_EDGES_FILE"],
        ncbigene_nodes = config["NCBIGENE_OUTPUT_NODES_FILE"],
        ncbigene_edges = config["NCBIGENE_OUTPUT_EDGES_FILE"],
        dgidb_nodes = config["DGIDB_OUTPUT_NODES_FILE"],
        dgidb_edges = config["DGIDB_OUTPUT_EDGES_FILE"],
        repodb_nodes = config["REPODB_OUTPUT_NODES_FILE"],
        repodb_edges = config["REPODB_OUTPUT_EDGES_FILE"],
        smpdb_nodes = config["SMPDB_OUTPUT_NODES_FILE"],
        smpdb_edges = config["SMPDB_OUTPUT_EDGES_FILE"],
        drugbank_nodes = config["DRUGBANK_OUTPUT_NODES_FILE"],
        drugbank_edges = config["DRUGBANK_OUTPUT_EDGES_FILE"],
        hmdb_nodes = config["HMDB_OUTPUT_NODES_FILE"],
        hmdb_edges = config["HMDB_OUTPUT_EDGES_FILE"],
        go_annotations_nodes = config["GO_ANNOTATIONS_OUTPUT_NODES_FILE"],
        go_annotations_edges = config["GO_ANNOTATIONS_OUTPUT_EDGES_FILE"],
        reactome_nodes = config["REACTOME_OUTPUT_NODES_FILE"],
        reactome_edges = config["REACTOME_OUTPUT_EDGES_FILE"],
        mirbase_nodes = config["MIRBASE_OUTPUT_NODES_FILE"],
        mirbase_edges = config["MIRBASE_OUTPUT_EDGES_FILE"],
        jensenlab_nodes = config["JENSENLAB_OUTPUT_NODES_FILE"],
        jensenlab_edges = config["JENSENLAB_OUTPUT_EDGES_FILE"],
        drugcentral_nodes = config["DRUGCENTRAL_OUTPUT_NODES_FILE"],
        drugcentral_edges = config["DRUGCENTRAL_OUTPUT_EDGES_FILE"],
        intact_nodes = config["INTACT_OUTPUT_NODES_FILE"],
        intact_edges = config["INTACT_OUTPUT_EDGES_FILE"],
        disgenet_nodes = config["DISGENET_OUTPUT_NODES_FILE"],
        disgenet_edges = config["DISGENET_OUTPUT_EDGES_FILE"],
        kegg_nodes = config["KEGG_OUTPUT_NODES_FILE"],
        kegg_edges = config["KEGG_OUTPUT_EDGES_FILE"]
    output:
        nodes = config["MERGED_OUTPUT_NODES_FILE"],
        edges = config["MERGED_OUTPUT_EDGES_FILE"],
        orph = config["OUTPUT_FILE_ORPHAN_EDGES"]
    log:
        config["MERGE_LOG"]
    shell:
        config["PYTHON_COMMAND"] + " {input.code} " + config["TEST_ARG"] + " --kgFileOrphanEdges {output.orph}"
        " --outputNodesFile {output.nodes} "
        " --outputEdgesFile {output.edges} "
        " --kgNodesFiles "
        "{input.umls_nodes} "
        "{input.ont_nodes} "
        "{input.semmeddb_nodes} "
        "{input.uniprot_nodes} "
        "{input.ensembl_nodes} "
        "{input.unichem_nodes} "
        "{input.chembl_nodes} "
        "{input.ncbigene_nodes} "
        "{input.dgidb_nodes} "
        "{input.repodb_nodes} "
        "{input.smpdb_nodes} "
        "{input.drugbank_nodes} "
        "{input.hmdb_nodes} "
        "{input.go_annotations_nodes} "
        "{input.reactome_nodes} "
        "{input.mirbase_nodes} "
        "{input.jensenlab_nodes} "
        "{input.drugcentral_nodes} "
        "{input.intact_nodes} "
        "{input.disgenet_nodes} "
        "{input.kegg_nodes} "
        " --kgEdgesFiles "
        "{input.umls_edges} "
        "{input.ont_edges} "
        "{input.semmeddb_edges} "
        "{input.uniprot_edges} "
        "{input.ensembl_edges} "
        "{input.unichem_edges} "
        "{input.chembl_edges} "
        "{input.ncbigene_edges} "
        "{input.dgidb_edges} "
        "{input.repodb_edges} "
        "{input.smpdb_edges} "
        "{input.drugbank_edges} "
        "{input.hmdb_edges} "
        "{input.go_annotations_edges} "
        "{input.reactome_edges} "
        "{input.mirbase_edges} "
        "{input.jensenlab_edges} "
        "{input.drugcentral_edges} "
        "{input.intact_edges} "
        "{input.disgenet_edges} "
        "{input.kegg_edges} > {log} 2>&1"
# Stats: runs the script configured as REPORT_SCRIPT on the merged nodes and
# edges files and writes the file configured as REPORT_FILE. The script's
# stdout and stderr are captured in the file configured as REPORT_LOG.
rule Stats:
    input:
        code = config["REPORT_SCRIPT"],
        nodes = config["MERGED_OUTPUT_NODES_FILE"],
        edges = config["MERGED_OUTPUT_EDGES_FILE"]
    output:
        config["REPORT_FILE"]
    log:
        config["REPORT_LOG"]
    shell:
        config["PYTHON_COMMAND"] + " {input.code} {input.nodes} {input.edges} {output}"
        " > {log} 2>&1"
# Simplify: runs the script configured as SIMPLIFY_SCRIPT under `bash -x`.
# Its positional arguments are the merged nodes/edges files, the simplified
# nodes/edges output files, the configured VERSION_FILE, and the configured
# TEST_FLAG, in that order. Stdout and stderr (including the `bash -x`
# trace) are captured in the file configured as SIMPLIFY_LOG.
rule Simplify:
    input:
        code = config["SIMPLIFY_SCRIPT"],
        nodes = config["MERGED_OUTPUT_NODES_FILE"],
        edges = config["MERGED_OUTPUT_EDGES_FILE"]
    output:
        nodes = config["SIMPLIFIED_OUTPUT_NODES_FILE"],
        edges = config["SIMPLIFIED_OUTPUT_EDGES_FILE"]
    log:
        config["SIMPLIFY_LOG"]
    shell:
        "bash -x {input.code} "
        "{input.nodes} {input.edges} "
        "{output.nodes} {output.edges} " + config["VERSION_FILE"] + " " + config["TEST_FLAG"] + " > {log} 2>&1"
# Slim: runs the script configured as SLIM_SCRIPT on the simplified nodes
# and edges files and writes the slim nodes and edges files. The configured
# TEST_ARG is passed ahead of the file arguments. Stdout and stderr are
# captured in the file configured as SLIM_LOG.
rule Slim:
    input:
        code = config["SLIM_SCRIPT"],
        nodes = config["SIMPLIFIED_OUTPUT_NODES_FILE"],
        edges = config["SIMPLIFIED_OUTPUT_EDGES_FILE"]
    output:
        nodes = config["SLIM_OUTPUT_NODES_FILE"],
        edges = config["SLIM_OUTPUT_EDGES_FILE"]
    log:
        config["SLIM_LOG"]
    shell:
        config["PYTHON_COMMAND"] + " {input.code} " + config["TEST_ARG"] + " {input.nodes} {input.edges}"
        " {output.nodes} {output.edges}"
        " > {log} 2>&1"
# Simplify_Stats: runs the same REPORT_SCRIPT used by the Stats rule, but on
# the simplified nodes and edges files and with --useSimplifiedPredicates.
# It writes the file configured as SIMPLIFIED_REPORT_FILE; stdout and stderr
# are captured in the file configured as SIMPLIFIED_REPORT_LOG.
rule Simplify_Stats:
    input:
        code = config["REPORT_SCRIPT"],
        nodes = config["SIMPLIFIED_OUTPUT_NODES_FILE"],
        edges = config["SIMPLIFIED_OUTPUT_EDGES_FILE"]
    output:
        config["SIMPLIFIED_REPORT_FILE"]
    log:
        config["SIMPLIFIED_REPORT_LOG"]
    shell:
        config["PYTHON_COMMAND"] + " {input.code} --useSimplifiedPredicates"
        " {input.nodes} {input.edges} {output}"
        " > {log} 2>&1"
# TSV: recreates the directory configured as KG2_TSV_DIR from scratch, runs
# the script configured as TSV_SCRIPT with the simplified nodes/edges files,
# the mapping file, and that directory as arguments, then touches a
# placeholder file that serves as this rule's declared output.
# Only the script invocation is redirected to the file configured as
# TSV_LOG; the rm/mkdir/touch commands are not.
rule TSV:
    input:
        code = config["TSV_SCRIPT"],
        nodes = config["SIMPLIFIED_OUTPUT_NODES_FILE"],
        edges = config["SIMPLIFIED_OUTPUT_EDGES_FILE"],
        mapping_file = config["INFORES_MAPPING_FILE"]
    output:
        placeholder = config["TSV_PLACEHOLDER"]
    log:
        config["TSV_LOG"]
    run:
        tsv_dir = config["KG2_TSV_DIR"]
        # Start from an empty directory so no files from an earlier run remain.
        shell("rm -rf " + tsv_dir)
        shell("mkdir -p " + tsv_dir)
        shell(config["PYTHON_COMMAND"] + " {input.code} {input.nodes} {input.edges} {input.mapping_file} " + tsv_dir + " > {log} 2>&1")
        shell("touch {output.placeholder}")