-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkb_iMAG-viz-workflow.py
executable file
·439 lines (398 loc) · 27.5 KB
/
kb_iMAG-viz-workflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
#!/usr/bin/env python3
__author__ = "Sean P. Jungbluth"
__copyright__ = "Copyright 2019"
__license__ = "GPL 3.0"
__maintainer__ = "Sean P. Jungbluth"
__email__ = "jungbluth.sean@gmail.com"
import sys
import time
import pandas
import numpy
import os
import re
import argparse
from argparse import RawTextHelpFormatter
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt
version = 0.1
# example quick command: kb_iMAG-viz-workflow.py -i /Applications/ResearchSoftware/kb_iMAG-viz/test/query-genomes/TARA-MAGs_Delmont-Archaea-only-2017.RAST.txt
# example long command: /Applications/ResearchSoftware/kb_iMAG-viz/kb_iMAG-viz-workflow.py -i /Applications/ResearchSoftware/kb_iMAG-viz/test/query-genomes/TARA-MAGs_Delmont-Archaea-only-2017.RAST.txt --taxa_level Phylum --save_master_table Yes --path_to_kb_imagviz /Applications/ResearchSoftware/kb_iMAG-viz --generate_count_tables n --dimensional_reduction_method pca --plotting_method ggplot
# todo
# convert matrix to presence/absence
# read_and_parse_rast_annotations only operating on query right now - other "clean" files generated elsewhere at the moment (need to fix obviously)
# check to see if python3 installed and being used; should be automatic because of the shebang, but useful if someone calls explicitly with python 2
def check_for_py3():
    """Abort with an error message if the running interpreter is not Python 3.

    Writes the error to stderr and exits with status -1 on a non-3 major
    version; if the version cannot be read at all, warns and continues.
    """
    print("\n"+"Running check_for_py3")
    time_start = time.time()
    try:
        major_version = sys.version_info.major
        if major_version != 3:
            sys.stderr.write('\nError: python version 3 is required - you have python version %d.\n\n' % major_version)
            sys.exit(-1)
    except Exception:
        # version_info unreadable: warn but do not block the run
        sys.stderr.write("Unable to detect python version - assuming python 3 installed.\n\n")
    print('check_for_py3 done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# From Miriam: function to return object ID based on object name
def get_obj_id(obj_name):
    """Return the KBase workspace object ID for *obj_name*, or False on failure.

    Example: get_obj_id("MAG-QC_Archaea.SAGs.Prokka")

    The target workspace name is read from the KB_WORKSPACE_ID environment
    variable, so this only works inside a KBase narrative session.
    """
    from biokbase.workspace.client import Workspace
    import os
    ws = Workspace('https://kbase.us/services/ws')
    ws_name = os.environ['KB_WORKSPACE_ID']
    try:
        obj_id = ws.get_object_info3({'objects': [{'workspace': ws_name, 'name': obj_name}]})['paths'][0][0]
        return obj_id
    except Exception:
        # Bug fix: the original bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; catch only real errors from the lookup.
        return False
# From Miriam: function to return object data based on object ID
def get_object_data(object_id):
    """Fetch a single object's data from the KBase workspace service.

    Example1: get_object_data(get_obj_id("iMAG-viz_Archaea.SAGs.RAST"))
    Example2: get_object_data(u'43402/2132/1')
    """
    from biokbase.workspace.client import Workspace
    workspace = Workspace('https://kbase.us/services/ws')
    fetched = workspace.get_objects([{"ref": object_id}])
    return fetched[0]
# Function to extract genome annotation information from a KBase GenomeSets Object
# todo: double check to see if indexed properly
def extract_annotations_from_genomeset(genomeset_name, export_filename):
    """Append per-genome RAST function lists from a KBase GenomeSet to a file.

    For every genome reference in the GenomeSet, the CDS function lists are
    collected and written as "<object name>\\t<stringified list>" lines.

    Bug fixes vs. the original: ``biokbase`` was never imported as a module
    (only ``AppManager`` via from-import), so ``biokbase.narrative.clients``
    raised NameError; and ``dict.values()[3]`` is Python-2-only — in Python 3
    a values view must be materialized with ``list()`` first.
    NOTE(review): indexing ``values()`` by position depends on the object's
    field layout — confirm index 3 really is the genome-ref list (see todo).
    """
    print("\n"+"Running extract_annotations_from_genomeset")
    time_start = time.time()
    import biokbase.narrative.clients
    ws = biokbase.narrative.clients.get("workspace")
    for genome in list(get_object_data(get_obj_id(genomeset_name)).values())[3]:
        functionList = ws.get_objects([{'ref': genome}])[0]['data']['cdss']
        df = [functionList[function]['functions'] for function in range(len(functionList))]
        with open(export_filename, 'a') as f:
            # this genome has something weird going on in the annotation; result is it
            # breaks the JSON file structure, suspected ({,},') symbols, need to troubleshoot
            if list(get_object_data(genome).values())[0][1] != "2528311097___MAG-QC_Archaea__Isolates.RAST":
                f.write(list(get_object_data(genome).values())[0][1] + '\t' + str(df) + '\n')
        # the ``with`` block closes the file; the original's extra f.close() was redundant
    print('extract_annotations_from_genomeset done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# wont be needed when annotation data pulled directly from Genome Set Objects
# Function to clean up annotation data and parse as a tsv-like file
def read_and_parse_rast_annotations(query_annotation_table, output_query_annotation):
    """Flatten a raw RAST annotation dump into a two-column TSV.

    Input lines (after a header row) look like
    "<genomeID>___<suffix>\\t[[u'func1', u'func2'], [u'func3']]".  The
    stringified python-2 list in column 2 is tokenized by rewriting its
    delimiters to '|'; each informative function string is written to
    *output_query_annotation* as "<genomeID>\\t<function>".

    Entries that are empty or mention hypothetical/unknown/conserved/predicted
    proteins are skipped.  Fixes vs. the original: the output file was opened
    twice ("w+" then "a") leaking the first handle; the regex table was
    rebuilt for every input line; repeated ``line.split(...)`` calls are
    hoisted; and empty tokens no longer raise IndexError on ``[0]``/``[-1]``.
    """
    print("\n"+"Running read_and_parse_rast_annotations")
    time_start = time.time()
    # Delimiters/noise in the stringified annotation list -> '|' separators.
    rep = {"], [u": "|", "', u'": "|", '"': "", "[[u": ""}
    rep = dict((re.escape(k), v) for k, v in rep.items())
    pattern = re.compile("|".join(rep.keys()))
    # Substring filters for uninformative annotations.
    skip_terms = ("hypothetical protein", "unknown", "conserved protein",
                  "Conserved protein", "predicted protein")
    # "w" truncates any previous output, replacing the remove + w+ + a dance.
    with open(query_annotation_table) as f1, open(output_query_annotation, "w") as f2:
        for line in f1.readlines()[1:]:  # skip the header row
            line = pattern.sub(lambda m: rep[re.escape(m.group(0))], line)
            genome_id = line.split('\t')[0].split('___')[0]
            entries = line.split('\t')[1].split("|")
            for entry in entries[:-1]:  # trailing token is list-closing junk
                if entry == "''":
                    continue  # write only non-empty entries
                if any(term in entry for term in skip_terms):
                    continue
                # Strip at most one stray quote from either end (covers all
                # four quote-position cases of the original if/elif ladder).
                if entry.startswith("'"):
                    entry = entry[1:]
                if entry.endswith("'"):
                    entry = entry[:-1]
                f2.write(genome_id + '\t' + entry + '\n')
    print('read_and_parse_rast_annotations done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to combine multiple external required tables into a single table
def combine_external_checkm_and_taxonomy_info(query_genome_data, reference_IMG_genome_data, reference_Other_genome_data):
    """Concatenate the query and reference genome-metadata tables into one file.

    The three inputs are appended, in argument order, to the path held in the
    module-level ``output_combined_genome_data`` variable (set by the main
    script).  A genome count for the combined table is printed afterwards.
    """
    print("\n" + "Running combine_external_checkm_and_taxonomy_info")
    time_start = time.time()
    source_paths = [query_genome_data, reference_IMG_genome_data, reference_Other_genome_data]
    with open(output_combined_genome_data, 'w') as combined:
        for source_path in source_paths:
            with open(source_path) as source:
                combined.writelines(source)
    print("\nTotal number of genomes in metadata table: " + str(len(open(output_combined_genome_data).readlines())))
    print('combine_external_checkm_and_taxonomy_info done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to merge all tables together (query and reference) into a large parseable master table used for key opertations
def import_and_merge_tables(save_master_table):
    """Build the global master table ``merge`` from metadata + annotation TSVs.

    Reads the seven module-level file paths (set by the main script), stacks
    the genome-metadata tables and the annotation tables, and outer-joins them
    on the genome ID column.  Side effects: sets globals ``f2`` (the IMG
    reference table) and ``merge`` (the named master table).  When
    *save_master_table* == "Yes", the master table is also written to CSV.
    """
    print("\n" + "Running import_and_merge_tables")
    time_start = time.time()

    def read_tsv(path):
        # All inputs are headerless, tab-separated, and kept as strings.
        return pandas.read_csv(path, header=None, sep="\t", index_col=False, dtype=str)

    global f2
    f2 = read_tsv(reference_IMG_genome_data)
    metadata = pandas.concat([read_tsv(query_genome_data), f2, read_tsv(reference_Other_genome_data)], axis=0)
    annotation_paths = (query_isolate_annotation_data, query_MAG_annotation_data,
                        query_SAG_annotation_data, query_annotation_data)
    annotations = pandas.concat([read_tsv(p) for p in annotation_paths], axis=0)
    global merge
    # Outer join on column 0 (the genome ID) keeps genomes missing from
    # either side.
    merge = metadata.merge(annotations, how='outer', left_on=0, right_on=0)
    merge.columns = ['genomeID', 'genomeSet', 'V1', 'V2', 'V3', 'V4', 'GenomeType', 'V6', 'Completeness', 'Contamination', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'RAST_Annotation']
    if save_master_table == "Yes":
        merge.to_csv("iMAG-viz-output_ALL_genomeQC-and-annotation-data.csv")
    print('import_and_merge_tables done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
#Function to assist with logical selection of categorical data
def extract_lineages_for_selected_level(taxa_level, query_genome_data):
    """Return lineages at *taxa_level* plus the names of the two ranks below.

    Reads the global master table ``merge`` (built by import_and_merge_tables)
    and returns a tuple ``(taxa_level_list, taxa_level_down1,
    taxa_level_down2)`` where the first element is the array of unique values
    in the chosen rank column.  *query_genome_data* is accepted for interface
    compatibility but is currently unused.

    Fixes vs. the original: the repetitive if/elif ladder is replaced by a
    rank table, and the unreachable ``print`` after ``return`` is removed
    (it could never execute, so behavior is unchanged).
    """
    print("\n"+"Running extract_lineages_for_selected_level")
    time_start = time.time()
    # Each rank maps to the next two ranks down; anything unrecognized falls
    # back to Genus, matching the original else branch.
    rank_ladder = {
        'Domain': ('Phylum', 'Class'),
        'Phylum': ('Class', 'Order'),
        'Class': ('Order', 'Family'),
        'Order': ('Family', 'Genus'),
        'Family': ('Genus', 'Species'),
        'Genus': ('Species', ''),
    }
    column = taxa_level if taxa_level in rank_ladder else 'Genus'
    taxa_level_down1, taxa_level_down2 = rank_ladder[column]
    taxa_level_list = merge[column].unique()
    return taxa_level_list, taxa_level_down1, taxa_level_down2
# merge example
# genomeID genomeSet V1 V2 V3 ... Order Family Genus Species RAST_Annotation
# 1861359 GCA_002509575.1_ASM250957v1_genomic PRJNA348753-8000Genomes y y y ... o__Methanomicrobiales f__Methanocullaceae g__Methanoculleus s__GCA_002508705.1 Acyl-CoA synthetase (NDP forming)
# 1861360 GCA_002509575.1_ASM250957v1_genomic PRJNA348753-8000Genomes y y y ... o__Methanomicrobiales f__Methanocullaceae g__Methanoculleus s__GCA_002508705.1 Iron(III) dicitrate-binding protein
# 1861361 GCA_002509575.1_ASM250957v1_genomic PRJNA348753-8000Genomes y y y ... o__Methanomicrobiales f__Methanocullaceae g__Methanoculleus s__GCA_002508705.1 Iron(III) dicitrate transport system permease ...
# 1861362 GCA_002509575.1_ASM250957v1_genomic PRJNA348753-8000Genomes y y y ... o__Methanomicrobiales f__Methanocullaceae g__Methanoculleus s__GCA_002508705.1 Iron(III) dicitrate transport ATP-binding protein
# 1861363 GCA_002509575.1_ASM250957v1_genomic PRJNA348753-8000Genomes y y y ... o__Methanomicrobiales f__Methanocullaceae g__Methanoculleus s__GCA_002508705.1 Diadenosine 5'5'''-P1,P4-tetraphosphate pyroph...
# Function to subset data by lineage (aka: taxonomic level) (e.g. Phylum, Order, Family, Genus, etc.)
def subset_data_by_lineage(lineage, taxa_level):
    """Restrict the global master table to one lineage at one taxonomic rank.

    Side effects: sets global ``merge_reduced`` (all rows of ``merge`` whose
    *taxa_level* column equals *lineage*) and ``merge_reduced_trim`` (the same
    rows with the per-gene RAST_Annotation column dropped and duplicates
    removed, i.e. one metadata row per genome).
    """
    print("\n\t"+"Running subset_data_by_lineage")
    time_start = time.time()
    global merge_reduced
    global merge_reduced_trim
    in_lineage = merge[taxa_level] == lineage
    merge_reduced = merge.loc[in_lineage]
    # Dropping the annotation column collapses each genome to one row.
    merge_reduced_trim = merge_reduced.drop(columns="RAST_Annotation").drop_duplicates()
    print('\tsubset_data_by_lineage done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to count annotation data for the subset lineage
def count_annotation_data_for_level(merge_reduced, output_annotation_count, lineage):
    """Write a per-lineage table: one row per annotation, one column per genome.

    Output path is ``<output_annotation_count>_<lineage>.tsv``.  Each cell is
    the number of distinct genome IDs carrying that annotation whose ID
    contains the column genome's ID as a substring (0/1 for typical unique
    IDs).  NOTE(review): the value comes from ``str.contains`` over the
    unique-ID index, i.e. substring/presence matching, not an occurrence
    count — confirm this is intended before changing it.

    Fixes vs. the original: the output file was opened twice ("w+" then "a")
    and never closed — now a single ``with open(..., "w")`` (which also
    truncates, replacing the Path/remove dance); and the per-annotation
    ``value_counts`` table, invariant across genomes, is computed once per
    annotation instead of once per (annotation, genome) pair.
    """
    print("\n\t"+"Running count_annotation_data_for_level")
    time_start = time.time()
    out_path = output_annotation_count + "_" + lineage + ".tsv"
    genomelist = merge_reduced.genomeID.unique()
    annotationlist = merge_reduced.RAST_Annotation.unique()
    with open(out_path, "w") as f1:
        # Header line: "Annotation" then one column per genome.
        f1.write("Annotation" + "\t")
        f1.write("\t".join(str(g) for g in genomelist) + "\n")
        for annotation in annotationlist:  # can take hours on large lineages
            f1.write(str(annotation) + "\t")
            temp_merge = merge_reduced.loc[merge_reduced['RAST_Annotation'] == annotation]
            # Invariant per annotation: compute once, not once per genome.
            dat = temp_merge['genomeID'].value_counts().rename_axis('genomeID').reset_index(name='counts')
            ids = dat['genomeID'].astype(str)
            f1.write("\t".join(str(ids.str.contains(str(g)).sum()) for g in genomelist) + "\n")
    print('\tcount_annotation_data_for_level done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to combine annotation count table with genome metadata
def import_count_and_combine_with_genome_metadata(lineage):
    """Load the annotation-count table for *lineage* into global ``dat``.

    Reads ``<output_annotation_count>_<lineage>.tsv`` (path prefix from the
    module global), drops the header row and the annotation-name column, and
    transposes so rows correspond to genomes.  Also sets global
    ``genome_number`` from the global ``merge_reduced_trim``.
    ``dat_combined`` is declared global for the downstream reduction steps.
    """
    global dat
    global dat_combined
    global genome_number
    count_path = output_annotation_count + "_" + lineage + ".tsv"
    with open(count_path) as f:
        ncols = len(f.readline().split('\t'))
    # Column 0 holds annotation names; keep only the numeric genome columns.
    dat = numpy.loadtxt(count_path, delimiter="\t", skiprows=1, usecols=range(1, ncols))
    dat = numpy.transpose(dat)  # rows = genomes, cols = annotations
    genome_number = len(merge_reduced_trim)
    print("\tNumber of genomes: " + str(len(merge_reduced_trim)))
# Function to run TSNE (in dev, not really working to produce a meaningful visual)
def run_tsne_dimensional_reduction():
    """Embed the global count matrix ``dat`` into 2-D with t-SNE (in dev).

    Side effects: sets global ``dat1`` (the raw embedding) and global
    ``dat1_combined`` (embedding joined column-wise with the per-genome
    metadata in ``merge_reduced_trim``, with named columns).

    Bug fix: ``dat1_combined`` was a local variable here while the PCA twin
    (run_principal_component_analysis) sets it globally — so on the t-SNE
    path the plotting functions read a stale or undefined global.  It is now
    declared global to match.
    """
    print("\n\t"+"Running run_tsne_dimensional_reduction")
    time_start = time.time()
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, perplexity=40.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=300, n_iter_without_progress=300, min_grad_norm=1e-07, metric="euclidean", init="random", verbose=1, random_state=None, method="barnes_hut", angle=0.5)
    tsne_results = tsne.fit_transform(dat)  # run tsne, fit into an embedded space
    global dat1
    global dat1_combined
    dat1 = pandas.DataFrame(data=tsne_results, columns=['tsne-2d-one', 'tsne-2d-two'])
    dat1_combined = pandas.DataFrame(numpy.concatenate((dat1, merge_reduced_trim),axis = 1))  # combine count table with genome metadata
    dat1_combined.columns = ['tsne-2d-one', 'tsne-2d-two', 'genomeID', 'genomeSet', 'V1', 'V2', 'V3', 'V4', 'GenomeType', 'V6', 'Completeness', 'Contamination', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
    print('\trun_tsne_dimensional_reduction done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to run PCA (working fairly well)
def run_principal_component_analysis():
    """Project the global count matrix ``dat`` onto its first three PCs.

    Side effect: sets global ``dat1_combined`` — the PC coordinates joined
    column-wise with the per-genome metadata in ``merge_reduced_trim`` — and
    writes it to "iMAG-viz-output_PCA-input-data_<lineage>.tsv" for the R
    plotting step.  NOTE(review): ``lineage`` is read from the module globals
    set by the main loop, not passed in — confirm before reusing elsewhere.
    """
    print("\n\t"+"Running run_principal_component_analysis")
    time_start = time.time()
    from sklearn.decomposition import PCA
    global dat1_combined
    projector = PCA(n_components=3)
    components = projector.fit_transform(dat)
    pc_frame = pandas.DataFrame(data=components, columns=['PC1', 'PC2', 'PC3'])
    # Join PC coordinates with the genome metadata, then name every column.
    dat1_combined = pandas.DataFrame(numpy.concatenate((pc_frame, merge_reduced_trim), axis=1))
    dat1_combined.columns = ['PC1', 'PC2', 'PC3', 'genomeID', 'genomeSet', 'V1', 'V2', 'V3', 'V4', 'GenomeType', 'V6', 'Completeness', 'Contamination', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
    dat1_combined.to_csv("iMAG-viz-output_PCA-input-data_"+str(lineage)+".tsv", sep='\t')
    print('\trun_principal_component_analysis done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to plot with Seaborn (not used at the moment)
def plot_dimensional_reduction_results_seaborn(lineage, mode):
    """Scatter-plot the global ``dat1_combined`` embedding with seaborn.

    *mode* selects the column pair: "tsne" plots tsne-2d-one/-two, anything
    else plots PC1/PC2.  Points are hue-coded by the Class column; the figure
    is saved as "iMAG-viz-output_PCA_<lineage>.png".
    NOTE(review): ``markers=dat1_combined["Class"]`` passes a Series where
    seaborn expects a marker spec — confirm this renders as intended.
    """
    print("\n\t" + "Running plot_dimensional_reduction_results_seaborn")
    time_start = time.time()
    import seaborn
    # list_genomeid = merge_reduced["genomeID"]
    if mode == "tsne":
        # t-SNE branch: plot the 2-D embedding columns, one hue per Class.
        sns_plot = seaborn.scatterplot(x="tsne-2d-one", y="tsne-2d-two", hue="Class", style=None, size=None, data=dat1_combined, palette=seaborn.color_palette("hls", len(dat1_combined["Class"].unique())), hue_order=None, hue_norm=None, sizes=None, size_order=None, size_norm=None, markers=dat1_combined["Class"], style_order=None, x_bins=None, y_bins=None, units=None, estimator=None, ci=95, n_boot=1000, alpha=0.3, x_jitter=None, y_jitter=None, legend='full', ax=None)
    else:
        # print(type(merge_reduced["Class"]))
        # PCA branch: ``df`` is built but unused by the live call below.
        df = pandas.DataFrame(merge_reduced_trim)
        #sns_plot = seaborn.scatterplot(x="PC1", y="PC2", hue=df["Class"], style=None, size=None, data=dat1, palette=seaborn.color_palette("hls", len(df["Class"].unique())), hue_order=None, hue_norm=None, sizes=None, size_order=None, size_norm=None, markers=True, style_order=None, x_bins=None, y_bins=None, units=None, estimator=None, ci=95, n_boot=1000, alpha=0.3, x_jitter=None, y_jitter=None, legend='full', ax=None)
        sns_plot = seaborn.scatterplot(x="PC1", y="PC2", hue="Class", style=None, size=None, data=dat1_combined, palette=seaborn.color_palette("hls", len(dat1_combined["Class"].unique())), hue_order=None, hue_norm=None, sizes=None, size_order=None, size_norm=None, markers=dat1_combined["Class"], style_order=None, x_bins=None, y_bins=None, units=None, estimator=None, ci=95, n_boot=1000, alpha=0.3, x_jitter=None, y_jitter=None, legend='full', ax=None)
    fig = sns_plot.get_figure()
    fig.savefig("iMAG-viz-output_PCA_"+str(lineage)+".png")
    print('\tplot_dimensional_reduction_results_seaborn done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Function to plot with ggplot by calling R externally
def plot_dimensional_reduction_results_rggplot(lineage, mode):
    """Render the reduction scatterplot by shelling out to an external R script.

    Stages "iMAG-viz-output_PCA-input-data_<lineage>.tsv" under the fixed name
    the R script expects ("R-input-data.tsv"), runs the script, renames its
    "R-output-plot.pdf" to "iMAG-viz-output_PCA_<lineage>.pdf", and cleans up.
    Exits the program if R returns a non-zero status.  *mode* is currently
    unused.  NOTE(review): the Rscript path is hard-coded; it should probably
    come from --path_to_kb_imagviz.
    """
    print("\n\t"+"Running plot_dimensional_reduction_results_rggplot")
    import shutil
    import os
    import subprocess
    shutil.copy("iMAG-viz-output_PCA-input-data_"+str(lineage)+".tsv", "R-input-data.tsv")
    time_start = time.time()
    command = "Rscript /Applications/ResearchSoftware/kb_iMAG-viz/lib/make_ggplot_scatterplot.R"
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Bug fix: communicate() alone drains both pipes and waits for exit; the
    # original called wait() first, which can deadlock once the child fills a
    # pipe buffer (see subprocess docs).
    out, err = process.communicate()
    if process.returncode != 0:
        sys.exit("*Error generating figure with ggplot2")
    shutil.move("R-output-plot.pdf", "iMAG-viz-output_PCA_"+str(lineage)+".pdf")
    os.remove("R-input-data.tsv")
    print('\tplot_dimensional_reduction_results_rggplot done! Time elapsed: ' + '{}'.format(time.time() - time_start)[:7] + ' seconds\n')
# Declare variables
def parse_args():
    """Build the command-line interface and return the parsed arguments.

    Every option has a default, so the script can run with no arguments at
    all; RawTextHelpFormatter preserves the multi-line workflow description.
    """
    argument_parser = argparse.ArgumentParser(
        prog='kb_iMAG-viz-workflow',
        usage='%(prog)s.py -i [query_annotation_table] --version',
        description="""
kb_iMAG-viz is software to evaluate microbial genome quality against reference genomes.
----------------------------------------------------------------------------------------------------------------
kb_iMAG-viz-workflow.py performs the following steps.
The workflow goes as follows:
STEP 1. Annotations data is read from a source (e.g. RAST), and converted into a tab-delimited text
STEP 1a. Raw data is cleaned to remove extraneous characters
STEP 2. Data are subsetted based on the taxonomic level of interest (e.g. domain, phylum, class, order, ...)
STEP 3. Annotation data is converted to presence/absence data (for that taxonomic level)
STEP 4: Dimensional reduction on annotation count data to identify major trends and outliers
STEP 5: Plotting dimensional reduction results and associated genome quality information.""",
        formatter_class=RawTextHelpFormatter)
    # Input table plus the knobs for each pipeline stage; all optional.
    argument_parser.add_argument("-i", dest="query_annotation_table", help="""Indicate a tab-delimited table where genome ID is the first column and subset columns correspond to gene names (default: /Applications/ResearchSoftware/kb_iMAG-viz/test/query-genomes/TARA-MAGs_Delmont-Archaea-only-2017.RAST.txt)""", default="/Applications/ResearchSoftware/kb_iMAG-viz/test/query-genomes/TARA-MAGs_Delmont-Archaea-only-2017.RAST.txt")
    argument_parser.add_argument("--taxa_level", dest="taxa_level", help="""Indicate the taxonomic lineage. (default: Phylum)""", default="Phylum")
    argument_parser.add_argument("--save_master_table", dest="save_master_table", help="""Save the master merged table; warning it can be large as it contains an unflattened version of all the input data. (default: Yes)""", default="Yes")
    argument_parser.add_argument("--generate_count_tables", dest="generate_count_tables", help="""Regenerate annotation count tables (time-consuming step). (default: Yes)""", default="Yes")
    argument_parser.add_argument("--dimensional_reduction_method", dest="dimensional_reduction_method", help="""Pick a dimensional reduction method. (default: pca)""", default="pca")
    argument_parser.add_argument("--plotting_method", dest="plotting_method", help="""Pick a ploting method. (default: ggplot)""", default="ggplot")
    argument_parser.add_argument("--path_to_kb_imagviz", dest="path_to_kb_imagviz", help="""Indicate the path to kb_imagviz software""", default="/Applications/ResearchSoftware/kb_iMAG-viz")
    argument_parser.add_argument('--version', action='version', version='%(prog)s v0.1')
    return argument_parser.parse_args()
def banner():
    """Print the kb_iMAG-viz ASCII-art startup banner.

    Interpolates the module-level ``version`` constant into the art via
    old-style %-formatting.
    """
    print("""
 __________________________________________________________
| _ _ _ __ __ _ ____ _ |
| | | _| |__ (_) \/ | / \ / ___| __ _(_)____ |
| | |/ / '_ \ | | |\/| | / _ \| | _ ____\ \ / / |_ / |
| | <| |_) | | | | | |/ ___ \ |_| |_____\ V /| |/ / |
| |_|\_\_.__/___|_|_| |_/_/ \_\____| \_/ |_/___| |
| |_____| |
| version %s |
|________________________________________________________|\n\n""" % (version))
def test_text():
    """Debug helper: dump shape and column info for the global master table.

    Prints the last five rows of ``merge``, its shape, its row count, and
    each column name on its own line.
    """
    print(merge.tail(n=5))
    print(merge.shape)
    print(merge.shape[0])
    for column_name in merge.columns:
        print(column_name)
# run main kb_iMAG-viz-workflow.py script
if __name__ == "__main__":
    banner()
    args = parse_args()
    # Both options have defaults, so these are only None if cleared
    # explicitly.  Bug fixes vs. the original: ``len(sys.argv) is None`` was
    # always False (len returns an int), and ``parser.print_help()`` referenced
    # a name local to parse_args(), which would have raised NameError.
    if args.query_annotation_table is None or args.path_to_kb_imagviz is None:
        sys.exit("Missing required input; run with --help for usage.")
    output_query_annotation = "iMAG-viz-output_query-flattened-annotation-data.tsv"
    output_annotation_count = "iMAG-viz-output_annotation-count-data"
    # files needed for function combine_external_checkm_and_taxonomy_info AND import_and_merge_tables
    query_genome_data = os.path.realpath(args.path_to_kb_imagviz + "/test/query-genomes/Delmont_genomeQC-data.tsv")
    reference_IMG_genome_data = os.path.realpath(args.path_to_kb_imagviz + "/test/reference-genomes/IMG_genomeQC-data.tsv")
    reference_Other_genome_data = os.path.realpath(args.path_to_kb_imagviz + "/test/reference-genomes/Other_genomeQC-data.tsv")
    # this output file contains all of the genome data used in the analysis
    output_combined_genome_data = os.path.realpath("iMAG-viz-output_ALL_genomeQC-data.tsv")
    # files needed for function import_and_merge_tables
    query_annotation_data = os.path.realpath(args.path_to_kb_imagviz + "/test/query-genomes/TARA-MAGs_Delmont-Archaea-only-2017_clean.RAST.txt")
    query_isolate_annotation_data = os.path.realpath(args.path_to_kb_imagviz + "/test/reference-genomes/Archaea.Isolates_clean.RAST.txt")
    query_MAG_annotation_data = os.path.realpath(args.path_to_kb_imagviz + "/test/reference-genomes/Archaea.MAGs_clean.RAST.txt")
    query_SAG_annotation_data = os.path.realpath(args.path_to_kb_imagviz + "/test/reference-genomes/Archaea.SAGs_clean.RAST.txt")
    check_for_py3()
    read_and_parse_rast_annotations(str(args.query_annotation_table), output_query_annotation)
    combine_external_checkm_and_taxonomy_info(query_genome_data, reference_IMG_genome_data, reference_Other_genome_data)
    import_and_merge_tables(args.save_master_table)
    # get list of lineages to iterate over
    taxalist = extract_lineages_for_selected_level(args.taxa_level, query_genome_data)[0]
    # ``lineage`` must stay a module global: run_principal_component_analysis
    # reads it for its output filename.
    for lineage in taxalist:
        if str(lineage) == "nan":  # some groups don't have phyla? need to work this out.
            continue
        print("Starting with lineage: " + str(lineage))
        subset_data_by_lineage(lineage, args.taxa_level)
        if args.generate_count_tables == "Yes":
            count_annotation_data_for_level(merge_reduced, output_annotation_count, lineage)
        else:
            print("Skipping the count table generation (time-consuming step; assuming this has been done already or next steps will fail).\n")
        import_count_and_combine_with_genome_metadata(lineage)
        if genome_number > 3:
            if args.dimensional_reduction_method == "pca":
                run_principal_component_analysis()
            else:
                run_tsne_dimensional_reduction()
            # Bug fix: the plot mode now follows the reduction actually run
            # instead of being hard-coded to "pca".
            if args.plotting_method == "ggplot":
                plot_dimensional_reduction_results_rggplot(lineage, mode=args.dimensional_reduction_method)
            else:
                plot_dimensional_reduction_results_seaborn(lineage, mode=args.dimensional_reduction_method)
        else:
            print("\nWarning: not enough data (genomes) to run a meaningful dimensional reduction. Select a different group.")
        print("Finished with lineage: " + str(lineage) + '\n')
    print("Success! Done running kb_iMAG-viz")