Skip to content

Commit

Permalink
bug fix virulence report, contamination check
Browse files Browse the repository at this point in the history
  • Loading branch information
tpillone committed Sep 17, 2020
1 parent 6a4343a commit 1587eb2
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 127 deletions.
2 changes: 2 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ snp_caller: "gatk_gvcfs"

# mapping: either bwa or bwa_stringent
mapping: "bwa"

16S_database: "/data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp_vsearch_format.fasta"
2 changes: 2 additions & 0 deletions rules/annotation/resistance/mykrobe.rules
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ rule search_resistance_paired_reads_with_mykrobe:
"""
if [ "{params.species}" = "staph" ]
then
echo Staphylococcus aureus
mykrobe predict --format json "{wildcards.sample}" "{params.species}" --seq {input} --min_variant_conf {params.confidence} > {output[0]} 2> {log}
else
echo Mycobacterium tuberculosis
if [ -z "{params.panel}" ]
then
mykrobe predict --format json "{wildcards.sample}" "{params.species}" --seq {input} --min_variant_conf {params.confidence} > {output[0]} 2> {log}
Expand Down
18 changes: 17 additions & 1 deletion rules/benchmark/resistance/compare_results.rules
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,20 @@ if 'sra_samples' in config:
output:
discrepancies = "{path}_phenotype.tsv",
script:
"scripts/add_phenotype.py"
"scripts/add_phenotype.py"


rule statistics_wrong_results:
singularity:
singularity_envs["python_r"]
input:
rgi = "report/resistance/rgi_variant_phenotype.tsv",
mykrobe = "report/resistance/mykrobe_variant_phenotype.tsv",
tbprofiler = "report/resistance/tb-profiler/tbprofiler_variant_phenotype.tsv",
walker = "report/resistance/bwa/walker_resistant_annotated/walker_resistant_annotated_variant_phenotype.tsv"
params:
antibio = lambda wildcards: wildcards.antibiotic
output:
venn = "report/resistance/frequency_FP_{antibiotic}.tab"
script:
"scripts/get_frequency_FP.py"
228 changes: 115 additions & 113 deletions rules/quality/16S_check.rules
Original file line number Diff line number Diff line change
@@ -1,125 +1,127 @@

rule extract_16S_with_barnap:
singularity:
"docker://quay.io/biocontainers/barrnap:0.9--3"
input:
"samples/{sample}/assembly/spades/contigs.fasta"
output:
"{sample}/contamination/markers/rrna/{sample}_barrnap.gff",
"{sample}/contamination/markers/rrna/{sample}_barrnap.ffn",
shell:
"""
barrnap --outseq {output[1]} --quiet {input} > {output[0]}
"""
if "16S_database" in config:

rule extract_16S:
singularity:
singularity_envs["python_r"]
input:
"{sample}/contamination/markers/rrna/{sample}_barrnap.ffn",
output:
"{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn",
script:
"scripts/extract_16S.py"
rule extract_16S_with_barnap:
singularity:
"docker://quay.io/biocontainers/barrnap:0.9--3"
input:
"samples/{sample}/assembly/spades/contigs.fasta"
output:
"samples/{sample}/contamination/markers/rrna/{sample}_barrnap.gff",
"samples/{sample}/contamination/markers/rrna/{sample}_barrnap.ffn",
shell:
"""
barrnap --outseq {output[1]} --quiet {input} > {output[0]}
"""

rule merge_16S:
singularity:
singularity_envs["python_r"]
input:
expand("{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn", sample=read_naming.keys()),
output:
"report/contamination/markers/rrna/merged_barrnap_16S.ffn",
script: "scripts/combine_16S.py"
rule extract_16S:
singularity:
singularity_envs["python_r"]
input:
"samples/{sample}/contamination/markers/rrna/{sample}_barrnap.ffn",
output:
"samples/{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn",
script:
"scripts/extract_16S.py"

rule align_16S_with_mafft:
singularity:
singularity_envs["mafft"]
input:
"report/contamination/markers/rrna/merged_barrnap_16S.ffn",
output:
"report/contamination/markers/rrna/merged_barrnap_16S_mafft.ffn",
shell:
"""
mafft {input} > {output}
"""
rule merge_16S:
singularity:
singularity_envs["python_r"]
input:
expand("samples/{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn", sample=read_naming.keys()),
output:
"report/contamination/markers/rrna/merged_barrnap_16S.ffn",
script: "scripts/combine_16S.py"

rule calculate_parwise_identity:
singularity:
singularity_envs["python_r"]
input:
"report/contamination/markers/rrna/merged_barrnap_16S_mafft.ffn"
output:
"report/contamination/markers/rrna/pairwise_id.tab"
script:
"scripts/calculate_pairwise_id_from_alignment.py"
rule align_16S_with_mafft:
singularity:
singularity_envs["mafft"]
input:
"report/contamination/markers/rrna/merged_barrnap_16S.ffn",
output:
"report/contamination/markers/rrna/merged_barrnap_16S_mafft.ffn",
shell:
"""
mafft {input} > {output}
"""

rule calculate_parwise_identity:
singularity:
singularity_envs["python_r"]
input:
"report/contamination/markers/rrna/merged_barrnap_16S_mafft.ffn"
output:
"report/contamination/markers/rrna/pairwise_id.tab"
script:
"scripts/calculate_pairwise_id_from_alignment.py"

rule assign_taxonomy_vsearch:
singularity:
singularity_envs["vsearch"]
input:
"{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn"
output:
"{sample}/contamination/markers/rrna/{sample}_vsearch.txt"
shell:
"""
# (for domain) k (kingdom), p (phylum), c (class), o (order), f (family), g (genus), or s (species)
# vsearch --sintax fastafile --db fastafile --tabbedout outputfile
vsearch --sintax {input[0]} --db /data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp_vsearch_format.fasta --tabbedout {output[0]}
"""

rule QIIME1_assign_taxonomy_rdp:
singularity:
singularity_envs["rdp"]
input:
"report/contamination/markers/rrna/merged_barrnap_16S.ffn",
"/data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp.fasta",
"/data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp_taxonomy.txt"
output:
"report/contamination/markers/rrna/merged_rdp.txt"
threads:
1
shell:
'''
export RDP_JAR_PATH=$(command -v rdp_classifier-2.2.jar);
assign_path=$(which assign_taxonomy.py)
python $assign_path \
-i {input[0]} \
-r {input[1]} \
-t {input[2]} \
-m rdp \
-o $(dirname {output[0]}) \
-c 0.5 \
--rdp_max_memory 30000
'''
rule assign_taxonomy_vsearch:
singularity:
singularity_envs["vsearch"]
input:
"samples/{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn"
output:
"samples/{sample}/contamination/markers/rrna/{sample}_vsearch.txt"
shell:
"""
# (for domain) k (kingdom), p (phylum), c (class), o (order), f (family), g (genus), or s (species)
# vsearch --sintax fastafile --db fastafile --tabbedout outputfile
vsearch --sintax {input[0]} --db /data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp_vsearch_format.fasta --tabbedout {output[0]}
"""

rule QIIME1_assign_taxonomy_rdp:
singularity:
singularity_envs["rdp"]
input:
"report/contamination/markers/rrna/merged_barrnap_16S.ffn",
"/data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp.fasta",
"/data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp_taxonomy.txt"
output:
"report/contamination/markers/rrna/merged_rdp.txt"
threads:
1
shell:
'''
export RDP_JAR_PATH=$(command -v rdp_classifier-2.2.jar);
assign_path=$(which assign_taxonomy.py)
python $assign_path \
-i {input[0]} \
-r {input[1]} \
-t {input[2]} \
-m rdp \
-o $(dirname {output[0]}) \
-c 0.5 \
--rdp_max_memory 30000
'''

rule assign_taxonomy_ssearch:
singularity:
singularity_envs["fasta36"]
singularity:
"docker://quay.io/biocontainers/vsearch:2.15.0--h2d02072_0"
input:
"{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn",
"/data/databases/amplicon_based_metagenomics/16S/ezbiocloud201805/QIIME/DB_amp_vsearch_format.fasta",
output:
"report/contamination/markers/rrna/{sample}_ssearch.txt"
log:
"report/contamination/markers/rrna/{sample}_ssearch.log"
shell:
"""
# keep only best hit
echo ssearch36 -b 1 -m 10 -n -z 3 {input[0]} {input[1]}
ssearch36 -b 1 -m 10 -n -z 3 {input[0]} {input[1]} 1> {output[0]} 2> {log}
"""

rule assign_taxonomy_ssearch:
singularity:
singularity_envs["fasta36"]
singularity:
"docker://quay.io/biocontainers/vsearch:2.15.0--h2d02072_0"
input:
"samples/{sample}/contamination/markers/rrna/{sample}_barrnap_16S.ffn",
config["16S_database"],
output:
"report/contamination/markers/rrna/{sample}_ssearch.txt"
log:
"report/contamination/markers/rrna/{sample}_ssearch.log"
shell:
"""
# keep only best hit
echo ssearch36 -b 1 -m 10 -n -z 3 {input[0]} {input[1]}
ssearch36 -b 1 -m 10 -n -z 3 {input[0]} {input[1]} 1> {output[0]} 2> {log}
"""

rule format_taxonomy_ssearch:
singularity:
singularity_envs["python_r"]
input:
expand("report/contamination/markers/rrna/{sample}_ssearch.txt", sample=read_naming.keys())
output:
"report/contamination/markers/rrna/summary.txt"
script:
"scripts/ssearch_summary.py"

rule format_taxonomy_ssearch:
singularity:
singularity_envs["python_r"]
input:
expand("report/contamination/markers/rrna/{sample}_ssearch.txt", sample=read_naming.keys())
output:
"report/contamination/markers/rrna/summary.txt"
script:
"scripts/ssearch_summary.py"
14 changes: 9 additions & 5 deletions rules/report_generation/scripts/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,26 @@ def checkm_table(checkm_table):

def get_rrna_summary_table(raw_table,
sample2scientific_name):
df = pandas.read_csv(raw_table, delimiter="\t", header=0)


df = pandas.read_csv(raw_table, delimiter="\t", header=0)

# sample query hit alignment_length alignment_length percent_identity evalue bitscore
row_list = []
sample2count = {}
for n,row in df.iterrows():
sample = row["sample"]
for n, row in df.iterrows():
sample = str(row["sample"])
contig = row["query"]
BBH_taxnonomy = row["hit"].split(",")[1:]
identity = row["percent_identity"]
if sample not in sample2count:
sample2count[sample] = 0
sample2count[sample] += 1
row_list.append([sample,sample2count[sample], sample2scientific_name[sample], contig, BBH_taxnonomy, identity])
row_list.append([sample,
sample2count[sample],
sample2scientific_name[sample],
contig,
BBH_taxnonomy,
identity])

header = ["sample", "N.", "expected", "contig", "BBH_taxnonomy", "identity"]
df = pandas.DataFrame(row_list, columns=header)
Expand Down
19 changes: 11 additions & 8 deletions rules/report_generation/scripts/report_virulence.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@
sample2median_depth = {}
sampls2cumulated_size = {}
sample2n_contigs = {}
sampls2cumulated_size_filtered = {}
for one_table in contig_gc_depth_file_list:
table = pandas.read_csv(one_table,
delimiter='\t',
Expand All @@ -157,20 +158,22 @@
sample2median_depth[sample] = data_whole_gnome["mean_depth"]
sampls2cumulated_size[sample] = data_whole_gnome["contig_size"]
sample2n_contigs[sample] = n_contigs
sampls2cumulated_size_filtered[sample] = int(table.query('median_depth>=5 & contig != "TOTAL"')[["contig_size"]].sum())

sample2scientific_name = snakemake.params["sample_table"].to_dict()["ScientificName"]

table_lowcoverage_contigs = report.quality_table(low_cov_fastas,
sample2gc,
sample2median_depth,
sampls2cumulated_size,
sample2n_contigs,
sample2scientific_name,
low_cov_detail=low_cov_detail)
sample2gc,
sample2median_depth,
sampls2cumulated_size,
sampls2cumulated_size_filtered,
sample2n_contigs,
sample2scientific_name,
low_cov_detail=low_cov_detail)

table_virulence = report.virulence_table(virulence_reports,
blast_files,
ordered_samples)
blast_files,
ordered_samples)

mash_table = report.get_mash_table(mash_results, mash_detail, sample2scientific_name)

Expand Down

0 comments on commit 1587eb2

Please sign in to comment.