diff --git a/dockerfiles/denovo/Dockerfile b/dockerfiles/denovo/Dockerfile new file mode 100644 index 000000000..8416b9133 --- /dev/null +++ b/dockerfiles/denovo/Dockerfile @@ -0,0 +1,22 @@ +ARG SV_PIPELINE_IMAGE=sv-pipeline:latest +FROM $SV_PIPELINE_IMAGE + +COPY /src/denovo /src/denovo/ + +# make & cpp are needed for installing `tidyverse`. +RUN apt-get -qqy update && \ + apt-get -qqy install ca-certificates build-essential && \ + Rscript -e "\ + install.packages( \ + c( \ + 'BiocManager', \ + 'Rcpp', \ + 'tidyverse', \ + 'R.utils', \ + 'UpSetR', \ + 'ggplotify' \ + ), \ + repos = 'https://cran.rstudio.com', \ + quiet = TRUE \ + )" && \ + pip install fsspec gcsfs diff --git a/scripts/docker/build_docker.py b/scripts/docker/build_docker.py index 16983362c..16d09dcb9 100755 --- a/scripts/docker/build_docker.py +++ b/scripts/docker/build_docker.py @@ -244,6 +244,11 @@ class to track dependencies, control build and push of entire job docker_dependencies={ "samtools-cloud": "SAMTOOLS_CLOUD_IMAGE", "sv-utils-env": "VIRTUAL_ENV_IMAGE"} + ), + "denovo": ImageDependencies( + git_dependencies=("dockerfiles/denovo/*", "src/denovo/*"), + docker_dependencies={ + "sv-pipeline": "SV_PIPELINE_IMAGE"} ) } non_public_images = frozenset({"melt"}) diff --git a/src/denovo/clean_ped.R b/src/denovo/clean_ped.R new file mode 100644 index 000000000..19ca64ef6 --- /dev/null +++ b/src/denovo/clean_ped.R @@ -0,0 +1,88 @@ +# This script creates a ped file with all families >3 into trios +# to match desired input for de novo SV filtering + +# Define args and load libraries +args = commandArgs(trailingOnly=TRUE) +require(plyr) +library(data.table) +library(tidyverse) + +input_ped <- args[1] + +# Read input ped +ped <- fread(input_ped,header = TRUE) + +# Add a column with size of family +ped %>% + group_by(FamID) %>% + mutate(FamSize = n()) -> ped2 + +# Steph's code +distfam<-ped2%>%distinct(FamID,FamSize) +fam_breakdown<-distfam%>%group_by(FamSize)%>%tally() + +# subset the families > 3 +large_families<-ped2%>%filter(FamSize>=4) +large_family_ids <- large_families$FamID # FamIDs of families >3 + +large_families_prob<-large_families%>%filter(MotherID!=0|FatherID!=0) # probands from families > 3 +large_families_prob$newFamID <- NA +families<-unique(large_families_prob$FamID) + +# function that splits by probands, adds _# to the end of the FamID and then finds the parents and adds _# to them as well +split_fam<-function(probands,pedigree){ + trio <- data.frame(matrix(ncol = 8, nrow = 0)) + families<-unique(probands$FamID) + for (i in 1:length(families)){ + # reassign family IDs for >=quads (fam1 => fam1_1, fam1_2) + perfam<-subset(probands,FamID==families[i]) + for (x in 1:nrow(perfam)){ + probands$newFamID[probands$IndividualID==as.character(perfam[x,2])]<-paste0(probands$FamID[probands$IndividualID==as.character(perfam[x,2])],"_",x) + } + } + # each proband row should now have a new fam_id + # pedigree needs a new column + pedigree$newFamID<-NA + # now go back and pick up parents info and rename famID of parents + for (y in 1:nrow(probands)){ + paternal<-subset(pedigree, IndividualID == as.character(probands[y, 3])) + paternal$newFamID <- as.character(probands[y,8]) + maternal<-subset(pedigree, IndividualID == as.character(probands[y, 4])) + maternal$newFamID<-as.character(probands[y,8]) + trio<-rbind(data.frame(trio), data.frame(probands[y,]),data.frame(paternal),data.frame(maternal)) + } + return(trio) +} + +output<-split_fam(large_families_prob, large_families) + +# subset out newFamID column and rename FamID +new_output <- subset(output, select=c("newFamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected")) +colnames(new_output)[1] <- "FamID" + +# subset to just families with all three in the new ped +new_output %>% + group_by(FamID) %>% + mutate(FamSize = n()) %>% + filter(FamSize==3)-> trios + +new_trios <- subset(trios, select=c("FamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected")) + +# If mom and dad are not in original ped file as an individual, remove them so that they do not cause an error in the de novo script +new_trios$FatherID[!(new_trios$FatherID %in% ped$IndividualID)] <- 0 +new_trios$MotherID[!(new_trios$MotherID %in% ped$IndividualID)] <- 0 + +# subset ped to only include families that are not > 3 +ped_without_large_families <- subset(ped, !(FamID %in% large_family_ids)) + +# If mom and dad are not in original ped file as an individual, remove them so that they do not cause an error in the de novo script +ped_without_large_families$FatherID[!(ped_without_large_families$FatherID %in% ped$IndividualID)] <- 0 +ped_without_large_families$MotherID[!(ped_without_large_families$MotherID %in% ped$IndividualID)] <- 0 + +# concat new trio pedigree and old pedigree without families > 3 +ped_without_large_families_df <- data.frame(ped_without_large_families) +new_trios_df <- data.frame(new_trios) + +final_ped<-rbind(new_trios_df, ped_without_large_families_df) + +write.table(final_ped,file="cleaned_ped.txt",quote=F,row.names=F,col.names=T,sep="\t") diff --git a/src/denovo/create_per_sample_bed.R b/src/denovo/create_per_sample_bed.R new file mode 100644 index 000000000..da955ad69 --- /dev/null +++ b/src/denovo/create_per_sample_bed.R @@ -0,0 +1,66 @@ +require(plyr) +library(data.table) +library(tidyverse) + + +args = commandArgs(trailingOnly=TRUE) +input_gd <- args[1] +proband_gd <- args[2] +parents_gd <- args[3] +ped <- args[4] +chromosome <- args[5] + +df_ori <- as.data.frame(read.table(input_gd, header = FALSE, sep="\t",stringsAsFactors=FALSE, quote="")) +colnames(df_ori) <- c("chrom", "start", "end", "name", "annotation", "svtype") +df <- subset(df_ori, chrom == chromosome) + +df_DUP <- subset(df, svtype == 'DUP' | svtype == 'DEL;DUP') +if (nrow(df_DUP) > 0) { + df_DUP$svtype <- 'DUP' +} + +df_DEL <- subset(df, svtype == 'DEL' | svtype == 'DEL;DUP') +if (nrow(df_DEL) > 0) { + df_DEL$svtype <- 'DEL' +} + +if (nrow(df_DEL) > 0 & nrow(df_DUP) > 0) { + df2 = rbind(df_DUP,df_DEL) +} else if (nrow(df_DEL) > 0) { + df2 = df_DEL +} else if (nrow(df_DUP) > 0) { + df2 = df_DUP +} + +if (exists('df2')) { + ped <- as.data.frame(read.table(ped, header = TRUE, sep="\t",stringsAsFactors=FALSE, quote="")) + samples <- ped$IndividualID + df2$samples <- gsub(" ", "",toString(samples)) + + df2 %>% + subset(select = c(chrom, start, end, name, svtype, samples)) %>% + separate_rows(samples, sep = ",", convert = T) -> df3 + + df3$chrom <- paste0(df3$chrom, "_", df3$svtype, "_", df3$samples) + df3 <- df3[, c(1,2,3,4)] + + ped_subset <- subset(ped, select = c('FamID', 'IndividualID')) + colnames(ped_subset) <- c('FamID', 'samples') + + df2 %>% + subset(select = c(chrom, start, end, name, svtype, samples)) %>% + separate_rows(samples, sep = ",", convert = T) -> new_df + + new_df <- merge(new_df,ped_subset, by="samples", all.x = TRUE) + new_df$chrom <- paste0(new_df$chrom, "_", new_df$svtype, "_", new_df$FamID) + new_df <- new_df[, c(2,3,4,5)] + + #df4 = rbind(df3,new_df) + + write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F) + write.table(new_df, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F) +} else{ + df3 <- data.frame() + write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F) + write.table(df3, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F) +} \ No newline at end of file diff --git a/src/denovo/denovo_outliers.py b/src/denovo/denovo_outliers.py new file mode 100644 index 000000000..2d3d08b51 --- /dev/null +++ b/src/denovo/denovo_outliers.py @@ -0,0 +1,33 @@ +import argparse +import pandas as pd +import numpy as np +import math + +parser = argparse.ArgumentParser(description='Parse arguments') +parser.add_argument('--bed', dest='bed', help='Input BED file') +args = parser.parse_args() + +bed_file = args.bed + +bed_original = pd.read_csv(bed_file, sep='\t', index_col=False).replace(np.nan, '', regex=True) + +# Remove outliers +bed = bed_original +sample_counts = bed['sample'].value_counts() +q3, q1 = np.percentile(sample_counts.tolist(), [75, 25]) +iqr = q3 - q1 +count_threshold = int(math.ceil((1.5 * iqr + q3) * 3)) +sample_counts2 = sample_counts.rename_axis('sample').reset_index(name='count') +samples_keep = sample_counts2[(sample_counts2['count'] <= count_threshold)]['sample'].tolist() + +# Keep samples and outliers in sepparate files +output = bed[(bed['sample'].isin(samples_keep))] +output_outliers = bed[(~bed['sample'].isin(samples_keep))] + +bed_original.loc[~(bed_original['sample'].isin(samples_keep)) & bed_original['is_de_novo'], 'filter_flag'] = 'sample_outlier' + +# Write output +output.to_csv(path_or_buf='final.denovo.merged.bed', mode='a', index=False, sep='\t', header=True) +output_outliers.to_csv(path_or_buf='final.denovo.merged.outliers.bed', mode='a', index=False, sep='\t', header=True) + +bed_original.to_csv(path_or_buf='final.denovo.merged.allSamples.bed', mode='a', index=False, sep='\t', header=True) diff --git a/src/denovo/denovo_sv_plots.R b/src/denovo/denovo_sv_plots.R new file mode 100644 index 000000000..5cd7171e0 --- /dev/null +++ b/src/denovo/denovo_sv_plots.R @@ -0,0 +1,385 @@ +# Description: Post-hoc QC analysis of filtered de novo SVs + + +# Load libraries +library(tidyr) +library(ggplot2) +library(UpSetR) +library(data.table) +library(dplyr) +library(gridExtra) +library(grid) +library(tidyverse) +library("grid") +library("ggplotify") +library(dplyr) + +args = commandArgs(trailingOnly=TRUE) + +input_bed <- args[1] +input_ped <- args[2] +out_file <- args[3] + +#input_outliers <-args [2] +#input_ped <- args[3] +#out_file <- args[4] + +ped <- fread(input_ped) + +denovo <- as.data.frame(read.table(input_bed,header = TRUE, sep="\t",stringsAsFactors=FALSE, quote="")) +#outliers <- as.data.frame(read.table(input_outliers,header = TRUE, sep="\t",stringsAsFactors=FALSE, quote="")) + +denovo$SVTYPE <- factor(denovo$SVTYPE, levels = rev(c("DEL", "DUP", "INS", "INV", "CPX", "CTX"))) + +denovo_ins <- subset(denovo, SVTYPE == "INS") +denovo_del <- subset(denovo, SVTYPE == "DEL") +denovo_dup <- subset(denovo, SVTYPE == "DUP") +denovo_inv <- subset(denovo, SVTYPE == "INV") +denovo_cpx <- subset(denovo, SVTYPE == "CPX") +denovo_ctx <- subset(denovo, SVTYPE == "CTX") + +# Define colors +del_col <- "#D43925" +dup_col <- "#2376B2" +ins_col <- "#D474E0" +inv_col <- "#FA9627" +ctx_col <- "#638E6C" +cpx_col <- "#4DA1A9" + +# Get some numbers +length(unique(denovo$name)) #of unique denovos +length(unique(denovo$sample)) #number of samples with denovos + +#length(unique(outliers$name)) +#length(unique(outliers$sample)) +#outliers_in_gd <- subset(outliers, select= c(name, in_gd)) + +denovo %>% + # subset(chrom != "chrX") %>% + select(sample, name) %>% + group_by(sample) %>% + tally() -> sample_count + +denovo %>% + count(sample, SVTYPE, name= "svtype_per_sample") -> type + +type_boxplot <- ggplot(type, aes(x=SVTYPE, fill = SVTYPE, y=svtype_per_sample)) + + geom_jitter(position = position_jitter(seed = 1, width = 0.2), color = 'grey', size = 4.0) + geom_boxplot(outlier.shape=NA) + labs(title = "Number of De Novo SVs per Type", y = "Number of de novo SVs", x = "SV Type") + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) +ggsave("type_boxplot.png", type_boxplot, width = 30, height = 40, limitsize = FALSE) + +sample_boxplot <- ggplot(sample_count, aes(x=factor(0), y=n)) + + geom_jitter(position = position_jitter(seed = 1, width = 0.2), color = 'gray47', size = 4.0) + geom_boxplot(outlier.shape=NA) + labs(title = "Number of de Novo SVs per Sample", y = "Number of de novo SVs", x = "Samples") + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) +ggsave("sample_boxplot.png", sample_boxplot, width = 30, height = 35, limitsize = FALSE) + +sample_count %>% + ggplot(aes(y = n)) + + geom_boxplot() -> boxplot + +# Make plots +denovo$chrom <- factor(denovo$chrom, levels = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX")) + +# Count by SV Type +denovo %>% + select(SVTYPE, name, SVTYPE) %>% + unique() %>% + ggplot(aes(x = SVTYPE, fill = SVTYPE)) + + geom_bar() + + labs(x="SV Type", y="Number de novo SVs", title= "Number of De Novo SVs per SV Type") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + scale_x_discrete(limits = rev(levels(denovo$SVTYPE))) + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_type_count +p_type_count + +ggsave("per_type.png", p_type_count, width = 30, height = 35, limitsize = FALSE) + +# Count by chromosome +denovo %>% + select(chrom, name, SVTYPE) %>% + unique() %>% + ggplot(aes(x = chrom, fill = SVTYPE)) + + geom_bar() + + labs(x="Chromosome", y="Number de novo SVs", title= "Number of De Novo SVs per Chromosome") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_chr_count +p_chr_count + +ggsave("per_chrom.png", p_chr_count, width = 30, height = 45, limitsize = FALSE) + +# Count by sample +df <- as.data.frame(table(denovo$sample)) +df$samples <- as.character(df$Var1) +df$num_of_denovos <- as.numeric(as.character(df$Freq)) +df$samples <- as.factor(df$samples) + +p <- ggplot(df, aes(y=num_of_denovos)) + geom_boxplot() + labs(title = "Number of de novo SVs per sample", y = "Number of de novos", x = "Samples") + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) +ggsave("per_sample.png", p, width = 30, height = 35, limitsize = FALSE) + +# De novo count by allele frequency +denovo$AF <- as.numeric(denovo$AF) +denovo$AC <- as.numeric(denovo$AC) +ac_1_freq <- min(subset(denovo, AC == 1)$AF) +max_AF_str <- toString(round(max(denovo$AF), digits=3)) +# denovo$AF_interv <- cut(denovo$AF, breaks = c(0, ac_1_freq, 0.001, 0.01, 0.03, max(denovo$AF)), +# labels = c("AC=1", "AC 1 - AF<=0.001", "AF 0.001-0.01", "AF 0.01-0.03", "AF>0.03")) +denovo$AF_interv <- cut(denovo$AF, breaks = c(0, ac_1_freq, 0.001, max(denovo$AF)), + labels = c("AC=1", "AC 1-0.001", "AF>0.001")) + +denovo %>% + select(AF_interv, name, SVTYPE) %>% + unique() %>% + ggplot(aes(x = AF_interv, fill = SVTYPE)) + + geom_bar() + + # scale_y_log10()+ + labs(x="Allele Frequency", y="Number de novo SVs", title="Number of De Novo\n SVs by Frequency") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_af_count +ggsave("per_freq.png", p_af_count, width = 30, height = 35, limitsize = FALSE) + +denovo_in_gd <- subset(denovo, in_gd == "True") +denovo_not_in_gd <- subset(denovo, in_gd == "False") + +denovo_in_gd %>% + select(AF_interv, name, SVTYPE) %>% + unique() %>% + ggplot(aes(x = AF_interv, fill = SVTYPE)) + + geom_bar() + + # scale_y_log10()+ + labs(x="Allele Frequency", y="Number de novo SVs", title= "Number of De Novo SVs \nin Genomic Disorder Regions by Frequency") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_af_count_in_gd +ggsave("per_freq_gd.png", p_af_count_in_gd, width = 30, height = 35, limitsize = FALSE) + +denovo_not_in_gd %>% + select(AF_interv, name, SVTYPE) %>% + unique() %>% + ggplot(aes(x = AF_interv, fill = SVTYPE)) + + geom_bar() + + # scale_y_log10()+ + labs(x="Allele Frequency", y="Number de novo SVs", title= "Number of De Novo SVs not in \n Genomic Disorder Regions by Frequency") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_af_count_not_in_gd +ggsave("per_freq_not_gd.png", p_af_count_not_in_gd, width = 30, height = 35, limitsize = FALSE) + +# De novo count by size +denovo$SVLEN <- as.numeric(denovo$SVLEN) +#denovo[denovo$SVLEN < 1,]$SVLEN <- 1 +denovo$SVLEN_interv <- cut(denovo$SVLEN, breaks = c(0, 100, 500, 2500, 10000, 50000, max(denovo$SVLEN)), + labels = c("<100bp", "101-500bp", "501bp-2.5kb", "2.5kb-10kb", "10kb-50kb", ">50kb")) +denovo %>% + select(SVLEN_interv, name, SVTYPE) %>% + unique() %>% + ggplot(aes(x = SVLEN_interv, fill = SVTYPE)) + + geom_bar() + + # scale_y_log10()+ + labs(x="Size of SV", y="Number de novo SVs", title= "De Novo SVs by Size") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_size_count +ggsave("size.png", p_size_count, width = 25, height = 20) + +##Evidence plot +SR_denovo <- subset(denovo, EVIDENCE_FIX == "SR", select = c(ALGORITHMS, GQ)) +caller_quality_denovos <- subset(denovo, select = c(name,ALGORITHMS, GQ, EVIDENCE)) + +denovo %>% + select(name, SVTYPE, EVIDENCE_FIX) %>% + unique() %>% + ggplot(aes(x = forcats::fct_infreq(EVIDENCE_FIX), fill = SVTYPE)) + + geom_bar() + + # scale_y_log10()+ + labs(x="Evidence", y="Number de novo SVs", title= "De Novo SVs by Evidence") + + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + + theme_classic() + + theme( + legend.position = "right", + axis.text = element_text(size = 70), + axis.title = element_text(size = 70), + plot.title = element_text(size=70), + axis.text.x = element_text(angle = 45, hjust = 1), + panel.border = element_blank(), + axis.line = element_line(colour = "black"), + legend.title = element_text(size=30), + legend.text = element_text(size=25)) -> p_evidence +ggsave("evidence.png", p_evidence, width = 25, height = 20) + +# Annotation plot +annot <- grep("_", names(denovo), value = T) +annot <- grep("LINCRNA", annot, value = T, invert = T) +# annot <- annot[!annot %in% c("PROTEIN_CODING__NEAREST_TSS", "PROTEIN_CODING__INTERGENIC")] + +lof <- grep("LOF", annot, value = T) +cg <- grep("COPY_GAIN", annot, value = T) +msv <- grep("MSV_EXON_OVR", annot, value = T) +inv_span <- grep("INV_SPAN", annot, value = T) +dup_partial <- grep("DUP_PARTIAL", annot, value = T) +intronic <- grep("INTRONIC", annot, value = T) +utr <- grep("UTR", annot, value = T) +promoter<- grep("PROMOTER", annot, value = T) +nearest_tss<- grep("NEAREST_TSS", annot, value = T) +intergenic<- grep("INTERGENIC", annot, value = T) +coding <- grep("CODING", annot, value = T) +coding <- coding[!coding %in% c('PROTEIN_CODING__NEAREST_TSS', "PROTEIN_CODING__INTERGENIC")] + +denovo$is_lof <- apply(denovo[,lof, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_cg <- apply(denovo[,cg, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_msv <- apply(denovo[,msv, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_inv_span <- apply(denovo[,inv_span, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_dup_partial <- apply(denovo[,dup_partial, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_intronic <- apply(denovo[,intronic, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_utr <- apply(denovo[,utr, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_promoter <- apply(denovo[,promoter, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_nearest_tss <- apply(denovo[,nearest_tss, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) +denovo$is_intergenic <- apply(denovo[,intergenic, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '' | r == FALSE)) +denovo$is_coding <- apply(denovo[,coding, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '')) + +keep_cols <- c(grep("SVTYPE", names(denovo)), (ncol(denovo)-9-1):ncol(denovo)) +denovo_cons <- denovo[,keep_cols] + +ref_cols <- grep("is_", names(denovo_cons)) +denovo_cons_type <- cbind(denovo_cons[,'SVTYPE', drop = FALSE], (denovo_cons[,ref_cols])*1) + +sv_cols <- list( + list(query = elements, + params = list("SVTYPE", c("DEL","DUP", "INS", "INV", "CPX", "CTX")), color = del_col, active = T + , query.name = "DEL" + ), #red + list(query = elements, + params = list("SVTYPE", c("DUP", "INS", "INV", "CPX", "CTX")), color = dup_col, active = T + , query.name = "DUP" + ), #blue + list(query = elements, + params = list("SVTYPE", c("INS", "INV", "CPX", "CTX")), color = ins_col, active = T + , query.name = "INS" + ), #orange + list(query = elements, + params = list("SVTYPE", c("INV", "CPX", "CTX")), color = inv_col, active = T + , query.name = "INV" + ), #green + list(query = elements, + params = list("SVTYPE", "CPX", "CTX"), color = cpx_col, active = T + , query.name = "CPX" ) + # ), #purple + #list(query = elements, + #params = list("SVTYPE", "CTX"), color = ctx_col, active = T, + #query.name = "CTX" + #) +) + +denovo_cons_type %>% + upset(sets = grep("is_", names(denovo_cons_type), value = T), + #queries = sv_cols, + order.by = "freq", + set_size.show = TRUE, + # set_size.scale_max = 15000, + text.scale = 4, + query.legend = "top", + ) -> p_denovo_upset_all + +grob_annotation_upset_plot <- as.grob(p_denovo_upset_all) +ggsave("annotation.png", grob_annotation_upset_plot, width = 30, height = 20) + +# Creating a panel of plots +lay <- rbind(c(1,1,1,2,2,2,3,3,3), + c(1,1,1,2,2,2,3,3,3), + c(4,4,4,4,4,4,4,4,4), + c(4,4,4,4,4,4,4,4,4), + c(5,5,NA,6,6,NA,7,7,NA), + c(5,5,NA,6,6,NA,7,7,NA), + c(8,8,8,8,NA,9,9,9,9), + c(8,8,8,8,NA,9,9,9,9), + c(10,10,10,10,10,10,NA,NA,NA), + c(10,10,10,10,10,10,NA,NA,NA)) + +ml <- grid.arrange(sample_boxplot, type_boxplot, p_type_count, p_chr_count, p_af_count, p_af_count_in_gd, p_af_count_not_in_gd, p_size_count, p_evidence, grob_annotation_upset_plot, layout_matrix = lay, top=textGrob("De Novo SV Data", gp=gpar(fontsize=75))) +ggsave(out_file, ml, width = 80, height = 150, limitsize = FALSE) diff --git a/src/denovo/denovo_svs.py b/src/denovo/denovo_svs.py new file mode 100644 index 000000000..3ba87e1b1 --- /dev/null +++ b/src/denovo/denovo_svs.py @@ -0,0 +1,833 @@ +""" +SV de novo filtering script +""" + +import argparse +import yaml +import numpy as np +import pandas as pd +import pybedtools +import collections +import time +from subprocess import Popen, PIPE + + +pd.options.mode.chained_assignment = None # default='warn' + + +def verbose_print(msg, verbose): + if verbose == "True": + print(msg) + + +def get_count(row, items): + return sum(samp in items for samp in row['samples'].split(',')) + + +def get_parents_frequency(row, items): + return sum(s in items for s in row['samples'].split(',')) / len(items) + + +def get_family_count(row, ped): + sample = row['sample'] + father = ped[(ped['IndividualID'] == sample)]['FatherID'].values[0] + mother = ped[(ped['IndividualID'] == sample)]['MotherID'].values[0] + parents = [father, mother] + return ( + sum(s in parents for s in row['samples'].split(',')) + ) + + +def variant_info(row, field, vcf): + row_name = row['name'] + samp = row['sample'] + + filt_pos = vcf[(vcf['ID'] == row_name)]['FORMAT'].str.split(':').tolist() + + if field in filt_pos[0]: + idx = filt_pos[0].index(field) + if samp in vcf.columns: + samp_info = vcf[(vcf['ID'] == row_name)][samp].str.split(':').tolist() + samp_info_field = samp_info[0][idx] + else: + samp_info_field = 'NA' + else: + samp_info_field = 'NA' + + return samp_info_field + + +def add_family(row, ped): + sample = row['sample'] + fam = ped[(ped['IndividualID'] == sample)]['FamID'].values + chr = row['chrom'] + if fam.size != 0: + return '_'.join([str(fam[0]), chr]) + + +def get_info_parent(row, ped, vcf, parent, field): + row_name = row['name'] + sample = row['sample'] + parent_id = ped[(ped['IndividualID'] == sample)][parent].values + + filt_pos = vcf[(vcf['ID'] == row_name)]['FORMAT'].str.split(':').tolist() + + if field in filt_pos[0]: + idx = filt_pos[0].index(field) + parent_info = vcf[(vcf['ID'] == row_name)][parent_id[0]].str.split(':').tolist() + parent_info_field = parent_info[0][idx] + else: + return 'NA' + + return parent_info_field + + +def get_parents(proband, ped): + mother = ped[(ped['IndividualID'] == proband)]['MotherID'].values + father = ped[(ped['IndividualID'] == proband)]['FatherID'].values + parent_list = [mother, father] + return parent_list + + +def get_family_id(row, ped): + samp = row['sample'] + family_id = ped[(ped['IndividualID'] == samp)]['FamID'].values + return family_id[0] + + +def get_bincov_matrix_url(sample, sample_batches, batch_bincov): + batch = sample_batches.loc[sample_batches['sample'] == sample]['batch'].iloc[0] + bincov_gs = batch_bincov.loc[batch_bincov['batch'] == batch]['bincov'].iloc[0] + return bincov_gs + + +def write_to_filter_file(filename, header, original_df, filtered_df): + filename.write(header) + filename.write("\n") + original = original_df['name_famid'] + filtered = filtered_df['name_famid'] + to_write = list(set(original) - set(filtered)) + filename.write(str(to_write)) + filename.write("\n") + + +def write_to_size_file(filename, header, df): + filename.write(header) + filename.write(str(len(df))) + filename.write("\n") + + +def min_gq(row, gq_min): + paternal_srgq = row['paternal_srgq'] + maternal_srgq = row['maternal_srgq'] + paternal_pegq = row['paternal_pegq'] + maternal_pegq = row['maternal_pegq'] + paternal_gq = row['paternal_gq'] + maternal_gq = row['maternal_gq'] + minimum = min([paternal_srgq, maternal_srgq, paternal_gq, maternal_gq, paternal_pegq, maternal_pegq]) + if minimum != '.': + if int(min([paternal_srgq, maternal_srgq, paternal_gq, maternal_gq, paternal_pegq, maternal_pegq])) <= gq_min: + return 'Remove' + else: + return 'Keep' + else: + return 'Keep' + + +def convert_to_bedtool(df, cols_to_keep=None, sort=True): + if cols_to_keep is None: + string_obj = df.to_string(header=False, index=False) + bt_obj = pybedtools.BedTool(string_obj, from_string=True) + if sort: + string_obj = df.to_string(header=False, index=False) + bt_obj = pybedtools.BedTool(string_obj, from_string=True).sort() + else: + string_obj = df[cols_to_keep].to_string(header=False, index=False) + bt_obj = pybedtools.BedTool(string_obj, from_string=True).sort() + return bt_obj + + +def tabix_query(filename, chrom, start, end): + process = Popen(['tabix', '-h', filename, f'{chrom}:{start}-{end}'], stdout=PIPE) + cov = [] + for line in process.stdout: + cov.append(line.decode("utf-8").strip().split()) + cov_matrix = pd.DataFrame(cov) + return cov_matrix + + +def get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member): + svtype = row['svtype'] + start = int(row['start']) + end = int(row['end']) + if svtype == 'INS': + start = start - 100 + end = end + 100 + proband = row['sample'] + proband_matrix = get_bincov_matrix_url(proband, sample_batches, batch_bincov) + mom = get_parents(proband, ped)[0][0] + mom_matrix = get_bincov_matrix_url(mom, sample_batches, batch_bincov) + dad = get_parents(proband, ped)[1][0] + dad_matrix = get_bincov_matrix_url(dad, sample_batches, batch_bincov) + if family_member == 'proband': + return [proband, proband_matrix] + if family_member == 'mother': + return [mom, mom_matrix] + if family_member == 'father': + return [dad, dad_matrix] + + +def find_coverage(row, ped, sample_batches, batch_bincov, family_member): + chrom = row['chrom'] + start = int(row['start']) + end = int(row['end']) + svtype = row['svtype'] + if svtype == 'INS': + start = start - 100 + end = end + 100 + if svtype == 'DEL' or svtype == 'DUP' or svtype == 'INS': + if family_member == 'mother': + matrix = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='mother')[1] + if family_member == 'father': + matrix = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='father')[1] + cov_matrix = tabix_query(matrix, chrom, start, end) + + header = cov_matrix.iloc[0].to_list() # grab the first row for the header + new_header = [] + for x in header: + new_header.append(x) + cov_matrix = cov_matrix[1:] # take the data less the header row + cov_matrix.columns = new_header + if family_member == 'mother': + indv = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='mother')[0] + if family_member == 'father': + indv = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='father')[0] + sample_cov_matrix = cov_matrix[indv].tolist() + sample_cov_matrix_decoded = [] + for x in sample_cov_matrix: + sample_cov_matrix_decoded.append(x) + return sample_cov_matrix_decoded + else: + return 'NA' + + +def get_median_coverage(mom_matrix, dad_matrix, coverage_cutoff): + mom_coverage = mom_matrix + dad_coverage = dad_matrix + + if mom_coverage == 'NA' and dad_coverage == 'NA': + return 'Keep' + + if mom_coverage != 'NA' and dad_coverage != 'NA': + coverage_d = { + 'Mother': mom_coverage, + 'Father': dad_coverage + } + + coverage_df = pd.DataFrame(coverage_d) + ''' + if coverage_df.empty: + print('Coverage df is empty!') + exit() + ''' + median = coverage_df.median() + median_list = median.tolist() + median_filtered = [x for x in median_list if x < coverage_cutoff] + if len(median_filtered) > 0: + return 'Remove' + else: + return 'Keep' + else: + return 'Remove' # TODO: check with Alba what should this return. + + +def get_cnv_intersection_depth(bed, raw, overlap): + intersect = bed.coverage(raw).to_dataframe(disable_auto_names=True, header=None) + if len(intersect) != 0: + names_overlap = intersect[intersect[10] > overlap][6].to_list() + else: + names_overlap = [''] + return names_overlap + + +def get_cnv_intersection_other(bed, raw, overlap): + overlap = bed.intersect(raw, wo=True, f=overlap, r=True).to_dataframe(disable_auto_names=True, header=None) + if len(overlap) != 0: + names_overlap = overlap[6].to_list() + else: + names_overlap = [''] + return names_overlap + + +def get_insertion_intersection(bed, raw): + overlap = bed.closest(raw).to_dataframe(disable_auto_names=True, header=None) + if len(overlap) != 0: + overlap['is_close'] = abs(overlap[8] - overlap[1]) < 100 + if sum(bool(x) for x in overlap['is_close']) > 0: + names_overlap = overlap[(overlap['is_close'])][6].to_list() + else: + names_overlap = [''] + return names_overlap + + +def main(): + """ + Parse arguments vcf_metrics + """ + parser = argparse.ArgumentParser(description='Parse arguments') + parser.add_argument('--bed', dest='bed', help='Input BED file') + parser.add_argument('--ped', dest='ped', help='Ped file') + parser.add_argument('--vcf', dest='vcf', help='VCF file') + parser.add_argument('--disorder', dest='disorder', help='Genomic disorder regions') + parser.add_argument('--out', dest='out', help='Output file with all variants') + parser.add_argument('--out_de_novo', dest='out_de_novo', help='Output file with only de novo variants') + parser.add_argument('--raw_proband', dest='raw_proband', help='Directory with raw SV calls - output from m04') + parser.add_argument('--raw_parents', dest='raw_parents', help='Directory with raw SV calls - output from m04') + parser.add_argument('--raw_depth_proband', dest='raw_depth_proband', help='Directory with raw SV depth calls - output from m04') + parser.add_argument('--raw_depth_parents', dest='raw_depth_parents', help='Directory with raw depth SV calls - output from m04') + parser.add_argument('--config', dest='config', help='Config file') + parser.add_argument('--exclude_regions', dest='exclude_regions', help='File containing regions with known somatic mutations') + parser.add_argument('--coverage', dest='coverage', help='File with batch in first column respective coverage file in second column') + parser.add_argument('--sample_batches', dest='sample_batches', help='File with samples in first column and their respective batch in second column') + parser.add_argument('--verbose', dest='verbose', help='Verbosity') + args = parser.parse_args() + + bed_file = args.bed + ped_file = args.ped + vcf_file = args.vcf + disorder_file = args.disorder + out_file = args.out + de_novo_out_file = args.out_de_novo + raw_file_proband = args.raw_proband + raw_file_parent = args.raw_parents + raw_file_depth_proband = args.raw_depth_proband + raw_file_depth_parent = args.raw_depth_parents + verbose = args.verbose + config_file = args.config + exclude_regions = args.exclude_regions + coverage = args.coverage + batches = args.sample_batches + + with open(config_file, "r") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + + large_cnv_size = int(config['large_cnv_size']) + gnomad_col = config['gnomad_col'] + alt_gnomad_col = config['alt_gnomad_col'] + gnomad_af = float(config['gnomad_AF']) + parents_af = float(config['parents_AF']) + large_raw_overlap = float(config['large_raw_overlap']) + small_raw_overlap = float(config['small_raw_overlap']) + cohort_af = float(config['cohort_AF']) + coverage_cutoff = float(config['coverage_cutoff']) + depth_only_size = float(config['depth_only_size']) + parents_overlap = float(config['parents_overlap']) + gq_min = float(config['gq_min']) + + # Read files + verbose_print('Reading Input Files', verbose) + bed = pd.read_csv(bed_file, sep='\t').replace(np.nan, '', regex=True) + bed = bed[(bed['samples'] != "")] + bed.rename(columns={'#chrom': 'chrom'}, inplace=True) + vcf = pd.read_csv(vcf_file, sep='\t') + ped = pd.read_csv(ped_file, sep='\t') + disorder = pd.read_csv(disorder_file, sep='\t', header=None) + raw_bed_colnames = ['ID', 'start', 'end', 'svtype', 'sample'] + raw_bed_child = pd.read_csv(raw_file_proband, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True) + raw_bed_parent = pd.read_csv(raw_file_parent, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True) + raw_bed_depth_child = pd.read_csv(raw_file_depth_proband, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True) + raw_bed_depth_parent = pd.read_csv(raw_file_depth_parent, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True) + exclude_regions = pd.read_csv(exclude_regions, sep='\t').replace(np.nan, '', regex=True) + bincov_colnames = ['batch', 'bincov', 'index'] + sample_batches_colnames = ['sample', 'batch'] + bincov = pd.read_csv(coverage, sep='\t', names=bincov_colnames, header=None).replace(np.nan, '', regex=True) + sample_batches = pd.read_csv(batches, sep='\t', names=sample_batches_colnames, header=None).replace(np.nan, '', regex=True) + + ######################### + # REFORMAT AND ANNOTATE # + ######################### + # Exit out of bed file is empty + if bed.size == 0: + bed.to_csv(path_or_buf=out_file, mode='a', index=False, sep='\t', header=True) + bed.to_csv(path_or_buf=de_novo_out_file, mode='a', index=False, sep='\t', header=True) + exit() + # Get parents and children ids + verbose_print('Getting parents/children/affected/unaffected IDs', verbose) + start = time.time() + list_of_trios = [item for item, count in collections.Counter(ped["FamID"]).items() if count == 3] + families = ped[(ped['FamID'].isin(list_of_trios)) & + (ped['FatherID'] != "0") & + (ped['MotherID'] != "0")]['FamID'].values + trios = ped[(ped['FamID'].isin(families))] + parents = trios[(trios['FatherID'] == "0") & (trios['MotherID'] == "0")]['IndividualID'].values + children = trios[(trios['FatherID'] != "0") & (trios['MotherID'] != "0")]['IndividualID'].values + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Get counts at cohort level and number of parents/children + verbose_print('Getting counts', verbose) + start = time.time() + bed['num_children'] = bed.apply(lambda r: get_count(r, children), axis=1) + bed['num_parents'] = bed.apply(lambda r: get_count(r, parents), axis=1) + bed['AF_parents'] = bed.apply(lambda r: get_parents_frequency(r, parents), axis=1) + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Remove mCNVs, BNDs and SVs in sex chromosomes + start = time.time() + verbose_print('Remove BND and mCNV', verbose) + bed = bed[(~bed['svtype'].isin(['BND', 'CNV']))] + # bed = bed[(~bed['svtype'].isin(['BND', 'CNV']) & (~bed['chrom'].isin(["chrY", "chrX"])))] + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Flag if small or large CNV based on large_cnv_size cutoff and flag for removal based on size for depth only calls + verbose_print('Flagging calls depending on size', verbose) + start = time.time() + bed['is_large_cnv'] = (bed['SVLEN'] >= large_cnv_size) & ((bed['svtype'] == 'DEL') | (bed['svtype'] == 'DUP')) + bed['is_small_cnv'] = (bed['SVLEN'] < large_cnv_size) & ((bed['svtype'] == 'DEL') | (bed['svtype'] == 'DUP')) + bed['is_depth_only'] = (bed['EVIDENCE'] == "RD") + bed['is_depth_only_small'] = (bed['svtype'] == "DUP") & (bed['ALGORITHMS'] == "depth") & (bed['SVLEN'] <= depth_only_size) + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Split into one row per sample + verbose_print('Split into one row per sample', verbose) + start = time.time() + bed_split = bed.assign(sample=bed.samples.str.split(",")).explode('sample') + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Sepparate variants in children and parents + verbose_print('Sepparate variants in children and parents', verbose) + start = time.time() + bed_child = bed_split[bed_split['sample'].isin(children)] + bed_parents = bed_split[bed_split['sample'].isin(parents)] + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Flag variants in genomic disorder (GD) regions + verbose_print('Flagging variants in GD', verbose) + start = time.time() + bed_child['in_gd'] = bed_child['name'].isin(disorder[0]) + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Annotate family information for filtering + bed_child['family_id'] = bed_child.apply(lambda r: get_family_id(r, ped), axis=1) + bed_child['name_famid'] = bed_child['name'] + "_" + bed_child['family_id'].astype(str).str.strip("[]") + bed_parents['family_id'] = bed_parents.apply(lambda r: get_family_id(r, ped), axis=1) + bed_parents['name_famid'] = bed_parents['name'] + "_" + bed_parents['family_id'].astype(str).str.strip("[]") + + # Filter out by frequency - AF gnomad < 0.01 OR inGD + verbose_print('Filtering by frequency', verbose) + start = time.time() + bed_child["AF"] = pd.to_numeric(bed_child["AF"]) + try: + bed_child[gnomad_col] = pd.to_numeric(bed_child[gnomad_col]) + remove_freq = bed_child[~((((bed_child[gnomad_col] <= gnomad_af) & (bed_child['AF'] <= cohort_af)) | + ((bed_child[gnomad_col].isnull()) & (bed_child['AF'] <= cohort_af)) | + (bed_child['in_gd'])))]['name_famid'].to_list() + bed_child['is_de_novo'] = pd.Series(True, index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), False) + bed_child['filter_flag'] = pd.Series('de_novo', index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), 'AF') + + except KeyError: + bed_child[alt_gnomad_col] = pd.to_numeric(bed_child[alt_gnomad_col]) + remove_freq = bed_child[~((((bed_child[alt_gnomad_col] <= gnomad_af) & (bed_child['AF'] <= cohort_af)) | + ((bed_child[alt_gnomad_col].isnull()) & (bed_child['AF'] <= cohort_af)) | + (bed_child['in_gd'])))]['name_famid'].to_list() + bed_child['is_de_novo'] = pd.Series(True, index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), False) + bed_child['filter_flag'] = pd.Series('de_novo', index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), 'AF') + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Get counts within family and remove if SV in parents + verbose_print('Keep variants in children only', verbose) + start = time.time() + try: + parents_sc = int(config['parents_SC']) + except KeyError: + parents_sc = len(parents) + if len(bed_child.index) > 0: # Otherwise, if nothing in bed_child will get an error + bed_child['num_parents_family'] = bed_child.apply(lambda r: get_family_count(r, ped), axis=1) + + remove_child = bed_child[~((bed_child['num_parents_family'] == 0) & # there are no parents of the affected child in the samples column + (bed_child['num_children'] >= 1) & # for each line there is at least one child in the samples column + (bed_child['AF_parents'] <= parents_af) & # for each line, the frequency of parents in the samples column must be < 0.05 + (bed_child['num_parents'] <= parents_sc))]['name_famid'].to_list() # for each line, the number of parents must be <= parents_SC or the total number of parents in the ped + + bed_child.loc[bed_child['name_famid'].isin(remove_child) & bed_child['is_de_novo'], 'filter_flag'] = 'in_parent' + bed_child.loc[bed_child['name_famid'].isin(remove_child) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Extract info from the VCF file + verbose_print('Appending FILTER information', verbose) + start = time.time() + if len(bed_child.index) > 0: + bed_child['GT'] = bed_child.apply(lambda r: variant_info(r, 'GT', vcf), axis=1) + bed_child['EV'] = bed_child.apply(lambda r: variant_info(r, 'EV', vcf), axis=1) + bed_child['GQ'] = bed_child.apply(lambda r: variant_info(r, 'GQ', vcf), axis=1) + bed_child['RD_CN'] = bed_child.apply(lambda r: variant_info(r, 'RD_CN', vcf), axis=1) + bed_child['RD_GQ'] = bed_child.apply(lambda r: variant_info(r, 'RD_GQ', vcf), axis=1) + bed_child['PE_GQ'] = bed_child.apply(lambda r: variant_info(r, 'PE_GQ', vcf), axis=1) + bed_child['PE_GT'] = bed_child.apply(lambda r: variant_info(r, 'PE_GT', vcf), axis=1) + bed_child['SR_GQ'] = bed_child.apply(lambda r: variant_info(r, 'SR_GQ', vcf), axis=1) + bed_child['SR_GT'] = bed_child.apply(lambda r: variant_info(r, 'SR_GT', vcf), axis=1) + else: + bed_child['GT'] = 'NA' + bed_child['EV'] = 'NA' + bed_child['GQ'] = 'NA' + bed_child['RD_CN'] = 'NA' + bed_child['RD_GQ'] = 'NA' + bed_child['PE_GQ'] = 'NA' + bed_child['PE_GT'] = 'NA' + bed_child['SR_GQ'] = 'NA' + bed_child['SR_GT'] = 'NA' + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Remove WHAM only and GT = 1 + verbose_print('Remove wham only and GT=1 calls', verbose) + start = time.time() + remove_wham = bed_child[(bed_child['ALGORITHMS'] == "wham") & (bed_child['GQ'] == '1')] + bed_child.loc[bed_child['name_famid'].isin(remove_wham) & bed_child['is_de_novo'], 'filter_flag'] = 'wham_only' + bed_child.loc[bed_child['name_famid'].isin(remove_wham), 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Annotate parental information + if len(bed_child.index) > 0: + bed_child['paternal_gq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'GQ'), axis=1) + bed_child['maternal_gq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'GQ'), axis=1) + bed_child['paternal_rdcn'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'RD_CN'), axis=1) + bed_child['maternal_rdcn'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'RD_CN'), axis=1) + bed_child['paternal_pegq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'PE_GQ'), axis=1) + bed_child['maternal_pegq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'PE_GQ'), axis=1) + bed_child['paternal_srgq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'SR_GQ'), axis=1) + bed_child['maternal_srgq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'SR_GQ'), axis=1) + else: + bed_child['paternal_gq'] = 'NA' + bed_child['maternal_gq'] = 'NA' + bed_child['paternal_rdcn'] = 'NA' + bed_child['maternal_rdcn'] = 'NA' + bed_child['paternal_pegq'] = 'NA' + bed_child['maternal_pegq'] = 'NA' + bed_child['paternal_srgq'] = 'NA' + bed_child['maternal_srgq'] = 'NA' + + # LARGE CNV: Check for false negative in parents: check depth in parents, independently of the calls + verbose_print('Large CNVs check', verbose) + start = time.time() + # 1. Strip out if same CN in parents and proband if not in chrX + remove_large_cnv = bed_child.loc[~(((bed_child['SVLEN'] <= 5000) | ((bed_child['RD_CN'] != bed_child['maternal_rdcn']) & (bed_child['RD_CN'] != bed_child['paternal_rdcn']))) | (bed_child['chrom'] == 'chrX'))] + bed_child.loc[bed_child['name_famid'].isin(remove_large_cnv) & bed_child['is_de_novo'], 'filter_flag'] = 'same_cn_as_parents' + bed_child.loc[bed_child['name_famid'].isin(remove_large_cnv) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # 2. Check if call in parents with bedtools coverage (332) + verbose_print('CNV present in parents check', verbose) + start = time.time() + cols_keep = ['family_chrom', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid'] + bed_child_large = bed_child[(bed_child['is_large_cnv'])] + bed_parents_large = bed_parents[(bed_parents['is_large_cnv'])] + if (len(bed_child_large.index) > 0) & (len(bed_parents_large.index) > 0): + bed_child_large['family_chrom'] = bed_child_large.apply(lambda r: add_family(r, ped), axis=1) + bed_child_large = bed_child_large[cols_keep].to_string(header=False, index=False) + bed_child_large = pybedtools.BedTool(bed_child_large, from_string=True).sort() + + bed_parents_large['family_chrom'] = bed_parents_large.apply(lambda r: add_family(r, ped), axis=1) + bed_parents_large = bed_parents_large[cols_keep].to_string(header=False, index=False) + bed_parents_large = pybedtools.BedTool(bed_parents_large, from_string=True).sort() + + bed_overlap = bed_child_large.coverage(bed_parents_large).to_dataframe(disable_auto_names=True, header=None) + names_overlap = bed_overlap[(bed_overlap[10] >= parents_overlap)][6].to_list() + else: + names_overlap = [''] + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Add if overlap to bed child table + bed_child['overlap_parent'] = (bed_child['name_famid'].isin(names_overlap)) + + # Small calls: + # If RD,SR and < large_cnv_size, treat RD,SR as SR + verbose_print('Small CNVs check', verbose) + start = time.time() + bed_child['EVIDENCE_FIX'] = bed_child['EVIDENCE'] + bed_child[(bed_child['SVLEN'] <= large_cnv_size) & + (bed_child['EVIDENCE'] == "RD,SR") & + ((bed_child['svtype'] == 'DEL') | (bed_child['svtype'] == 'DUP'))]['EVIDENCE_FIX'] = "SR" + + # Add if variant contains RD evidence + bed_child['contains_RD'] = bed_child.EVIDENCE_FIX.str.contains('RD') + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + ################### + # RAW FILES CHECK # + ################### + # Keep all CPT and CTX - they are rare and the type is different than in the raw files + # Check only DEL,DUP and INS for raw evidence + verbose_print('Checking raw files', verbose) + # Remove depth raw calls > 1MB + raw_bed_depth_parent['SVLEN'] = raw_bed_depth_parent['end'] - raw_bed_depth_parent['start'] + raw_bed_ref_depth_parent_subset = raw_bed_depth_parent[(raw_bed_depth_parent['SVLEN'] < 1000000)] + + # Reformat raw files + raw_bed_ref_child = convert_to_bedtool(raw_bed_child, sort=False) + raw_bed_ref_parent = convert_to_bedtool(raw_bed_parent, sort=False) + raw_bed_ref_depth_child = convert_to_bedtool(raw_bed_depth_child, sort=False) + raw_bed_ref_depth_parent = convert_to_bedtool(raw_bed_ref_depth_parent_subset, sort=False) + + # Reformat de novo filt calls + bed_child['chrom_type_sample'] = bed_child['chrom'] + "_" + bed_child['SVTYPE'] + "_" + bed_child['sample'] + bed_child['chrom_type_family'] = bed_child['chrom'] + "_" + bed_child['SVTYPE'] + "_" + bed_child['family_id'].astype(str) + + cols_keep_child = ['chrom_type_sample', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid'] + cols_keep_parent = ['chrom_type_family', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid'] + + # Subset bed file down to just those where is_de_novo is still = True to make searching faster + bed_child_de_novo = bed_child[bed_child['is_de_novo']] + + # INSERTIONS: + verbose_print('Checking insertions in raw files', verbose) + start = time.time() + bed_filt_ins = bed_child_de_novo[bed_child_de_novo['SVTYPE'] == 'INS'] + if len(bed_filt_ins.index) > 0: + verbose_print('Checking if insertion in proband is in raw files', verbose) + bed_filt_ins_proband = convert_to_bedtool(bed_filt_ins, cols_to_keep=cols_keep_child, sort=True) + ins_names_overlap_proband = get_insertion_intersection(bed_filt_ins_proband, raw_bed_ref_child) + verbose_print('Checking if insertion in proband are also in raw files for the parents', verbose) + bed_filt_ins_fam = convert_to_bedtool(bed_filt_ins, cols_to_keep=cols_keep_parent, sort=True) + ins_names_overlap_parent = get_insertion_intersection(bed_filt_ins_fam, raw_bed_ref_parent) + ins_names_overlap = [x for x in ins_names_overlap_proband if x not in ins_names_overlap_parent] + else: + ins_names_overlap = [''] + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Large CNVS: Reciprocal overlap large_raw_overlap + verbose_print('Checking large cnvs in raw files', verbose) + start = time.time() + + large_bed_filt_cnv_depth = bed_child_de_novo[(bed_child_de_novo['is_large_cnv']) & (bed_child_de_novo['SVLEN'] >= 5000)] + large_bed_filt_cnv_other = bed_child_de_novo[(bed_child_de_novo['is_large_cnv']) & (bed_child_de_novo['SVLEN'] < 5000)] + + if len(large_bed_filt_cnv_other.index) > 0: + verbose_print('Checking if intermediate cnv in proband is in raw files', verbose) + bed_filt_cnv_proband_other = convert_to_bedtool(large_bed_filt_cnv_other, cols_to_keep=cols_keep_child, sort=True) + large_cnv_names_overlap_proband_other = get_cnv_intersection_other(bed_filt_cnv_proband_other, raw_bed_ref_child, large_raw_overlap) + verbose_print('Checking if intermediate cnvs in proband are also in raw files for the parents', verbose) + bed_filt_cnv_fam_other = convert_to_bedtool(large_bed_filt_cnv_other, cols_to_keep=cols_keep_parent, sort=True) + large_cnv_names_overlap_parent_other = get_cnv_intersection_other(bed_filt_cnv_fam_other, raw_bed_ref_parent, large_raw_overlap) + large_cnv_names_overlap_other = [x for x in large_cnv_names_overlap_proband_other if x not in large_cnv_names_overlap_parent_other] + else: + large_cnv_names_overlap_other = [''] + + if len(large_bed_filt_cnv_depth.index) > 0: + verbose_print('Checking if large cnv in proband is in raw files', verbose) + bed_filt_cnv_proband_depth = convert_to_bedtool(large_bed_filt_cnv_depth, cols_to_keep=cols_keep_child, sort=True) + large_cnv_names_overlap_proband_depth = get_cnv_intersection_depth(bed_filt_cnv_proband_depth, raw_bed_ref_depth_child, large_raw_overlap) + verbose_print('Checking if large cnv in proband are also in raw files for the parents', verbose) + bed_filt_cnv_fam_depth = convert_to_bedtool(large_bed_filt_cnv_depth, cols_to_keep=cols_keep_parent, sort=True) + large_cnv_names_overlap_parent_depth = get_cnv_intersection_depth(bed_filt_cnv_fam_depth, raw_bed_ref_depth_parent, large_raw_overlap) + large_cnv_names_overlap_depth = [x for x in large_cnv_names_overlap_proband_depth if x not in large_cnv_names_overlap_parent_depth] + else: + large_cnv_names_overlap_depth = [''] + + large_cnv_names_overlap = large_cnv_names_overlap_other + large_cnv_names_overlap_depth + + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Small CNVs - Reciprocal overlap small_raw_overlap + verbose_print('Checking small cnvs in raw files', verbose) + start = time.time() + small_bed_filt_cnv = bed_child_de_novo[(bed_child_de_novo['is_small_cnv'])] # We do not want to check against raw depth files if CNV <= 5kb + if len(small_bed_filt_cnv.index) > 0: + verbose_print('Checking if small cnv in proband is in raw files', verbose) + bed_filt_cnv_proband = convert_to_bedtool(small_bed_filt_cnv, cols_to_keep=cols_keep_child, sort=True) + small_cnv_names_overlap_proband = get_cnv_intersection_other(bed_filt_cnv_proband, raw_bed_ref_child, small_raw_overlap) + verbose_print('Checking small cnvs in probands are also in raw files for parents', verbose) + bed_filt_cnv_fam = convert_to_bedtool(small_bed_filt_cnv, cols_to_keep=cols_keep_parent, sort=True) + small_cnv_names_overlap_parent = get_cnv_intersection_other(bed_filt_cnv_fam, raw_bed_ref_parent, small_raw_overlap) + small_cnv_names_overlap = [x for x in small_cnv_names_overlap_proband if x not in small_cnv_names_overlap_parent] + else: + small_cnv_names_overlap = [''] + delta = end - start + print("Took %f seconds to process" % delta) + + bed_child.loc[~(bed_child['name_famid'].isin(ins_names_overlap + large_cnv_names_overlap + small_cnv_names_overlap)) & bed_child['is_de_novo'], 'filter_flag'] = 'not_in_raw_files_or_in_parents_raw_files' + bed_child.loc[~(bed_child['name_famid'].isin(ins_names_overlap + large_cnv_names_overlap + small_cnv_names_overlap)) & bed_child['is_de_novo'], 'is_de_novo'] = False + + ############# + # FILTERING # + ############# + verbose_print('Filtering out calls', verbose) + + # 1. Filter out calls in exclude regions + verbose_print('Filtering out calls in exclude regions', verbose) + start = time.time() + # Reformat exclude_regions to bedtool + exclue_regions_bt = convert_to_bedtool(exclude_regions, sort=True) + # convert bed_final to bedtool + cols_keep_exclude_regions = ['chrom', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid'] + if len(bed_child.index) > 0: + bed_child_bt = convert_to_bedtool(bed_child, cols_keep_exclude_regions, sort=True) + exclude_regions_intersect = bed_child_bt.coverage(exclue_regions_bt).to_dataframe(disable_auto_names=True, header=None) # HB said to use bedtools coverage, -f and -F give the same SVs to be removed + if len(exclude_regions_intersect) != 0: + remove_regions = exclude_regions_intersect[exclude_regions_intersect[10] > 0.5][6].to_list() + else: + remove_regions = [''] + else: + remove_regions = [''] + + bed_child.loc[bed_child['name_famid'].isin(remove_regions) & bed_child['is_de_novo'], 'filter_flag'] = 'in_blacklist_region' + bed_child.loc[bed_child['name_famid'].isin(remove_regions) & bed_child['is_de_novo'], 'is_de_novo'] = False + + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # 2. Filter by type + # Keep any SV type that is not a DEL, DUP, or INS + keep_other_sv = bed_child[((~bed_child['SVTYPE'].isin(['DEL', 'DUP', 'INS'])) & (bed_child['filter_flag'] != 'AF') & (bed_child['filter_flag'] != 'in_parent'))]['name_famid'].to_list() + + # 3. Filter on DELs, DUPs, and INS + # Filter by size + # Filter out if large CNVs have parents overlap + verbose_print('Filtering out large CNVs with parents overlap', verbose) + start = time.time() + remove_large = bed_child[(bed_child['is_large_cnv']) & + (bed_child['overlap_parent'])]['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_large) & bed_child['is_de_novo'], 'filter_flag'] = 'large_cnv_with_parent_overlap' + bed_child.loc[bed_child['name_famid'].isin(remove_large) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + bed_child.to_csv(path_or_buf="after_large_cnv.txt", mode='a', index=False, sep='\t', header=True) + + # Filter out if small cnvs that are SR-only don't have BOTHSIDES_SUPPORT + verbose_print('Filtering out small CNVs that are SR-only and dont have BOTHSIDES_SUPPORT', verbose) + start = time.time() + remove_small = bed_child[(bed_child['is_small_cnv']) & + (bed_child['EVIDENCE_FIX'] == 'SR') & + ~(bed_child.FILTER.str.contains('BOTHSIDES_SUPPORT'))]['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_small) & bed_child['is_de_novo'], 'filter_flag'] = 'small_cnv_SR_only' + bed_child.loc[bed_child['name_famid'].isin(remove_small) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Filter out calls that are depth only and < depth_only_size + verbose_print('Filtering out calls that are depth only and < depth_only_size', verbose) + start = time.time() + remove_depth_small = bed_child[(bed_child['is_depth_only_small'])]['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_depth_small) & bed_child['is_de_novo'], 'filter_flag'] = 'depth_only_small_variant' + bed_child.loc[bed_child['name_famid'].isin(remove_depth_small) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Filter out DELs that are >500bp and RD_CN=2 and PE only envidence + verbose_print('Filtering out DELs that are RD_CN=2 and PE only evidence', verbose) + start = time.time() + remove_dels = bed_child[(bed_child['SVTYPE'] == 'DEL') & ((bed_child['RD_CN'] == '2') | (bed_child['RD_CN'] == '3')) & (bed_child['EVIDENCE'] == 'PE')]['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_dels) & bed_child['is_de_novo'], 'filter_flag'] = 'del_with_rdcn2_pe_only' + bed_child.loc[bed_child['name_famid'].isin(remove_dels) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # 4. Filter by quality + # Filter out if parents GQ is <= gq_min + verbose_print('Filtering if parents GQ <= min_gq', verbose) + start = time.time() + if len(bed_child.index) > 0: + bed_child['keep_gq'] = bed_child.apply(lambda r: min_gq(r, gq_min), axis=1) + remove_gq = bed_child[bed_child['keep_gq'] == 'Remove']['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_gq) & bed_child['is_de_novo'], 'filter_flag'] = 'bad_gq_in_parents' + bed_child.loc[bed_child['name_famid'].isin(remove_gq) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Filter out INS that are manta or melt only and are SR only, have GQ=0, and FILTER contains 'HIGH_SR_BACKGROUND' + verbose_print('Filtering out INS that are manta or melt only and SR only, with GQ=0 and FILTER contains HIGH_SR_BACKGROUND', verbose) + start = time.time() + remove_ins = bed_child[(bed_child['SVTYPE'] == 'INS') & ((bed_child['ALGORITHMS'] == 'manta') | (bed_child['ALGORITHMS'] == 'melt')) & (bed_child['EVIDENCE_FIX'] == 'SR') & ((bed_child['GQ'] == '0') | (bed_child.FILTER.str.contains('HIGH_SR_BACKGROUND')))]['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_ins) & bed_child['is_de_novo'], 'filter_flag'] = 'ins_filter' + bed_child.loc[bed_child['name_famid'].isin(remove_ins) & bed_child['is_de_novo'], 'is_de_novo'] = False + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # Remove if low coverage in parents + start = time.time() + bed_child_coverage = bed_child[bed_child['is_de_novo']] + verbose_print('Removing calls if there is low coverage evidence in parents', verbose) + if len(bed_child_coverage.index) > 0: + bed_child_coverage['median_coverage'] = bed_child_coverage.apply( + lambda r: get_median_coverage( + find_coverage(r, ped, sample_batches, bincov, family_member='mother'), + find_coverage(r, ped, sample_batches, bincov, family_member='father'), + coverage_cutoff), + axis=1) + remove_coverage = bed_child_coverage[bed_child_coverage['median_coverage'] == 'Remove']['name_famid'].to_list() + else: + remove_coverage = [''] + + bed_child.loc[bed_child['name_famid'].isin(remove_coverage) & bed_child['is_de_novo'], 'filter_flag'] = 'low_coverage_in_parents' + bed_child.loc[bed_child['name_famid'].isin(remove_coverage) & bed_child['is_de_novo'], 'is_de_novo'] = False + + end = time.time() + delta = end - start + print("Took %f seconds to process" % delta) + + # 5. Clean up and remove duplicated CPX SV + # Keep SVs + bed_child.loc[bed_child['name_famid'].isin(keep_other_sv), 'is_de_novo'] = True + bed_child.loc[bed_child['name_famid'].isin(keep_other_sv), 'filter_flag'] = 'not_del_dup_ins' + + # Remove duplicated CPX events that come from vcf output of module 18 + bed_child['is_cpx'] = (bed_child['SVTYPE'] == "CPX") + bed_child['is_duplicated'] = bed_child.duplicated(keep='first', subset=['start', 'end', 'sample']) + remove_duplicated = bed_child[(bed_child['is_duplicated']) & (bed_child['is_cpx'])]['name_famid'].to_list() + bed_child.loc[bed_child['name_famid'].isin(remove_duplicated) & bed_child['is_de_novo'], 'is_de_novo'] = False + bed_child.loc[bed_child['name_famid'].isin(remove_duplicated) & bed_child['is_de_novo'], 'filter_flag'] = 'duplicated_cpx' + + # Move sample column to column 6 + bed_final = bed_child + col = bed_final.pop("sample") + bed_final.insert(loc=5, column='sample', value=col, allow_duplicates=True) + + # Define output files + output = bed_final + de_novo = bed_final[(bed_final['is_de_novo']) | (bed_final['filter_flag'] == 'ins_filter') | (bed_final['in_gd'])] + + # Write output + output.to_csv(path_or_buf=out_file, mode='a', index=False, sep='\t', header=True) + de_novo.to_csv(path_or_buf=de_novo_out_file, mode='a', index=False, sep='\t', header=True) + + +if __name__ == '__main__': + main() diff --git a/src/denovo/reformat_raw_bed.R b/src/denovo/reformat_raw_bed.R new file mode 100644 index 000000000..5337a6054 --- /dev/null +++ b/src/denovo/reformat_raw_bed.R @@ -0,0 +1,49 @@ +# This script reformats the output bed files from module 01 with raw call evidence +# to match desired input for de novo SV filtering + +# Define args and load libraries +args = commandArgs(trailingOnly=TRUE) +require(plyr) +library(data.table) +library(tidyverse) + +input_bed <- args[1] +input_ped <- args[2] +out_probands <- args[3] +out_parents <- args[4] + +# Read input bed +bed <- fread(input_bed) +ped <- fread(input_ped) + +# adding back column names +colnames(bed) <- c("CHROM", "start", "end", "name", "svtype", "samples", "SVTYPE") + +# Split into one sample per row +bed %>% + subset(select = c(CHROM, start, end, SVTYPE, samples)) %>% + separate_rows(samples, sep = ",", convert = T) -> bed_split + +# split into parents and probands +bed_probands <- bed_split +bed_probands <- subset(bed_probands, !(samples %in% ped$MotherID) & !(samples %in% ped$FatherID)) + +bed_parents <- bed_split +bed_parents <- subset(bed_parents, samples %in% ped$MotherID | samples %in% ped$FatherID) + +# add in a family ID column to bed_parents +ped_subset <- subset(ped, select = c('FamID', 'IndividualID')) +colnames(ped_subset) <- c('FamID', 'samples') + +bed_parents <- merge(bed_parents,ped_subset, by="samples", all.x = TRUE) + +# Reformat chromosome column +bed_probands$CHROM <- paste0(bed_probands$CHROM, "_", bed_probands$SVTYPE, "_", bed_probands$samples) +bed_parents$CHROM <- paste0(bed_parents$CHROM, "_", bed_parents$SVTYPE, "_", bed_parents$FamID) + +# reorder the columns of bed_parents +bed_parents2 <- bed_parents[, c(2,3,4,5,1)] + +# Write output file +write.table(bed_probands, out_probands, sep = "\t", quote = F, row.names = F, col.names = F) +write.table(bed_parents2, out_parents, sep = "\t", quote = F, row.names = F, col.names = F) diff --git a/wdl/DeNovoSVsScatter.wdl b/wdl/DeNovoSVsScatter.wdl new file mode 100644 index 000000000..eaedbaf0a --- /dev/null +++ b/wdl/DeNovoSVsScatter.wdl @@ -0,0 +1,204 @@ +version 1.0 + +import "Structs.wdl" +import "Utils.wdl" as Utils + +workflow DeNovoSVsScatter { + + input { + File ped_input + Array[File] vcf_files + File disorder_input + String chromosome + File raw_proband + File raw_parents + File raw_depth_proband + File raw_depth_parents + File exclude_regions + File sample_batches + File batch_bincov_index + File python_config + String variant_interpretation_docker + RuntimeAttr? runtime_attr_denovo + RuntimeAttr? runtime_attr_vcf_to_bed + RuntimeAttr? runtime_attr_merge_bed + } + + Array[String] coverage_index_files = transpose(read_tsv(batch_bincov_index))[2] + + # Scatter genotyping over shards + scatter (shard in vcf_files) { + call Utils.VcfToBed as VcfToBed { + input: + vcf_file=shard, + args="--info ALL --include-filters", + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_vcf_to_bed + } + + call RunDeNovo { + input: + bed_input=VcfToBed.bed_output, + ped_input=ped_input, + vcf_input=shard, + disorder_input=disorder_input, + chromosome=chromosome, + raw_proband=raw_proband, + raw_parents=raw_parents, + raw_depth_proband=raw_depth_proband, + raw_depth_parents=raw_depth_parents, + exclude_regions = exclude_regions, + coverage_indices = coverage_index_files, + sample_batches = sample_batches, + batch_bincov_index = batch_bincov_index, + python_config=python_config, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_denovo + } + } + + call MergeBedFiles as MergeBedFilesAnnotated { + input: + bed_files = RunDeNovo.annotation_output, + chromosome = chromosome, + variant_interpretation_docker = variant_interpretation_docker, + runtime_attr_override = runtime_attr_merge_bed + } + + call MergeBedFiles as MergeBedFilesFinal { + input: + bed_files = RunDeNovo.denovo_output, + chromosome = chromosome, + variant_interpretation_docker = variant_interpretation_docker, + runtime_attr_override = runtime_attr_merge_bed + } + + output { + File per_chromosome_annotation_output_file = MergeBedFilesAnnotated.per_chromosome_denovo_output + File per_chromosome_final_output_file = MergeBedFilesFinal.per_chromosome_denovo_output + } +} + +task RunDeNovo { + + input { + File bed_input + File ped_input + File vcf_input + File disorder_input + String chromosome + File raw_proband + File raw_parents + File raw_depth_proband + File raw_depth_parents + File exclude_regions + Array[File] coverage_indices + File batch_bincov_index + File sample_batches + File python_config + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float vcf_size = size(vcf_input, "GB") + Float bed_size = size(bed_input, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 16, + disk_gb: ceil(15 + vcf_size + bed_size * 1.5), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File denovo_output = "~{basename}.denovo.bed.gz" + File annotation_output = "~{basename}.annotation.bed.gz" + } + + String basename = basename(vcf_input, ".vcf.gz") + command { + set -exuo pipefail + + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + + bcftools view ~{vcf_input} | grep -v ^## | bgzip -c > ~{basename}.noheader.vcf.gz + python /src/denovo/denovo_svs.py \ + --bed ~{bed_input} \ + --ped ~{ped_input} \ + --vcf ~{basename}.noheader.vcf.gz \ + --disorder ~{disorder_input} \ + --out ~{basename}.annotation.bed \ + --out_de_novo ~{basename}.denovo.bed \ + --raw_proband ~{raw_proband} \ + --raw_parents ~{raw_parents} \ + --raw_depth_proband ~{raw_depth_proband} \ + --raw_depth_parents ~{raw_depth_parents} \ + --config ~{python_config} \ + --exclude_regions ~{exclude_regions} \ + --coverage ~{batch_bincov_index} \ + --sample_batches ~{sample_batches} \ + --verbose True + + bgzip ~{basename}.denovo.bed + bgzip ~{basename}.annotation.bed + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task MergeBedFiles { + + input { + Array[File] bed_files + String chromosome + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float bed_files_size = size(bed_files, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + (bed_files_size) * 2.0), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File per_chromosome_denovo_output = "~{chromosome}.denovo.merged.bed.gz" + } + + command { + set -exuo pipefail + + zcat ~{bed_files[0]} | awk 'NR==1' > ~{chromosome}.denovo.merged.bed + zcat ~{sep=" " bed_files} | grep -v ^chrom >> ~{chromosome}.denovo.merged.bed + bgzip ~{chromosome}.denovo.merged.bed + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} \ No newline at end of file diff --git a/wdl/ReformatRawFiles.wdl b/wdl/ReformatRawFiles.wdl new file mode 100644 index 000000000..5b6f676c0 --- /dev/null +++ b/wdl/ReformatRawFiles.wdl @@ -0,0 +1,223 @@ +version 1.0 + +import "Structs.wdl" +import "Utils.wdl" as Utils +import "TasksMakeCohortVcf.wdl" as TasksMakeCohortVcf + +workflow ReformatRawFiles { + + input { + Array[String] contigs + File raw_files_list + File ped_input + String variant_interpretation_docker + String sv_base_mini_docker + Boolean depth + RuntimeAttr? runtime_attr_vcf_to_bed + RuntimeAttr? runtime_attr_merge_bed + RuntimeAttr? runtime_attr_divide_by_chrom + RuntimeAttr? runtime_attr_reformat_bed + } + + Array[String] raw_files = transpose(read_tsv(raw_files_list))[1] + + scatter(raw_file in raw_files) { + call Utils.VcfToBed as VcfToBed { + input: + vcf_file=raw_file, + args="--info SVTYPE", + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_vcf_to_bed + } + } + + call TasksMakeCohortVcf.CatUncompressedFiles as MergeBedFiles { + input: + shards=VcfToBed.bed_output, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_merge_bed + } + + scatter (contig in contigs) { + call RawDivideByChrom { + input: + bed_file=MergeBedFiles.outfile, + chromosome=contig, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override=runtime_attr_divide_by_chrom + } + + if (depth) { + call RawReformatBedDepth { + input: + per_chromosome_bed_file=RawDivideByChrom.per_chromosome_bed_output, + ped_input=ped_input, + chromosome=contig, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override=runtime_attr_reformat_bed + } + } + + if (!(depth)) { + call RawReformatBed { + input: + per_chromosome_bed_file=RawDivideByChrom.per_chromosome_bed_output, + ped_input=ped_input, + chromosome=contig, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override=runtime_attr_reformat_bed + } + } + File reformatted_parents_output_ = select_first([RawReformatBed.reformatted_parents_output, RawReformatBedDepth.reformatted_parents_depth_output]) + File reformatted_proband_output_ = select_first([RawReformatBed.reformatted_proband_output, RawReformatBedDepth.reformatted_proband_depth_output]) + } + + + output { + Array[File] reformatted_parents_raw_files = reformatted_parents_output_ + Array[File] reformatted_proband_raw_files = reformatted_proband_output_ + } +} + +task RawDivideByChrom { + + input { + File bed_file + String chromosome + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(bed_file, "GB") + Float base_mem_gb = 3.75 + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + input_size * 1.3), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File per_chromosome_bed_output = "${chromosome}.bed.gz" + } + + command { + set -exuo pipefail + + zcat ~{bed_file} | \ + awk '$1 == "~{chromosome}"' | \ + bgzip -c > ~{chromosome}.bed.gz + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task RawReformatBed { + + input { + File per_chromosome_bed_file + File ped_input + String chromosome + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float bed_file_size = size(per_chromosome_bed_file, "GB") + Float ped_size = size(ped_input, "GB") + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(10 + ped_size + bed_file_size * 2.0), + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File reformatted_proband_output = "${chromosome}.proband.reformatted.sorted.bed.gz" + File reformatted_parents_output = "${chromosome}.parents.reformatted.sorted.bed.gz" + } + + command { + set -exuo pipefail + + # Reformat bed file + Rscript /src/denovo/reformat_raw_bed.R ~{per_chromosome_bed_file} ~{ped_input} ~{chromosome}.proband.reformatted.bed ~{chromosome}.parents.reformatted.bed + bedtools sort -i ~{chromosome}.proband.reformatted.bed | bgzip -c > ~{chromosome}.proband.reformatted.sorted.bed.gz + bedtools sort -i ~{chromosome}.parents.reformatted.bed | bgzip -c > ~{chromosome}.parents.reformatted.sorted.bed.gz + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task RawReformatBedDepth { + input { + File per_chromosome_bed_file + File ped_input + String chromosome + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float bed_file_size = size(per_chromosome_bed_file, "GB") + Float ped_size = size(ped_input, "GB") + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(10 + ped_size + bed_file_size * 2.0), + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File reformatted_proband_depth_output = "${chromosome}.proband.depth.reformatted.sorted.bed.gz" + File reformatted_parents_depth_output = "${chromosome}.parents.depth.reformatted.sorted.bed.gz" + } + + command { + set -exuo pipefail + + # Reformat bed file + Rscript /src/denovo/reformat_raw_bed.R ~{per_chromosome_bed_file} ~{ped_input} ~{chromosome}.proband.reformatted.bed ~{chromosome}.parents.reformatted.bed + bedtools sort -i ~{chromosome}.proband.reformatted.bed | bgzip > ~{chromosome}.proband.depth.reformatted.sorted.bed.gz + bedtools sort -i ~{chromosome}.parents.reformatted.bed | bgzip > ~{chromosome}.parents.depth.reformatted.sorted.bed.gz + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} diff --git a/wdl/RunDeNovoSVs.wdl b/wdl/RunDeNovoSVs.wdl new file mode 100644 index 000000000..2b8dbcfe9 --- /dev/null +++ b/wdl/RunDeNovoSVs.wdl @@ -0,0 +1,662 @@ +version 1.0 + +import "Structs.wdl" +import "ReformatRawFiles.wdl" as raw +import "TasksMakeCohortVcf.wdl" as miniTasks +import "DeNovoSVsScatter.wdl" as runDeNovo +import "Utils.wdl" as util + +workflow DeNovoSV { + + input { + File vcf_file + File? vcf_index + File ped_input + File batch_raw_file + File batch_depth_raw_file + File batch_bincov_index + + Array[String] contigs + String prefix + + File? family_ids_txt + + File python_config + File genomic_disorder_input + + File exclude_regions + File sample_batches + + Int records_per_shard + + String variant_interpretation_docker + String sv_base_mini_docker + String sv_pipeline_updates_docker + String python_docker + RuntimeAttr? runtime_attr_gd + RuntimeAttr? runtime_attr_denovo + RuntimeAttr? runtime_attr_raw_vcf_to_bed + RuntimeAttr? runtime_attr_raw_merge_bed + RuntimeAttr? runtime_attr_subset_vcf + RuntimeAttr? runtime_attr_vcf_to_bed + RuntimeAttr? runtime_attr_raw_divide_by_chrom + RuntimeAttr? runtime_attr_raw_reformat_bed + RuntimeAttr? runtime_attr_merge_final_bed_files + RuntimeAttr? runtime_attr_create_plots + RuntimeAttr? runtime_override_shard_vcf + RuntimeAttr? runtime_attr_clean_ped + RuntimeAttr? runtime_attr_call_outliers + RuntimeAttr? runtime_attr_get_batched_files + RuntimeAttr? runtime_attr_subset_by_samples + RuntimeAttr? runtime_attr_merge_gd + RuntimeAttr? runtime_attr_batch_vcf + RuntimeAttr? runtime_attr_merge + } + + # family_ids_txt is a text file containg one family id per line. + # If this file is given, subset all other input files to only include the necessary batches. + if (defined(family_ids_txt)) { + File family_ids_txt_ = select_first([family_ids_txt]) + call GetBatchedFiles { + input: + batch_raw_file = batch_raw_file, + batch_depth_raw_file = batch_depth_raw_file, + ped_input = ped_input, + family_ids_txt = family_ids_txt_, + sample_batches = sample_batches, + batch_bincov_index = batch_bincov_index, + python_docker=python_docker, + runtime_attr_override = runtime_attr_get_batched_files + } + + call util.SubsetVcfBySamplesList { + input: + vcf = vcf_file, + vcf_idx = vcf_index, + list_of_samples = GetBatchedFiles.samples, + outfile_name = prefix, + keep_af = true, + remove_private_sites = false, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_subset_by_samples + } + } + + # Makes a ped file of singletons, duos, and trios for input into the de novo script (only including families of interest) + call CleanPed { + input: + ped_input = ped_input, + vcf_input = select_first([SubsetVcfBySamplesList.vcf_subset, vcf_file]), + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_clean_ped + } + + # Splits raw files into probands and parents and reformats to have chrom_svtype_sample as the first column for probands and chrom_svtype_famid as the first column for parents + call raw.ReformatRawFiles as ReformatRawFiles { + input: + contigs = contigs, + raw_files_list = select_first([GetBatchedFiles.batch_raw_files_list, batch_raw_file]), + ped_input = CleanPed.cleaned_ped, + depth = false, + variant_interpretation_docker = variant_interpretation_docker, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_vcf_to_bed = runtime_attr_raw_vcf_to_bed, + runtime_attr_merge_bed = runtime_attr_raw_merge_bed, + runtime_attr_divide_by_chrom = runtime_attr_raw_divide_by_chrom, + runtime_attr_reformat_bed = runtime_attr_raw_reformat_bed + } + + # Splits raw files into probands and parents and reformats to have chrom_svtype_sample as the first column for probands and chrom_svtype_famid as the first column for parents + call raw.ReformatRawFiles as ReformatDepthRawFiles { + input: + contigs = contigs, + raw_files_list = select_first([GetBatchedFiles.batch_depth_raw_files_list, batch_depth_raw_file]), + ped_input = CleanPed.cleaned_ped, + depth = true, + variant_interpretation_docker = variant_interpretation_docker, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_vcf_to_bed = runtime_attr_raw_vcf_to_bed, + runtime_attr_merge_bed = runtime_attr_raw_merge_bed, + runtime_attr_divide_by_chrom = runtime_attr_raw_divide_by_chrom, + runtime_attr_reformat_bed = runtime_attr_raw_reformat_bed + } + + scatter (i in range(length(contigs))) { + # Generates a list of genomic disorder regions in the vcf input as well as in the depth raw files + call GetGenomicDisorders { + input: + genomic_disorder_input=genomic_disorder_input, + ped = ped_input, + vcf_file = select_first([SubsetVcfBySamplesList.vcf_subset, vcf_file]), + depth_raw_file_proband = ReformatDepthRawFiles.reformatted_proband_raw_files[i], + depth_raw_file_parents = ReformatDepthRawFiles.reformatted_parents_raw_files[i], + chromosome=contigs[i], + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_gd + } + + # Splits vcf by chromosome + call SubsetVcf { + input: + vcf_file = select_first([SubsetVcfBySamplesList.vcf_subset, vcf_file]), + chromosome=contigs[i], + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_subset_vcf + } + + # Shards vcf + call miniTasks.ScatterVcf as ScatterVcf { + input: + vcf=SubsetVcf.vcf_output, + prefix=prefix, + records_per_shard=records_per_shard, + sv_pipeline_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_override_shard_vcf + } + + # Runs the de novo calling python script on each shard and outputs a per chromosome list of de novo SVs + call runDeNovo.DeNovoSVsScatter as GetDeNovo { + input: + ped_input=CleanPed.cleaned_ped, + vcf_files=ScatterVcf.shards, + disorder_input=GetGenomicDisorders.gd_output_for_denovo, + chromosome=contigs[i], + raw_proband=ReformatRawFiles.reformatted_proband_raw_files[i], + raw_parents=ReformatRawFiles.reformatted_parents_raw_files[i], + raw_depth_proband=ReformatDepthRawFiles.reformatted_proband_raw_files[i], + raw_depth_parents=ReformatDepthRawFiles.reformatted_parents_raw_files[i], + exclude_regions = exclude_regions, + sample_batches = sample_batches, + batch_bincov_index = select_first([GetBatchedFiles.batch_bincov_index_subset, batch_bincov_index]), + python_config=python_config, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_denovo = runtime_attr_denovo, + runtime_attr_vcf_to_bed = runtime_attr_vcf_to_bed + } + } + + # Merges the per chromosome final de novo SV outputs + call MergeDenovoBedFiles { + input: + bed_files = GetDeNovo.per_chromosome_final_output_file, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_merge_final_bed_files + } + + # Outputs a final callset of de novo SVs as well as outlier de novo SV calls + call CallOutliers { + input: + bed_file = MergeDenovoBedFiles.merged_denovo_output, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_call_outliers + } + + # Generates plots for QC + call CreatePlots { + input: + bed_file = CallOutliers.final_denovo_nonOutliers_output, + ped_input = ped_input, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_create_plots + } + + # Merges the genomic disorder region output from each chromosome to compile a list of genomic disorder regions + call MergeGenomicDisorders { + input: + genomic_disorder_input=GetGenomicDisorders.gd_output_from_depth_raw_files, + variant_interpretation_docker=variant_interpretation_docker, + runtime_attr_override = runtime_attr_merge_gd + } + + output { + File cleaned_ped = CleanPed.cleaned_ped + File final_denovo_nonOutliers = CallOutliers.final_denovo_nonOutliers_output + File final_denovo_outliers = CallOutliers.final_denovo_outliers_output + File final_denovo_nonOutliers_plots = CreatePlots.output_plots + Array [File] denovo_output_annotated = GetDeNovo.per_chromosome_annotation_output_file + File gd_depth = MergeGenomicDisorders.gd_output_from_depth + File gd_vcf = GetGenomicDisorders.gd_output_from_final_vcf[1] + } +} + +task SubsetVcf { + + input { + File vcf_file + String chromosome + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf_file, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + input_size * 1.5), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File vcf_output = "~{chromosome}.vcf.gz" + File no_header_vcf_output = "~{chromosome}.noheader.vcf.gz" + } + + command <<< + set -euxo pipefail + + bcftools index ~{vcf_file} + bcftools view ~{vcf_file} --regions ~{chromosome} -O z -o ~{chromosome}.vcf.gz + bcftools view ~{chromosome}.vcf.gz | grep -v ^## | bgzip -c > ~{chromosome}.noheader.vcf.gz + >>> + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task GetGenomicDisorders { + + input { + File vcf_file + File ped + File depth_raw_file_proband + File depth_raw_file_parents + String chromosome + File genomic_disorder_input + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(select_all([vcf_file, ped, genomic_disorder_input, depth_raw_file_parents, depth_raw_file_proband]), "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + input_size), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File gd_output_from_final_vcf = "gd.variants.from.final.vcf.txt.gz" + File gd_output_from_depth_raw_files = "~{chromosome}.gd.variants.in.depth.raw.files.txt.gz" + File gd_output_for_denovo = "annotated.gd.variants.names.txt" + } + + command <<< + set -euxo pipefail + + sort -k1,1 -k2,2n ~{genomic_disorder_input} > sorted.genomic.txt + bedtools intersect -wa -wb -f 0.3 -r -a ~{vcf_file} -b ~{genomic_disorder_input} | cut -f 3 |sort -u > annotated.gd.variants.names.txt + + echo "Done with first line" + + bedtools intersect -wa -wb -f 0.3 -r -a ~{vcf_file} -b ~{genomic_disorder_input} > gd.variants.from.final.vcf.txt + bgzip gd.variants.from.final.vcf.txt + + echo "Done with GD from vcf" + + Rscript /src/denovo/create_per_sample_bed.R ~{genomic_disorder_input} unsorted.gd.per.sample.txt unsorted.gd.per.family.txt ~{ped} ~{chromosome} + sort -k1,1 -k2,2n unsorted.gd.per.sample.txt > gd.per.sample.txt + sort -k1,1 -k2,2n unsorted.gd.per.family.txt > gd.per.family.txt + cat ~{depth_raw_file_parents} | gunzip | sort -k1,1 -k2,2n | bgzip -c > sorted.depth.parents.bed.gz + cat ~{depth_raw_file_proband} | gunzip | sort -k1,1 -k2,2n | bgzip -c > sorted.depth.proband.bed.gz + + echo "Done with R script" + + bedtools intersect -wa -wb -f 0.3 -r -sorted -a gd.per.sample.txt -b sorted.depth.proband.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.proband.txt + bedtools intersect -wa -wb -f 0.3 -r -sorted -a gd.per.family.txt -b sorted.depth.parents.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.parents.txt + + echo "done with intersect in depth variants" + + bedtools coverage -wa -wb -sorted -a gd.per.family.txt -b sorted.depth.parents.bed.gz | awk '{if ($NF>=0.30) print }' > ~{chromosome}.coverage.parents.txt + bedtools coverage -wa -wb -sorted -a gd.per.sample.txt -b sorted.depth.proband.bed.gz | awk '{if ($NF>=0.30) print }' > ~{chromosome}.coverage.proband.txt + + echo "done with coverage in depth variants" + + cat ~{chromosome}.coverage.parents.txt ~{chromosome}.coverage.proband.txt > ~{chromosome}.coverage.txt + + echo "done with cat" + + bedtools intersect -wa -wb -f 0.3 -sorted -a gd.per.sample.txt -b sorted.depth.proband.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.proband.no.r.txt + bedtools intersect -wa -wb -f 0.3 -sorted -a gd.per.family.txt -b sorted.depth.parents.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.parents.no.r.txt + + echo "done with intersect no -r" + + cat ~{chromosome}.gd.variants.in.depth.raw.file.proband.no.r.txt ~{chromosome}.gd.variants.in.depth.raw.file.parents.no.r.txt > ~{chromosome}.remove.txt + + echo "done with cat" + + bedtools intersect -v -wb -b ~{chromosome}.remove.txt -a ~{chromosome}.coverage.txt > ~{chromosome}.kept.coverage.txt + + echo "done with grep" + + cat ~{chromosome}.gd.variants.in.depth.raw.file.proband.txt ~{chromosome}.gd.variants.in.depth.raw.file.parents.txt ~{chromosome}.kept.coverage.txt > ~{chromosome}.gd.variants.in.depth.raw.files.txt + bgzip ~{chromosome}.gd.variants.in.depth.raw.files.txt + echo "done with cat" + >>> + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task MergeGenomicDisorders { + + input { + Array[File] genomic_disorder_input + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(genomic_disorder_input, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + input_size), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File gd_output_from_depth = "gd.raw.files.output.txt.gz" + } + + command { + set -euxo pipefail + + zcat ~{sep=" " genomic_disorder_input} > gd.raw.files.output.txt + bgzip gd.raw.files.output.txt + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task MergeDenovoBedFiles { + + input { + Array[File] bed_files + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float bed_files_size = size(bed_files, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + (bed_files_size) * 2.0), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File merged_denovo_output = "denovo.merged.bed.gz" + } + + command { + set -exuo pipefail + + zcat ~{bed_files[0]} | awk 'NR<=1' > denovo.merged.bed + zcat ~{sep=" " bed_files} | grep -v ^chrom >> denovo.merged.bed + bgzip denovo.merged.bed + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task CallOutliers { + + input { + File bed_file + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float bed_files_size = size(bed_file, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 16, # 3.75 + disk_gb: ceil(10 + (bed_files_size) * 2.0), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File final_denovo_nonOutliers_output = "final.denovo.merged.bed.gz" + File final_denovo_outliers_output = "final.denovo.merged.outliers.bed.gz" + File final_denovo_allSamples_output = "final.denovo.merged.allSamples.bed.gz" + } + + command { + set -exuo pipefail + + python /src/denovo/denovo_outliers.py --bed ~{bed_file} + + bgzip final.denovo.merged.bed + bgzip final.denovo.merged.outliers.bed + bgzip final.denovo.merged.allSamples.bed + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task CreatePlots { + + input { + File bed_file + File ped_input + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(select_all([bed_file, ped_input]), "GB") + + RuntimeAttr default_attr = object { + mem_gb: 16, # 3.75 + disk_gb: ceil(10 + input_size * 1.2), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File output_plots = "output_plots.pdf" + } + + command { + set -exuo pipefail + + Rscript /src/denovo/denovo_sv_plots.R ~{bed_file} ~{ped_input} output_plots.pdf + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task CleanPed { + + input { + File ped_input + File vcf_input + String variant_interpretation_docker + RuntimeAttr? runtime_attr_override + } + + Float ped_size = size(ped_input, "GB") + Float vcf_size = size(vcf_input, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + vcf_size + ped_size * 1.5), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File cleaned_ped = "subset_cleaned_ped.txt" + } + + command { + set -exuo pipefail + + # TODO: this script should get the name of the output file it generates as an input argument. + # The output filename is currently hardcoded to be 'clean_ped.txt'. + Rscript /src/denovo/clean_ped.R ~{ped_input} + cut -f2 cleaned_ped.txt | awk 'NR > 1' > all_samples.txt + bcftools query -l ~{vcf_input} > samples_to_include_in_ped.txt + + # Note: grep returns a non-success exit code (i.e., other than `0`) when it cannot find the + # match in the following scripts. We do not expect it to find a match for every entry. + # Hence, to avoid exit with unsuccessful code, we can either drop pipefail from above or use `|| true`. + + grep -w -v -f samples_to_include_in_ped.txt all_samples.txt > excluded_samples.txt || true + grep -w -f excluded_samples.txt cleaned_ped.txt | cut -f1 | sort -u > excluded_families.txt || true + grep -w -v -f excluded_families.txt cleaned_ped.txt > subset_cleaned_ped.txt || true + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +} + +task GetBatchedFiles { + + input { + File batch_raw_file + File batch_depth_raw_file + File family_ids_txt + File ped_input + File sample_batches + File batch_bincov_index + String python_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(select_all([batch_raw_file, batch_depth_raw_file, ped_input, sample_batches, batch_bincov_index, family_ids_txt]), "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + input_size), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File batch_raw_files_list = "batch_raw_files_list.txt" + File batch_depth_raw_files_list = "batch_depth_raw_files_list.txt" + File batch_bincov_index_subset = "batch_bincov_index.txt" + File samples = "samples.txt" + } + + command { + set -exuo pipefail + + if grep -q -w -f ~{family_ids_txt} ~{ped_input}; then + grep -w -f ~{family_ids_txt} ~{ped_input} | cut -f2 | sort -u > samples.txt + else + echo "No matching family IDs from family_ids_txt found in ped_input file." >&2 + exit 1 + fi + + if grep -q -w -f samples.txt ~{sample_batches}; then + grep -w -f samples.txt ~{sample_batches} | cut -f2 | sort -u > batches.txt + else + echo "No matching individual IDs found in the sample_batches file." >&2 + exit 1 + fi + + grep -w -f batches.txt ~{batch_bincov_index} > batch_bincov_index.txt + grep -w -f batches.txt ~{batch_raw_file} > batch_raw_files_list.txt + grep -w -f batches.txt ~{batch_depth_raw_file} > batch_depth_raw_files_list.txt + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: python_docker + } +} \ No newline at end of file diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index 16efe98b9..d4a343895 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -782,3 +782,48 @@ task SubsetVcfBySamplesList { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } + +task VcfToBed { + + input { + File vcf_file + String variant_interpretation_docker + String? args + RuntimeAttr? runtime_attr_override + } + + Float vcf_size = size(vcf_file, "GB") + + RuntimeAttr default_attr = object { + mem_gb: 3.75, + disk_gb: ceil(10 + vcf_size * 1.5), + cpu_cores: 1, + preemptible_tries: 2, + max_retries: 1, + boot_disk_gb: 8 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File bed_output = "~{basename}.bed.gz" + } + + String basename = basename(vcf_file, ".vcf.gz") + command { + set -exuo pipefail + + svtk vcf2bed ~{vcf_file} ~{args} ~{basename}.bed + bgzip ~{basename}.bed + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: variant_interpretation_docker + } +}