Add a preview version of the de-novo pipeline (#641)
* Add de-novo pipeline

* Remove duplicate declarations of runtime_override_shard_vcf.

* Fix linting errors.

* Fix linting errors.

* Fix a typo.

* Refactor input_size var & limit it to only the largest files.

* Delete a comment.

* Move VcfToBed to Utils.wdl.

* Replace RawVcfToBed with VcfToBed.

* Remove the extra flag.

* Prefix BED/CSV column names with `#` for consistency with the rest of the pipeline.

* Delete unused task.

* Delete unused output.

* Remove unused input.

* Fix a leftover from replacing shards_string with shards.

* Replace RawMergeBed with TasksMakeCohortVcf.CatUncompressedFiles.

* Revert adding # to the column header.

* Replace grep with awk for better error handling.

* Replace GetBatchedVcf with SubsetVcfBySamplesList for performance.
VJalili authored May 2, 2024
1 parent a42dfeb commit 76060cf
Showing 12 changed files with 2,615 additions and 0 deletions.
22 changes: 22 additions & 0 deletions dockerfiles/denovo/Dockerfile
@@ -0,0 +1,22 @@
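# Layer on top of the sv-pipeline image; the base can be overridden at build time with
# --build-arg SV_PIPELINE_IMAGE=<image:tag>.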
ARG SV_PIPELINE_IMAGE=sv-pipeline:latest
FROM $SV_PIPELINE_IMAGE

COPY /src/denovo /src/denovo/

# make and a C++ compiler (from build-essential) are needed to install `tidyverse`.
RUN apt-get -qqy update && \
apt-get -qqy install ca-certificates build-essential && \
Rscript -e "\
install.packages( \
c( \
'BiocManager', \
'Rcpp', \
'tidyverse', \
'R.utils', \
'UpSetR', \
'ggplotify' \
), \
repos = 'https://cran.rstudio.com', \
quiet = TRUE \
)" && \
pip install fsspec gcsfs
5 changes: 5 additions & 0 deletions scripts/docker/build_docker.py
@@ -244,6 +244,11 @@ class to track dependencies, control build and push of entire job
docker_dependencies={
"samtools-cloud": "SAMTOOLS_CLOUD_IMAGE",
"sv-utils-env": "VIRTUAL_ENV_IMAGE"}
),
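# The denovo image layers on the sv-pipeline image and is rebuilt when its Dockerfile
# or the src/denovo scripts change.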
"denovo": ImageDependencies(
git_dependencies=("dockerfiles/denovo/*", "src/denovo/*"),
docker_dependencies={
"sv-pipeline": "SV_PIPELINE_IMAGE"}
)
}
non_public_images = frozenset({"melt"})
88 changes: 88 additions & 0 deletions src/denovo/clean_ped.R
@@ -0,0 +1,88 @@
# This script creates a ped file in which every family with more than three members is split into trios,
# matching the input expected by the de novo SV filtering step.
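# For example, a quad family fam1 with two probands becomes two trios, fam1_1 and fam1_2,
# each containing one proband plus both parents.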

# Define args and load libraries
args = commandArgs(trailingOnly=TRUE)
require(plyr)
library(data.table)
library(tidyverse)

input_ped <- args[1]

# Read input ped
ped <- fread(input_ped,header = TRUE)

# Add a column with size of family
ped %>%
group_by(FamID) %>%
mutate(FamSize = n()) -> ped2

# Tally the family-size distribution
distfam<-ped2%>%distinct(FamID,FamSize)
fam_breakdown<-distfam%>%group_by(FamSize)%>%tally()

# subset the families > 3
large_families<-ped2%>%filter(FamSize>=4)
large_family_ids <- large_families$FamID # FamIDs of families >3

large_families_prob<-large_families%>%filter(MotherID!=0|FatherID!=0) # probands from families > 3
large_families_prob$newFamID <- NA
families<-unique(large_families_prob$FamID)

# Function that splits a large family by proband: appends _<n> to each proband's FamID (stored in newFamID),
# then looks up that proband's parents and assigns them the same suffixed FamID.
split_fam<-function(probands,pedigree){
trio <- data.frame(matrix(ncol = 8, nrow = 0))
families<-unique(probands$FamID)
for (i in 1:length(families)){
# reassign family IDs for >=quads (fam1 => fam1_1, fam1_2)
perfam<-subset(probands,FamID==families[i])
for (x in 1:nrow(perfam)){
probands$newFamID[probands$IndividualID==as.character(perfam[x,2])]<-paste0(probands$FamID[probands$IndividualID==as.character(perfam[x,2])],"_",x)
}
}
# each proband row should now have a new fam_id
# pedigree needs a new column
pedigree$newFamID<-NA
# now go back and pick up parents info and rename famID of parents
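# Columns used below: 2 = IndividualID, 3 and 4 = parent IDs, 8 = newFamID assigned above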
for (y in 1:nrow(probands)){
paternal<-subset(pedigree, IndividualID == as.character(probands[y, 3]))
paternal$newFamID <- as.character(probands[y,8])
maternal<-subset(pedigree, IndividualID == as.character(probands[y, 4]))
maternal$newFamID<-as.character(probands[y,8])
trio<-rbind(data.frame(trio), data.frame(probands[y,]),data.frame(paternal),data.frame(maternal))
}
return(trio)
}

output<-split_fam(large_families_prob, large_families)

# subset out newFamID column and rename FamID
new_output <- subset(output, select=c("newFamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))
colnames(new_output)[1] <- "FamID"

# subset to just families with all three in the new ped
new_output %>%
group_by(FamID) %>%
mutate(FamSize = n()) %>%
filter(FamSize==3)-> trios

new_trios <- subset(trios, select=c("FamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))

# If a parent is not listed in the original ped file as an individual, set the parent ID to 0
# so that it does not cause an error in the de novo script
new_trios$FatherID[!(new_trios$FatherID %in% ped$IndividualID)] <- 0
new_trios$MotherID[!(new_trios$MotherID %in% ped$IndividualID)] <- 0

# Subset the ped to families that are not in the large-family set above
ped_without_large_families <- subset(ped, !(FamID %in% large_family_ids))

# If a parent is not listed in the original ped file as an individual, set the parent ID to 0
# so that it does not cause an error in the de novo script
ped_without_large_families$FatherID[!(ped_without_large_families$FatherID %in% ped$IndividualID)] <- 0
ped_without_large_families$MotherID[!(ped_without_large_families$MotherID %in% ped$IndividualID)] <- 0

# Concatenate the new trio pedigree with the original pedigree minus the large families
ped_without_large_families_df <- data.frame(ped_without_large_families)
new_trios_df <- data.frame(new_trios)

final_ped<-rbind(new_trios_df, ped_without_large_families_df)

write.table(final_ped,file="cleaned_ped.txt",quote=F,row.names=F,col.names=T,sep="\t")
66 changes: 66 additions & 0 deletions src/denovo/create_per_sample_bed.R
@@ -0,0 +1,66 @@
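# Expand each variant on the given chromosome into per-sample and per-family BED rows:
# one output BED keyed by sample ID (probands) and one keyed by family ID (parents).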
require(plyr)
library(data.table)
library(tidyverse)


args = commandArgs(trailingOnly=TRUE)
input_gd <- args[1]
proband_gd <- args[2]
parents_gd <- args[3]
ped <- args[4]
chromosome <- args[5]

df_ori <- as.data.frame(read.table(input_gd, header = FALSE, sep="\t",stringsAsFactors=FALSE, quote=""))
colnames(df_ori) <- c("chrom", "start", "end", "name", "annotation", "svtype")
df <- subset(df_ori, chrom == chromosome)

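# Records annotated as both types ('DEL;DUP') are included in each of the DUP and DEL sets below.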
df_DUP <- subset(df, svtype == 'DUP' | svtype == 'DEL;DUP')
if (nrow(df_DUP) > 0) {
df_DUP$svtype <- 'DUP'
}

df_DEL <- subset(df, svtype == 'DEL' | svtype == 'DEL;DUP')
if (nrow(df_DEL) > 0) {
df_DEL$svtype <- 'DEL'
}

if (nrow(df_DEL) > 0 & nrow(df_DUP) > 0) {
df2 = rbind(df_DUP,df_DEL)
} else if (nrow(df_DEL) > 0) {
df2 = df_DEL
} else if (nrow(df_DUP) > 0) {
df2 = df_DUP
}

if (exists('df2')) {
ped <- as.data.frame(read.table(ped, header = TRUE, sep="\t",stringsAsFactors=FALSE, quote=""))
samples <- ped$IndividualID
df2$samples <- gsub(" ", "",toString(samples))

df2 %>%
subset(select = c(chrom, start, end, name, svtype, samples)) %>%
separate_rows(samples, sep = ",", convert = T) -> df3

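# Expand to one row per variant and ped sample, encoding the SV type and sample ID into the chrom field for the proband BED.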
df3$chrom <- paste0(df3$chrom, "_", df3$svtype, "_", df3$samples)
df3 <- df3[, c(1,2,3,4)]

ped_subset <- subset(ped, select = c('FamID', 'IndividualID'))
colnames(ped_subset) <- c('FamID', 'samples')

df2 %>%
subset(select = c(chrom, start, end, name, svtype, samples)) %>%
separate_rows(samples, sep = ",", convert = T) -> new_df

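# Same expansion for the parents BED, but keyed by family: merge in FamID and encode chrom_SVTYPE_FamID.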
new_df <- merge(new_df,ped_subset, by="samples", all.x = TRUE)
new_df$chrom <- paste0(new_df$chrom, "_", new_df$svtype, "_", new_df$FamID)
new_df <- new_df[, c(2,3,4,5)]

#df4 = rbind(df3,new_df)

write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
write.table(new_df, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
} else{
df3 <- data.frame()
write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
write.table(df3, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
}
33 changes: 33 additions & 0 deletions src/denovo/denovo_outliers.py
@@ -0,0 +1,33 @@
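# Flag samples that carry an outlier number of candidate de novo SVs: derive a per-sample count threshold
# from the interquartile range, write retained and outlier calls to separate files, and annotate the full
# table with a filter flag.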
import argparse
import pandas as pd
import numpy as np
import math

parser = argparse.ArgumentParser(description='Parse arguments')
parser.add_argument('--bed', dest='bed', help='Input BED file')
args = parser.parse_args()

bed_file = args.bed

bed_original = pd.read_csv(bed_file, sep='\t', index_col=False).replace(np.nan, '', regex=True)

# Identify outlier samples from per-sample de novo call counts
bed = bed_original
sample_counts = bed['sample'].value_counts()
q3, q1 = np.percentile(sample_counts.tolist(), [75, 25])
iqr = q3 - q1
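# A sample is an outlier if its call count exceeds 3x the Tukey upper fence (Q3 + 1.5 * IQR)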
count_threshold = int(math.ceil((1.5 * iqr + q3) * 3))
sample_counts2 = sample_counts.rename_axis('sample').reset_index(name='count')
samples_keep = sample_counts2[(sample_counts2['count'] <= count_threshold)]['sample'].tolist()

# Keep retained samples and outlier samples in separate files
output = bed[(bed['sample'].isin(samples_keep))]
output_outliers = bed[(~bed['sample'].isin(samples_keep))]

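# In the full table, flag de novo calls from outlier samples rather than dropping them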
bed_original.loc[~(bed_original['sample'].isin(samples_keep)) & bed_original['is_de_novo'], 'filter_flag'] = 'sample_outlier'

# Write output
output.to_csv(path_or_buf='final.denovo.merged.bed', mode='a', index=False, sep='\t', header=True)
output_outliers.to_csv(path_or_buf='final.denovo.merged.outliers.bed', mode='a', index=False, sep='\t', header=True)

bed_original.to_csv(path_or_buf='final.denovo.merged.allSamples.bed', mode='a', index=False, sep='\t', header=True)