Add a preview version of the de-novo pipeline (#641)
* Add de-novo pipeline

* Remove duplicate declarations of runtime_override_shard_vcf.

* Fix linting errors.

* Fix linting errors.

* Fix a typo.

* Refactor input_size var & limit it to only the largest files.

* Delete a comment.

* Move VcfToBed to Utils.wdl.

* Replace RawVcfToBed with VcfToBed.

* Remove the extra flag.

* Prefix BED/CSV column names with `#` for consistency with the rest of the pipeline.

* Delete unused task.

* Delete unused output.

* Remove unused input.

* Fix a leftover from replacing shards_string with shards.

* Replace RawMergeBed with TasksMakeCohortVcf.CatUncompressedFiles.

* Revert adding # to the column header.

* Replace grep with awk for better error handling.

* Replace GetBatchedVcf with SubsetVcfBySamplesList for performance.
VJalili authored May 2, 2024
1 parent a42dfeb commit 76060cf
Showing 12 changed files with 2,615 additions and 0 deletions.
22 changes: 22 additions & 0 deletions dockerfiles/denovo/Dockerfile
@@ -0,0 +1,22 @@
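# Layer on top of the sv-pipeline image; the base can be overridden at build time with
# --build-arg SV_PIPELINE_IMAGE=<image:tag>.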
ARG SV_PIPELINE_IMAGE=sv-pipeline:latest
FROM $SV_PIPELINE_IMAGE

COPY /src/denovo /src/denovo/

# make and a C++ compiler (from build-essential) are needed to install `tidyverse`.
RUN apt-get -qqy update && \
apt-get -qqy install ca-certificates build-essential && \
Rscript -e "\
install.packages( \
c( \
'BiocManager', \
'Rcpp', \
'tidyverse', \
'R.utils', \
'UpSetR', \
'ggplotify' \
), \
repos = 'https://cran.rstudio.com', \
quiet = TRUE \
)" && \
pip install fsspec gcsfs
5 changes: 5 additions & 0 deletions scripts/docker/build_docker.py
@@ -244,6 +244,11 @@ class to track dependencies, control build and push of entire job
docker_dependencies={
"samtools-cloud": "SAMTOOLS_CLOUD_IMAGE",
"sv-utils-env": "VIRTUAL_ENV_IMAGE"}
),
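# The denovo image layers on the sv-pipeline image and is rebuilt when its Dockerfile
# or the src/denovo scripts change.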
"denovo": ImageDependencies(
git_dependencies=("dockerfiles/denovo/*", "src/denovo/*"),
docker_dependencies={
"sv-pipeline": "SV_PIPELINE_IMAGE"}
)
}
non_public_images = frozenset({"melt"})
88 changes: 88 additions & 0 deletions src/denovo/clean_ped.R
@@ -0,0 +1,88 @@
# This script creates a ped file in which every family with more than three members is split into trios,
# matching the input expected by the de novo SV filtering step.
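# For example, a quad family fam1 with two probands becomes two trios, fam1_1 and fam1_2,
# each containing one proband plus both parents.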

# Define args and load libraries
args = commandArgs(trailingOnly=TRUE)
require(plyr)
library(data.table)
library(tidyverse)

input_ped <- args[1]

# Read input ped
ped <- fread(input_ped,header = TRUE)

# Add a column with size of family
ped %>%
group_by(FamID) %>%
mutate(FamSize = n()) -> ped2

# Tally the family-size distribution
distfam<-ped2%>%distinct(FamID,FamSize)
fam_breakdown<-distfam%>%group_by(FamSize)%>%tally()

# subset the families > 3
large_families<-ped2%>%filter(FamSize>=4)
large_family_ids <- large_families$FamID # FamIDs of families >3

large_families_prob<-large_families%>%filter(MotherID!=0|FatherID!=0) # probands from families > 3
large_families_prob$newFamID <- NA
families<-unique(large_families_prob$FamID)

# Function that splits a large family by proband: appends _<n> to each proband's FamID (stored in newFamID),
# then looks up that proband's parents and assigns them the same suffixed FamID.
split_fam<-function(probands,pedigree){
trio <- data.frame(matrix(ncol = 8, nrow = 0))
families<-unique(probands$FamID)
for (i in 1:length(families)){
# reassign family IDs for >=quads (fam1 => fam1_1, fam1_2)
perfam<-subset(probands,FamID==families[i])
for (x in 1:nrow(perfam)){
probands$newFamID[probands$IndividualID==as.character(perfam[x,2])]<-paste0(probands$FamID[probands$IndividualID==as.character(perfam[x,2])],"_",x)
}
}
# each proband row should now have a new fam_id
# pedigree needs a new column
pedigree$newFamID<-NA
# now go back and pick up parents info and rename famID of parents
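# Columns used below: 2 = IndividualID, 3 and 4 = parent IDs, 8 = newFamID assigned above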
for (y in 1:nrow(probands)){
paternal<-subset(pedigree, IndividualID == as.character(probands[y, 3]))
paternal$newFamID <- as.character(probands[y,8])
maternal<-subset(pedigree, IndividualID == as.character(probands[y, 4]))
maternal$newFamID<-as.character(probands[y,8])
trio<-rbind(data.frame(trio), data.frame(probands[y,]),data.frame(paternal),data.frame(maternal))
}
return(trio)
}

output<-split_fam(large_families_prob, large_families)

# subset out newFamID column and rename FamID
new_output <- subset(output, select=c("newFamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))
colnames(new_output)[1] <- "FamID"

# subset to just families with all three in the new ped
new_output %>%
group_by(FamID) %>%
mutate(FamSize = n()) %>%
filter(FamSize==3)-> trios

new_trios <- subset(trios, select=c("FamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))

# If a parent is not listed in the original ped file as an individual, set the parent ID to 0
# so that it does not cause an error in the de novo script
new_trios$FatherID[!(new_trios$FatherID %in% ped$IndividualID)] <- 0
new_trios$MotherID[!(new_trios$MotherID %in% ped$IndividualID)] <- 0

# Subset the ped to families that are not in the large-family set above
ped_without_large_families <- subset(ped, !(FamID %in% large_family_ids))

# If a parent is not listed in the original ped file as an individual, set the parent ID to 0
# so that it does not cause an error in the de novo script
ped_without_large_families$FatherID[!(ped_without_large_families$FatherID %in% ped$IndividualID)] <- 0
ped_without_large_families$MotherID[!(ped_without_large_families$MotherID %in% ped$IndividualID)] <- 0

# Concatenate the new trio pedigree with the original pedigree minus the large families
ped_without_large_families_df <- data.frame(ped_without_large_families)
new_trios_df <- data.frame(new_trios)

final_ped<-rbind(new_trios_df, ped_without_large_families_df)

write.table(final_ped,file="cleaned_ped.txt",quote=F,row.names=F,col.names=T,sep="\t")
66 changes: 66 additions & 0 deletions src/denovo/create_per_sample_bed.R
@@ -0,0 +1,66 @@
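# Expand each variant on the given chromosome into per-sample and per-family BED rows:
# one output BED keyed by sample ID (probands) and one keyed by family ID (parents).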
require(plyr)
library(data.table)
library(tidyverse)


args = commandArgs(trailingOnly=TRUE)
input_gd <- args[1]
proband_gd <- args[2]
parents_gd <- args[3]
ped <- args[4]
chromosome <- args[5]

df_ori <- as.data.frame(read.table(input_gd, header = FALSE, sep="\t",stringsAsFactors=FALSE, quote=""))
colnames(df_ori) <- c("chrom", "start", "end", "name", "annotation", "svtype")
df <- subset(df_ori, chrom == chromosome)

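# Records annotated as both types ('DEL;DUP') are included in each of the DUP and DEL sets below.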
df_DUP <- subset(df, svtype == 'DUP' | svtype == 'DEL;DUP')
if (nrow(df_DUP) > 0) {
df_DUP$svtype <- 'DUP'
}

df_DEL <- subset(df, svtype == 'DEL' | svtype == 'DEL;DUP')
if (nrow(df_DEL) > 0) {
df_DEL$svtype <- 'DEL'
}

if (nrow(df_DEL) > 0 & nrow(df_DUP) > 0) {
df2 = rbind(df_DUP,df_DEL)
} else if (nrow(df_DEL) > 0) {
df2 = df_DEL
} else if (nrow(df_DUP) > 0) {
df2 = df_DUP
}

if (exists('df2')) {
ped <- as.data.frame(read.table(ped, header = TRUE, sep="\t",stringsAsFactors=FALSE, quote=""))
samples <- ped$IndividualID
df2$samples <- gsub(" ", "",toString(samples))

df2 %>%
subset(select = c(chrom, start, end, name, svtype, samples)) %>%
separate_rows(samples, sep = ",", convert = T) -> df3

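# Expand to one row per variant and ped sample, encoding the SV type and sample ID into the chrom field for the proband BED.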
df3$chrom <- paste0(df3$chrom, "_", df3$svtype, "_", df3$samples)
df3 <- df3[, c(1,2,3,4)]

ped_subset <- subset(ped, select = c('FamID', 'IndividualID'))
colnames(ped_subset) <- c('FamID', 'samples')

df2 %>%
subset(select = c(chrom, start, end, name, svtype, samples)) %>%
separate_rows(samples, sep = ",", convert = T) -> new_df

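# Same expansion for the parents BED, but keyed by family: merge in FamID and encode chrom_SVTYPE_FamID.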
new_df <- merge(new_df,ped_subset, by="samples", all.x = TRUE)
new_df$chrom <- paste0(new_df$chrom, "_", new_df$svtype, "_", new_df$FamID)
new_df <- new_df[, c(2,3,4,5)]

#df4 = rbind(df3,new_df)

write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
write.table(new_df, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
} else{
df3 <- data.frame()
write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
write.table(df3, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
}
33 changes: 33 additions & 0 deletions src/denovo/denovo_outliers.py
@@ -0,0 +1,33 @@
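# Flag samples that carry an outlier number of candidate de novo SVs: derive a per-sample count threshold
# from the interquartile range, write retained and outlier calls to separate files, and annotate the full
# table with a filter flag.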
import argparse
import pandas as pd
import numpy as np
import math

parser = argparse.ArgumentParser(description='Parse arguments')
parser.add_argument('--bed', dest='bed', help='Input BED file')
args = parser.parse_args()

bed_file = args.bed

bed_original = pd.read_csv(bed_file, sep='\t', index_col=False).replace(np.nan, '', regex=True)

# Identify outlier samples from per-sample de novo call counts
bed = bed_original
sample_counts = bed['sample'].value_counts()
q3, q1 = np.percentile(sample_counts.tolist(), [75, 25])
iqr = q3 - q1
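# A sample is an outlier if its call count exceeds 3x the Tukey upper fence (Q3 + 1.5 * IQR)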
count_threshold = int(math.ceil((1.5 * iqr + q3) * 3))
sample_counts2 = sample_counts.rename_axis('sample').reset_index(name='count')
samples_keep = sample_counts2[(sample_counts2['count'] <= count_threshold)]['sample'].tolist()

# Keep retained samples and outlier samples in separate files
output = bed[(bed['sample'].isin(samples_keep))]
output_outliers = bed[(~bed['sample'].isin(samples_keep))]

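# In the full table, flag de novo calls from outlier samples rather than dropping them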
bed_original.loc[~(bed_original['sample'].isin(samples_keep)) & bed_original['is_de_novo'], 'filter_flag'] = 'sample_outlier'

# Write output
output.to_csv(path_or_buf='final.denovo.merged.bed', mode='a', index=False, sep='\t', header=True)
output_outliers.to_csv(path_or_buf='final.denovo.merged.outliers.bed', mode='a', index=False, sep='\t', header=True)

bed_original.to_csv(path_or_buf='final.denovo.merged.allSamples.bed', mode='a', index=False, sep='\t', header=True)