Add a preview version of the de-novo pipeline #641

Merged · 19 commits · May 2, 2024
22 changes: 22 additions & 0 deletions dockerfiles/denovo/Dockerfile
@@ -0,0 +1,22 @@
ARG SV_PIPELINE_IMAGE=sv-pipeline:latest
FROM $SV_PIPELINE_IMAGE

COPY /src/denovo /src/denovo/

# make and a C++ compiler (provided by build-essential) are needed to install `tidyverse`.
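# fsspec and gcsfs add gs:// (Google Cloud Storage) filesystem support for pandas I/O.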
RUN apt-get -qqy update && \
apt-get -qqy install ca-certificates build-essential && \
Rscript -e "\
install.packages( \
c( \
'BiocManager', \
'Rcpp', \
'tidyverse', \
'R.utils', \
'UpSetR', \
'ggplotify' \
), \
repos = 'https://cran.rstudio.com', \
quiet = TRUE \
)" && \
pip install fsspec gcsfs
5 changes: 5 additions & 0 deletions scripts/docker/build_docker.py
@@ -244,6 +244,11 @@ class to track dependencies, control build and push of entire job
docker_dependencies={
"samtools-cloud": "SAMTOOLS_CLOUD_IMAGE",
"sv-utils-env": "VIRTUAL_ENV_IMAGE"}
),
"denovo": ImageDependencies(
git_dependencies=("dockerfiles/denovo/*", "src/denovo/*"),
docker_dependencies={
"sv-pipeline": "SV_PIPELINE_IMAGE"}
)
}
non_public_images = frozenset({"melt"})
88 changes: 88 additions & 0 deletions src/denovo/clean_ped.R
@@ -0,0 +1,88 @@
# This script splits every family of size >= 4 in the input ped file into trios,
# producing the pedigree format expected by the de novo SV filtering script

# Define args and load libraries
args = commandArgs(trailingOnly=TRUE)
require(plyr)
library(data.table)
library(tidyverse)

input_ped <- args[1]

# Read input ped
ped <- fread(input_ped,header = TRUE)

# Add a column with size of family
ped %>%
group_by(FamID) %>%
mutate(FamSize = n()) -> ped2

# Tally how many families there are of each size (originally Steph's code)
distfam <- ped2 %>% distinct(FamID, FamSize)
fam_breakdown <- distfam %>% group_by(FamSize) %>% tally()

# Subset to the families of size >= 4
large_families <- ped2 %>% filter(FamSize >= 4)
large_family_ids <- large_families$FamID  # FamIDs of families larger than a trio

# Probands (individuals with at least one listed parent) from those families
large_families_prob <- large_families %>% filter(MotherID != 0 | FatherID != 0)
large_families_prob$newFamID <- NA
families <- unique(large_families_prob$FamID)

# Split each large family into trios: give each proband a new FamID with an "_#"
# suffix, then find that proband's parents and assign them the same suffixed FamID
split_fam <- function(probands, pedigree){
  trio <- data.frame(matrix(ncol = 8, nrow = 0))
  families <- unique(probands$FamID)
  for (i in 1:length(families)){
    # reassign family IDs for >= quads (fam1 => fam1_1, fam1_2)
    perfam <- subset(probands, FamID == families[i])
    for (x in 1:nrow(perfam)){
      proband_id <- as.character(perfam[x, 2])
      probands$newFamID[probands$IndividualID == proband_id] <-
        paste0(probands$FamID[probands$IndividualID == proband_id], "_", x)
    }
  }
  # each proband row should now have a new family ID;
  # the pedigree needs a matching column
  pedigree$newFamID <- NA
  # now go back, pick up each proband's parents and give them the proband's new family ID
  for (y in 1:nrow(probands)){
    paternal <- subset(pedigree, IndividualID == as.character(probands[y, 3]))
    paternal$newFamID <- as.character(probands[y, 8])
    maternal <- subset(pedigree, IndividualID == as.character(probands[y, 4]))
    maternal$newFamID <- as.character(probands[y, 8])
    trio <- rbind(data.frame(trio), data.frame(probands[y, ]),
                  data.frame(paternal), data.frame(maternal))
  }
  return(trio)
}

output<-split_fam(large_families_prob, large_families)

# Keep the newFamID column along with the core ped columns, then rename newFamID to FamID
new_output <- subset(output, select=c("newFamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))
colnames(new_output)[1] <- "FamID"

# Keep only complete trios (families with exactly three members) in the new ped
new_output %>%
  group_by(FamID) %>%
  mutate(FamSize = n()) %>%
  filter(FamSize == 3) -> trios

new_trios <- subset(trios, select=c("FamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))

# If a mother or father is not present in the original ped file as an individual, set the parent ID to 0 so it does not cause an error in the de novo script
new_trios$FatherID[!(new_trios$FatherID %in% ped$IndividualID)] <- 0
new_trios$MotherID[!(new_trios$MotherID %in% ped$IndividualID)] <- 0

# Subset the original ped to the families that were not split (size <= 3)
ped_without_large_families <- subset(ped, !(FamID %in% large_family_ids))

# As above, zero out parent IDs that are not present in the original ped file as individuals
ped_without_large_families$FatherID[!(ped_without_large_families$FatherID %in% ped$IndividualID)] <- 0
ped_without_large_families$MotherID[!(ped_without_large_families$MotherID %in% ped$IndividualID)] <- 0

# Concatenate the new trio pedigree with the original pedigree minus the large families
ped_without_large_families_df <- data.frame(ped_without_large_families)
new_trios_df <- data.frame(new_trios)

final_ped <- rbind(new_trios_df, ped_without_large_families_df)

write.table(final_ped, file = "cleaned_ped.txt", quote = FALSE,
            row.names = FALSE, col.names = TRUE, sep = "\t")
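
For illustration, a minimal sketch of the transformation clean_ped.R is meant to perform on a hypothetical quad family (all family and sample IDs below are made up):

library(tibble)

# A quad family before cleaning:
quad <- tribble(
  ~FamID, ~IndividualID, ~MotherID, ~FatherID, ~Gender, ~Affected,
  "fam1", "p1",  "mo1", "fa1", 1, 2,
  "fam1", "p2",  "mo1", "fa1", 2, 2,
  "fam1", "mo1", "0",   "0",   2, 1,
  "fam1", "fa1", "0",   "0",   1, 1
)

# Expected result: each proband anchors its own trio, the family ID gains a
# per-proband suffix, and both parents are duplicated into each trio.
expected <- tribble(
  ~FamID,   ~IndividualID, ~MotherID, ~FatherID, ~Gender, ~Affected,
  "fam1_1", "p1",  "mo1", "fa1", 1, 2,
  "fam1_1", "mo1", "0",   "0",   2, 1,
  "fam1_1", "fa1", "0",   "0",   1, 1,
  "fam1_2", "p2",  "mo1", "fa1", 2, 2,
  "fam1_2", "mo1", "0",   "0",   2, 1,
  "fam1_2", "fa1", "0",   "0",   1, 1
)
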
66 changes: 66 additions & 0 deletions src/denovo/create_per_sample_bed.R
@@ -0,0 +1,66 @@
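# Expands the input BED (columns: chrom, start, end, name, annotation, svtype) into
# per-sample records for a single chromosome, one row per sample in the ped file.
# The proband output is keyed by chrom_SVTYPE_sample and the parents output by
# chrom_SVTYPE_FamID.
# Args: <input_bed> <proband_output_bed> <parents_output_bed> <ped_file> <chromosome>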
require(plyr)
library(data.table)
library(tidyverse)


args = commandArgs(trailingOnly=TRUE)
input_gd <- args[1]
proband_gd <- args[2]
parents_gd <- args[3]
ped <- args[4]
chromosome <- args[5]

df_ori <- as.data.frame(read.table(input_gd, header = FALSE, sep="\t",stringsAsFactors=FALSE, quote=""))
colnames(df_ori) <- c("chrom", "start", "end", "name", "annotation", "svtype")
df <- subset(df_ori, chrom == chromosome)

# Records annotated 'DEL;DUP' are carried into both the DUP and DEL subsets
df_DUP <- subset(df, svtype == 'DUP' | svtype == 'DEL;DUP')
if (nrow(df_DUP) > 0) {
  df_DUP$svtype <- 'DUP'
}

df_DEL <- subset(df, svtype == 'DEL' | svtype == 'DEL;DUP')
if (nrow(df_DEL) > 0) {
  df_DEL$svtype <- 'DEL'
}

if (nrow(df_DEL) > 0 & nrow(df_DUP) > 0) {
df2 = rbind(df_DUP,df_DEL)
} else if (nrow(df_DEL) > 0) {
df2 = df_DEL
} else if (nrow(df_DUP) > 0) {
df2 = df_DUP
}

# df2 exists only if at least one DEL or DUP record was found on this chromosome
if (exists('df2')) {
ped <- as.data.frame(read.table(ped, header = TRUE, sep="\t",stringsAsFactors=FALSE, quote=""))
samples <- ped$IndividualID
df2$samples <- gsub(" ", "",toString(samples))

df2 %>%
subset(select = c(chrom, start, end, name, svtype, samples)) %>%
separate_rows(samples, sep = ",", convert = T) -> df3

df3$chrom <- paste0(df3$chrom, "_", df3$svtype, "_", df3$samples)
df3 <- df3[, c(1,2,3,4)]

ped_subset <- subset(ped, select = c('FamID', 'IndividualID'))
colnames(ped_subset) <- c('FamID', 'samples')

df2 %>%
subset(select = c(chrom, start, end, name, svtype, samples)) %>%
separate_rows(samples, sep = ",", convert = T) -> new_df

new_df <- merge(new_df,ped_subset, by="samples", all.x = TRUE)
new_df$chrom <- paste0(new_df$chrom, "_", new_df$svtype, "_", new_df$FamID)
new_df <- new_df[, c(2,3,4,5)]

#df4 = rbind(df3,new_df)

write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
write.table(new_df, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
} else{
df3 <- data.frame()
write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
write.table(df3, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
}
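
For illustration, a minimal sketch (hypothetical region and sample IDs) of the per-sample expansion performed above for the proband file; the parents file is built the same way but keyed by FamID instead of sample ID:

library(tidyverse)

region <- tibble(chrom = "chr1", start = 1000, end = 5000,
                 name = "region_1", svtype = "DEL",
                 samples = "sampleA,sampleB")

region %>%
  separate_rows(samples, sep = ",") %>%
  mutate(chrom = paste0(chrom, "_", svtype, "_", samples)) %>%
  select(chrom, start, end, name)
# -> two BED rows keyed "chr1_DEL_sampleA" and "chr1_DEL_sampleB"
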
33 changes: 33 additions & 0 deletions src/denovo/denovo_outliers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
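# Flags samples with an outlying number of SV calls in the input de novo BED.
# Kept samples are written to final.denovo.merged.bed, outlier samples to
# final.denovo.merged.outliers.bed, and all samples (with outlier samples' de novo
# calls flagged as 'sample_outlier') to final.denovo.merged.allSamples.bed.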
import argparse
import pandas as pd
import numpy as np
import math

parser = argparse.ArgumentParser(description='Parse arguments')
parser.add_argument('--bed', dest='bed', help='Input BED file')
args = parser.parse_args()

bed_file = args.bed

bed_original = pd.read_csv(bed_file, sep='\t', index_col=False).replace(np.nan, '', regex=True)

# Remove outlier samples: a sample is an outlier if its number of calls in the BED
# exceeds three times the upper Tukey fence (Q3 + 1.5 * IQR) of the per-sample counts
bed = bed_original
sample_counts = bed['sample'].value_counts()
q3, q1 = np.percentile(sample_counts.tolist(), [75, 25])
iqr = q3 - q1
count_threshold = int(math.ceil((1.5 * iqr + q3) * 3))
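# e.g. if Q1 = 10 and Q3 = 20, then IQR = 10 and the threshold is ceil((1.5 * 10 + 20) * 3) = 105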
sample_counts2 = sample_counts.rename_axis('sample').reset_index(name='count')
samples_keep = sample_counts2[(sample_counts2['count'] <= count_threshold)]['sample'].tolist()

# Keep retained samples and outlier samples in separate files
output = bed[(bed['sample'].isin(samples_keep))]
output_outliers = bed[(~bed['sample'].isin(samples_keep))]

bed_original.loc[~(bed_original['sample'].isin(samples_keep)) & bed_original['is_de_novo'], 'filter_flag'] = 'sample_outlier'

# Write output
output.to_csv(path_or_buf='final.denovo.merged.bed', mode='a', index=False, sep='\t', header=True)
output_outliers.to_csv(path_or_buf='final.denovo.merged.outliers.bed', mode='a', index=False, sep='\t', header=True)

bed_original.to_csv(path_or_buf='final.denovo.merged.allSamples.bed', mode='a', index=False, sep='\t', header=True)