-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a preview version of the de-novo pipeline (#641)
* Add de-novo pipeline * Remove duplicate declarations of runtime_override_shard_vcf. * Fix linting errors. * Fix linting errors. * Fix a typo. * Refactor input_size var & limit it to only the largest files. * Delete a comment. * Move VcfToBed to Utils.wdl. * Replace RawVcfToBed with VcfToBed. * Remove the extra flag. * Prefix BED/CSV column names with `#` for consistency with the rest of the pipeline. * Delete unused task. * Delete unused output. * Remove unused input. * Fix a leftover from replacing shards_string with shards. * Replace RawMergeBed with TasksMakeCohortVcf.CatUncompressedFiles. * Revert adding # to the column header. * Replace grep with awk for better error handling. * Replace GetBatchedVcf with SubsetVcfBySamplesList for performance.
- Loading branch information
Showing
12 changed files
with
2,615 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
ARG SV_PIPELINE_IMAGE=sv-pipeline:latest | ||
FROM $SV_PIPELINE_IMAGE | ||
|
||
COPY /src/denovo /src/denovo/ | ||
|
||
# make & cpp are needed for installing `tidyverse`. | ||
RUN apt-get -qqy update && \ | ||
apt-get -qqy install ca-certificates build-essential && \ | ||
Rscript -e "\ | ||
install.packages( \ | ||
c( \ | ||
'BiocManager', \ | ||
'Rcpp', \ | ||
'tidyverse', \ | ||
'R.utils', \ | ||
'UpSetR', \ | ||
'ggplotify' \ | ||
), \ | ||
repos = 'https://cran.rstudio.com', \ | ||
quiet = TRUE \ | ||
)" && \ | ||
pip install fsspec gcsfs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# This script creates a ped file with all families >3 into trios | ||
# to match desired input for de novo SV filtering | ||
|
||
# Define args and load libraries | ||
args = commandArgs(trailingOnly=TRUE) | ||
require(plyr) | ||
library(data.table) | ||
library(tidyverse) | ||
|
||
input_ped <- args[1] | ||
|
||
# Read input ped | ||
ped <- fread(input_ped,header = TRUE) | ||
|
||
# Add a column with size of family | ||
ped %>% | ||
group_by(FamID) %>% | ||
mutate(FamSize = n()) -> ped2 | ||
|
||
# Steph's code | ||
distfam<-ped2%>%distinct(FamID,FamSize) | ||
fam_breakdown<-distfam%>%group_by(FamSize)%>%tally() | ||
|
||
# subset the families > 3 | ||
large_families<-ped2%>%filter(FamSize>=4) | ||
large_family_ids <- large_families$FamID # FamIDs of families >3 | ||
|
||
large_families_prob<-large_families%>%filter(MotherID!=0|FatherID!=0) # probands from families > 3 | ||
large_families_prob$newFamID <- NA | ||
families<-unique(large_families_prob$FamID) | ||
|
||
# function that splits by probands, adds _# to the end of the FamID and then finds the parents and adds _# to them as well | ||
split_fam<-function(probands,pedigree){ | ||
trio <- data.frame(matrix(ncol = 8, nrow = 0)) | ||
families<-unique(probands$FamID) | ||
for (i in 1:length(families)){ | ||
# reassign family IDs for >=quads (fam1 => fam1_1, fam1_2) | ||
perfam<-subset(probands,FamID==families[i]) | ||
for (x in 1:nrow(perfam)){ | ||
probands$newFamID[probands$IndividualID==as.character(perfam[x,2])]<-paste0(probands$FamID[probands$IndividualID==as.character(perfam[x,2])],"_",x) | ||
} | ||
} | ||
# each proband row should now have a new fam_id | ||
# pedigree needs a new column | ||
pedigree$newFamID<-NA | ||
# now go back and pick up parents info and rename famID of parents | ||
for (y in 1:nrow(probands)){ | ||
paternal<-subset(pedigree, IndividualID == as.character(probands[y, 3])) | ||
paternal$newFamID <- as.character(probands[y,8]) | ||
maternal<-subset(pedigree, IndividualID == as.character(probands[y, 4])) | ||
maternal$newFamID<-as.character(probands[y,8]) | ||
trio<-rbind(data.frame(trio), data.frame(probands[y,]),data.frame(paternal),data.frame(maternal)) | ||
} | ||
return(trio) | ||
} | ||
|
||
output<-split_fam(large_families_prob, large_families) | ||
|
||
# subset out newFamID column and rename FamID | ||
new_output <- subset(output, select=c("newFamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected")) | ||
colnames(new_output)[1] <- "FamID" | ||
|
||
# subset to just families with all three in the new ped | ||
new_output %>% | ||
group_by(FamID) %>% | ||
mutate(FamSize = n()) %>% | ||
filter(FamSize==3)-> trios | ||
|
||
new_trios <- subset(trios, select=c("FamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected")) | ||
|
||
# If mom and dad are not in original ped file as an individual, remove them so that they do not cause an error in the de novo script | ||
new_trios$FatherID[!(new_trios$FatherID %in% ped$IndividualID)] <- 0 | ||
new_trios$MotherID[!(new_trios$MotherID %in% ped$IndividualID)] <- 0 | ||
|
||
# subset ped to only include families that are not > 3 | ||
ped_without_large_families <- subset(ped, !(FamID %in% large_family_ids)) | ||
|
||
# If mom and dad are not in original ped file as an individual, remove them so that they do not cause an error in the de novo script | ||
ped_without_large_families$FatherID[!(ped_without_large_families$FatherID %in% ped$IndividualID)] <- 0 | ||
ped_without_large_families$MotherID[!(ped_without_large_families$MotherID %in% ped$IndividualID)] <- 0 | ||
|
||
# concat new trio pedigree and old pedigree without families > 3 | ||
ped_without_large_families_df <- data.frame(ped_without_large_families) | ||
new_trios_df <- data.frame(new_trios) | ||
|
||
final_ped<-rbind(new_trios_df, ped_without_large_families_df) | ||
|
||
write.table(final_ped,file="cleaned_ped.txt",quote=F,row.names=F,col.names=T,sep="\t") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
require(plyr) | ||
library(data.table) | ||
library(tidyverse) | ||
|
||
|
||
args = commandArgs(trailingOnly=TRUE) | ||
input_gd <- args[1] | ||
proband_gd <- args[2] | ||
parents_gd <- args[3] | ||
ped <- args[4] | ||
chromosome <- args[5] | ||
|
||
df_ori <- as.data.frame(read.table(input_gd, header = FALSE, sep="\t",stringsAsFactors=FALSE, quote="")) | ||
colnames(df_ori) <- c("chrom", "start", "end", "name", "annotation", "svtype") | ||
df <- subset(df_ori, chrom == chromosome) | ||
|
||
df_DUP <- subset(df, svtype == 'DUP' | svtype == 'DEL;DUP') | ||
if (nrow(df_DUP) > 0) { | ||
df_DUP$svtype <- 'DUP' | ||
} | ||
|
||
df_DEL <- subset(df, svtype == 'DEL' | svtype == 'DEL;DUP') | ||
if (nrow(df_DEL) > 0) { | ||
df_DEL$svtype <- 'DEL' | ||
} | ||
|
||
if (nrow(df_DEL) > 0 & nrow(df_DUP) > 0) { | ||
df2 = rbind(df_DUP,df_DEL) | ||
} else if (nrow(df_DEL) > 0) { | ||
df2 = df_DEL | ||
} else if (nrow(df_DUP) > 0) { | ||
df2 = df_DUP | ||
} | ||
|
||
if (exists('df2')) { | ||
ped <- as.data.frame(read.table(ped, header = TRUE, sep="\t",stringsAsFactors=FALSE, quote="")) | ||
samples <- ped$IndividualID | ||
df2$samples <- gsub(" ", "",toString(samples)) | ||
|
||
df2 %>% | ||
subset(select = c(chrom, start, end, name, svtype, samples)) %>% | ||
separate_rows(samples, sep = ",", convert = T) -> df3 | ||
|
||
df3$chrom <- paste0(df3$chrom, "_", df3$svtype, "_", df3$samples) | ||
df3 <- df3[, c(1,2,3,4)] | ||
|
||
ped_subset <- subset(ped, select = c('FamID', 'IndividualID')) | ||
colnames(ped_subset) <- c('FamID', 'samples') | ||
|
||
df2 %>% | ||
subset(select = c(chrom, start, end, name, svtype, samples)) %>% | ||
separate_rows(samples, sep = ",", convert = T) -> new_df | ||
|
||
new_df <- merge(new_df,ped_subset, by="samples", all.x = TRUE) | ||
new_df$chrom <- paste0(new_df$chrom, "_", new_df$svtype, "_", new_df$FamID) | ||
new_df <- new_df[, c(2,3,4,5)] | ||
|
||
#df4 = rbind(df3,new_df) | ||
|
||
write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F) | ||
write.table(new_df, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F) | ||
} else{ | ||
df3 <- data.frame() | ||
write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F) | ||
write.table(df3, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import argparse | ||
import pandas as pd | ||
import numpy as np | ||
import math | ||
|
||
parser = argparse.ArgumentParser(description='Parse arguments') | ||
parser.add_argument('--bed', dest='bed', help='Input BED file') | ||
args = parser.parse_args() | ||
|
||
bed_file = args.bed | ||
|
||
bed_original = pd.read_csv(bed_file, sep='\t', index_col=False).replace(np.nan, '', regex=True) | ||
|
||
# Remove outliers | ||
bed = bed_original | ||
sample_counts = bed['sample'].value_counts() | ||
q3, q1 = np.percentile(sample_counts.tolist(), [75, 25]) | ||
iqr = q3 - q1 | ||
count_threshold = int(math.ceil((1.5 * iqr + q3) * 3)) | ||
sample_counts2 = sample_counts.rename_axis('sample').reset_index(name='count') | ||
samples_keep = sample_counts2[(sample_counts2['count'] <= count_threshold)]['sample'].tolist() | ||
|
||
# Keep samples and outliers in sepparate files | ||
output = bed[(bed['sample'].isin(samples_keep))] | ||
output_outliers = bed[(~bed['sample'].isin(samples_keep))] | ||
|
||
bed_original.loc[~(bed_original['sample'].isin(samples_keep)) & bed_original['is_de_novo'], 'filter_flag'] = 'sample_outlier' | ||
|
||
# Write output | ||
output.to_csv(path_or_buf='final.denovo.merged.bed', mode='a', index=False, sep='\t', header=True) | ||
output_outliers.to_csv(path_or_buf='final.denovo.merged.outliers.bed', mode='a', index=False, sep='\t', header=True) | ||
|
||
bed_original.to_csv(path_or_buf='final.denovo.merged.allSamples.bed', mode='a', index=False, sep='\t', header=True) |
Oops, something went wrong.