diff --git a/dockerfiles/denovo/Dockerfile b/dockerfiles/denovo/Dockerfile
new file mode 100644
index 000000000..8416b9133
--- /dev/null
+++ b/dockerfiles/denovo/Dockerfile
@@ -0,0 +1,22 @@
+ARG SV_PIPELINE_IMAGE=sv-pipeline:latest
+FROM $SV_PIPELINE_IMAGE
+
+COPY /src/denovo /src/denovo/
+
+# make & cpp are needed for installing `tidyverse`.
+RUN apt-get -qqy update && \
+    apt-get -qqy install ca-certificates build-essential && \
+    Rscript -e "\
+        install.packages( \
+            c( \
+                'BiocManager', \
+                'Rcpp', \
+                'tidyverse', \
+                'R.utils', \
+                'UpSetR', \
+                'ggplotify' \
+            ), \
+            repos = 'https://cran.rstudio.com', \
+            quiet = TRUE \
+        )" && \
+    pip install fsspec gcsfs
diff --git a/scripts/docker/build_docker.py b/scripts/docker/build_docker.py
index 16983362c..16d09dcb9 100755
--- a/scripts/docker/build_docker.py
+++ b/scripts/docker/build_docker.py
@@ -244,6 +244,11 @@ class to track dependencies, control build and push of entire job
             docker_dependencies={
                 "samtools-cloud": "SAMTOOLS_CLOUD_IMAGE",
                 "sv-utils-env": "VIRTUAL_ENV_IMAGE"}
+        ),
+        "denovo": ImageDependencies(
+            git_dependencies=("dockerfiles/denovo/*", "src/denovo/*"),
+            docker_dependencies={
+                "sv-pipeline": "SV_PIPELINE_IMAGE"}
         )
     }
     non_public_images = frozenset({"melt"})
diff --git a/src/denovo/clean_ped.R b/src/denovo/clean_ped.R
new file mode 100644
index 000000000..19ca64ef6
--- /dev/null
+++ b/src/denovo/clean_ped.R
@@ -0,0 +1,88 @@
+# This script creates a ped file with all families >3 into trios
+# to match desired input for de novo SV filtering
+
+# Define args and load libraries
+args = commandArgs(trailingOnly=TRUE)
+require(plyr)
+library(data.table)
+library(tidyverse)
+
+input_ped <- args[1]
+
+# Read input ped
+ped <- fread(input_ped,header = TRUE)
+
+# Add a column with size of family
+ped %>%
+  group_by(FamID) %>%
+  mutate(FamSize = n()) -> ped2
+
+# Steph's code
+distfam<-ped2%>%distinct(FamID,FamSize)
+fam_breakdown<-distfam%>%group_by(FamSize)%>%tally()
+
+# subset the families > 3
+large_families<-ped2%>%filter(FamSize>=4)
+large_family_ids <- large_families$FamID # FamIDs of families >3
+
+large_families_prob<-large_families%>%filter(MotherID!=0|FatherID!=0) # probands from families > 3
+large_families_prob$newFamID <- NA
+families<-unique(large_families_prob$FamID)
+
+# function that splits by probands, adds _# to the end of the FamID and then finds the parents and adds _# to them as well
+split_fam<-function(probands,pedigree){
+  trio <- data.frame(matrix(ncol = 8, nrow = 0))
+  families<-unique(probands$FamID)
+  for (i in 1:length(families)){
+    # reassign family IDs for >=quads (fam1 => fam1_1, fam1_2)
+    perfam<-subset(probands,FamID==families[i])
+    for (x in 1:nrow(perfam)){
+      probands$newFamID[probands$IndividualID==as.character(perfam[x,2])]<-paste0(probands$FamID[probands$IndividualID==as.character(perfam[x,2])],"_",x)
+    }
+  }
+  # each proband row should now have a new fam_id
+  # pedigree needs a new column
+  pedigree$newFamID<-NA
+  # now go back and pick up parents info and rename famID of parents
+  for (y in 1:nrow(probands)){
+    paternal<-subset(pedigree, IndividualID == as.character(probands[y, 3]))
+    paternal$newFamID <- as.character(probands[y,8])
+    maternal<-subset(pedigree, IndividualID == as.character(probands[y, 4]))
+    maternal$newFamID<-as.character(probands[y,8])
+    trio<-rbind(data.frame(trio), data.frame(probands[y,]),data.frame(paternal),data.frame(maternal))
+  }
+  return(trio)
+}
+
+output<-split_fam(large_families_prob, large_families)
+
+# subset out newFamID column and rename FamID
+new_output <- subset(output, select=c("newFamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))
+colnames(new_output)[1] <- "FamID"
+
+# subset to just families with all three in the new ped
+new_output %>%
+  group_by(FamID) %>%
+  mutate(FamSize = n()) %>%
+  filter(FamSize==3)-> trios
+
+new_trios <- subset(trios, select=c("FamID", "IndividualID", "MotherID", "FatherID", "Gender", "Affected"))
+
+# If mom and dad are not in original ped file as an individual, remove them so that they do not cause an error in the de novo script
+new_trios$FatherID[!(new_trios$FatherID %in% ped$IndividualID)] <- 0
+new_trios$MotherID[!(new_trios$MotherID %in% ped$IndividualID)] <- 0
+
+# subset ped to only include families that are not > 3
+ped_without_large_families <- subset(ped, !(FamID %in% large_family_ids))
+
+# If mom and dad are not in original ped file as an individual, remove them so that they do not cause an error in the de novo script
+ped_without_large_families$FatherID[!(ped_without_large_families$FatherID %in% ped$IndividualID)] <- 0
+ped_without_large_families$MotherID[!(ped_without_large_families$MotherID %in% ped$IndividualID)] <- 0
+
+# concat new trio pedigree and old pedigree without families > 3
+ped_without_large_families_df <- data.frame(ped_without_large_families)
+new_trios_df <- data.frame(new_trios)
+
+final_ped<-rbind(new_trios_df, ped_without_large_families_df)
+
+write.table(final_ped,file="cleaned_ped.txt",quote=F,row.names=F,col.names=T,sep="\t")
diff --git a/src/denovo/create_per_sample_bed.R b/src/denovo/create_per_sample_bed.R
new file mode 100644
index 000000000..da955ad69
--- /dev/null
+++ b/src/denovo/create_per_sample_bed.R
@@ -0,0 +1,66 @@
+require(plyr)
+library(data.table)
+library(tidyverse)
+
+
+args = commandArgs(trailingOnly=TRUE)
+input_gd <- args[1]
+proband_gd <- args[2]
+parents_gd <- args[3]
+ped <- args[4]
+chromosome <- args[5]
+
+df_ori <- as.data.frame(read.table(input_gd, header = FALSE, sep="\t",stringsAsFactors=FALSE, quote=""))
+colnames(df_ori) <- c("chrom", "start", "end", "name", "annotation", "svtype")
+df <- subset(df_ori, chrom == chromosome)
+
+df_DUP <- subset(df, svtype == 'DUP' | svtype == 'DEL;DUP')
+if (nrow(df_DUP) > 0) {
+  df_DUP$svtype <- 'DUP'
+}
+
+df_DEL <- subset(df, svtype == 'DEL' | svtype == 'DEL;DUP')
+if (nrow(df_DEL) > 0) {
+  df_DEL$svtype <- 'DEL'
+}
+
+if (nrow(df_DEL) > 0 & nrow(df_DUP) > 0) {
+  df2 = rbind(df_DUP,df_DEL)
+} else if (nrow(df_DEL) > 0) {
+  df2 = df_DEL
+} else if (nrow(df_DUP) > 0) {
+  df2 = df_DUP
+}
+
+if (exists('df2')) {
+  ped <- as.data.frame(read.table(ped, header = TRUE, sep="\t",stringsAsFactors=FALSE, quote=""))
+  samples <- ped$IndividualID
+  df2$samples <- gsub(" ", "",toString(samples))
+
+  df2 %>%
+    subset(select = c(chrom, start, end, name, svtype, samples)) %>%
+    separate_rows(samples, sep = ",", convert = T) -> df3
+
+  df3$chrom <- paste0(df3$chrom, "_", df3$svtype, "_", df3$samples)
+  df3 <- df3[, c(1,2,3,4)]
+
+  ped_subset <- subset(ped, select = c('FamID', 'IndividualID'))
+  colnames(ped_subset) <- c('FamID', 'samples')
+
+  df2 %>%
+    subset(select = c(chrom, start, end, name, svtype, samples)) %>%
+    separate_rows(samples, sep = ",", convert = T) -> new_df
+
+  new_df <- merge(new_df,ped_subset, by="samples", all.x = TRUE)
+  new_df$chrom <- paste0(new_df$chrom, "_", new_df$svtype, "_", new_df$FamID)
+  new_df <- new_df[, c(2,3,4,5)]
+
+  #df4 = rbind(df3,new_df)
+
+  write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
+  write.table(new_df, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
+} else{
+  df3 <- data.frame()
+  write.table(df3, proband_gd, sep = "\t", quote = F, row.names = F, col.names = F)
+  write.table(df3, parents_gd, sep = "\t", quote = F, row.names = F, col.names = F)
+}
\ No newline at end of file
diff --git a/src/denovo/denovo_outliers.py b/src/denovo/denovo_outliers.py
new file mode 100644
index 000000000..2d3d08b51
--- /dev/null
+++ b/src/denovo/denovo_outliers.py
@@ -0,0 +1,33 @@
+import argparse
+import pandas as pd
+import numpy as np
+import math
+
+parser = argparse.ArgumentParser(description='Parse arguments')
+parser.add_argument('--bed', dest='bed', help='Input BED file')
+args = parser.parse_args()
+
+bed_file = args.bed
+
+bed_original = pd.read_csv(bed_file, sep='\t', index_col=False).replace(np.nan, '', regex=True)
+
+# Remove outliers
+bed = bed_original
+sample_counts = bed['sample'].value_counts()
+q3, q1 = np.percentile(sample_counts.tolist(), [75, 25])
+iqr = q3 - q1
+count_threshold = int(math.ceil((1.5 * iqr + q3) * 3))
+sample_counts2 = sample_counts.rename_axis('sample').reset_index(name='count')
+samples_keep = sample_counts2[(sample_counts2['count'] <= count_threshold)]['sample'].tolist()
+
+# Keep samples and outliers in sepparate files
+output = bed[(bed['sample'].isin(samples_keep))]
+output_outliers = bed[(~bed['sample'].isin(samples_keep))]
+
+bed_original.loc[~(bed_original['sample'].isin(samples_keep)) & bed_original['is_de_novo'], 'filter_flag'] = 'sample_outlier'
+
+# Write output
+output.to_csv(path_or_buf='final.denovo.merged.bed', mode='a', index=False, sep='\t', header=True)
+output_outliers.to_csv(path_or_buf='final.denovo.merged.outliers.bed', mode='a', index=False, sep='\t', header=True)
+
+bed_original.to_csv(path_or_buf='final.denovo.merged.allSamples.bed', mode='a', index=False, sep='\t', header=True)
diff --git a/src/denovo/denovo_sv_plots.R b/src/denovo/denovo_sv_plots.R
new file mode 100644
index 000000000..5cd7171e0
--- /dev/null
+++ b/src/denovo/denovo_sv_plots.R
@@ -0,0 +1,385 @@
+# Description: Post-hoc QC analysis of filtered de novo SVs
+
+
+# Load libraries
+library(tidyr)
+library(ggplot2)
+library(UpSetR)
+library(data.table)
+library(dplyr)
+library(gridExtra)
+library(grid)
+library(tidyverse)
+library("grid")
+library("ggplotify")
+library(dplyr)
+
+args = commandArgs(trailingOnly=TRUE)
+
+input_bed <- args[1]
+input_ped <- args[2]
+out_file <- args[3]
+
+#input_outliers <-args [2]
+#input_ped <- args[3]
+#out_file <- args[4]
+
+ped <- fread(input_ped)
+
+denovo <- as.data.frame(read.table(input_bed,header = TRUE, sep="\t",stringsAsFactors=FALSE, quote=""))
+#outliers <- as.data.frame(read.table(input_outliers,header = TRUE, sep="\t",stringsAsFactors=FALSE, quote=""))
+
+denovo$SVTYPE <- factor(denovo$SVTYPE, levels = rev(c("DEL", "DUP", "INS", "INV", "CPX", "CTX")))
+
+denovo_ins <- subset(denovo, SVTYPE == "INS")
+denovo_del <- subset(denovo, SVTYPE == "DEL")
+denovo_dup <- subset(denovo, SVTYPE == "DUP")
+denovo_inv <- subset(denovo, SVTYPE == "INV")
+denovo_cpx <- subset(denovo, SVTYPE == "CPX")
+denovo_ctx <- subset(denovo, SVTYPE == "CTX")
+
+# Define colors
+del_col <- "#D43925"
+dup_col <- "#2376B2"
+ins_col <- "#D474E0"
+inv_col <- "#FA9627"
+ctx_col <- "#638E6C"
+cpx_col <- "#4DA1A9"
+
+# Get some numbers
+length(unique(denovo$name)) #of unique denovos
+length(unique(denovo$sample)) #number of samples with denovos
+
+#length(unique(outliers$name))
+#length(unique(outliers$sample))
+#outliers_in_gd <- subset(outliers, select= c(name, in_gd))
+
+denovo %>%
+  # subset(chrom != "chrX") %>%
+  select(sample, name) %>%
+  group_by(sample) %>%
+  tally() -> sample_count
+
+denovo %>% 
+  count(sample, SVTYPE, name= "svtype_per_sample") -> type
+
+type_boxplot <- ggplot(type, aes(x=SVTYPE, fill = SVTYPE, y=svtype_per_sample)) + 
+  geom_jitter(position = position_jitter(seed = 1, width = 0.2), color = 'grey', size = 4.0) + geom_boxplot(outlier.shape=NA) + labs(title = "Number of De Novo SVs per Type", y = "Number of de novo SVs", x = "SV Type") + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25))
+ggsave("type_boxplot.png", type_boxplot, width = 30, height = 40, limitsize = FALSE) 
+
+sample_boxplot <- ggplot(sample_count, aes(x=factor(0), y=n)) + 
+  geom_jitter(position = position_jitter(seed = 1, width = 0.2), color = 'gray47', size = 4.0) + geom_boxplot(outlier.shape=NA) + labs(title = "Number of de Novo SVs per Sample", y = "Number of de novo SVs", x = "Samples") + scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25))
+ggsave("sample_boxplot.png", sample_boxplot, width = 30, height = 35, limitsize = FALSE) 
+
+sample_count %>%
+  ggplot(aes(y = n)) +
+  geom_boxplot() -> boxplot
+
+# Make plots
+denovo$chrom <- factor(denovo$chrom, levels = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX"))
+
+# Count by SV Type
+denovo %>%
+  select(SVTYPE, name, SVTYPE) %>%
+  unique() %>%
+  ggplot(aes(x = SVTYPE, fill = SVTYPE)) +
+  geom_bar() +
+  labs(x="SV Type", y="Number de novo SVs", title= "Number of De Novo SVs per SV Type") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  scale_x_discrete(limits = rev(levels(denovo$SVTYPE))) +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_type_count
+p_type_count
+
+ggsave("per_type.png", p_type_count, width = 30, height = 35, limitsize = FALSE)
+
+# Count by chromosome
+denovo %>%
+  select(chrom, name, SVTYPE) %>%
+  unique() %>%
+  ggplot(aes(x = chrom, fill = SVTYPE)) +
+  geom_bar() +
+  labs(x="Chromosome", y="Number de novo SVs", title= "Number of De Novo SVs per Chromosome") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_chr_count
+p_chr_count
+
+ggsave("per_chrom.png", p_chr_count, width = 30, height = 45, limitsize = FALSE)
+
+# Count by sample
+df <- as.data.frame(table(denovo$sample))
+df$samples <- as.character(df$Var1)
+df$num_of_denovos <- as.numeric(as.character(df$Freq))
+df$samples <- as.factor(df$samples)
+
+p <- ggplot(df, aes(y=num_of_denovos)) + geom_boxplot() + labs(title = "Number of de novo SVs per sample", y = "Number of de novos", x = "Samples") + theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25))
+ggsave("per_sample.png", p, width = 30, height = 35, limitsize = FALSE)
+
+# De novo count by allele frequency
+denovo$AF <- as.numeric(denovo$AF)  
+denovo$AC <- as.numeric(denovo$AC) 
+ac_1_freq <- min(subset(denovo, AC == 1)$AF)
+max_AF_str <- toString(round(max(denovo$AF), digits=3))
+# denovo$AF_interv <- cut(denovo$AF, breaks = c(0, ac_1_freq, 0.001, 0.01, 0.03, max(denovo$AF)),
+#                         labels = c("AC=1", "AC 1 - AF<=0.001", "AF 0.001-0.01", "AF 0.01-0.03", "AF>0.03"))
+denovo$AF_interv <- cut(denovo$AF, breaks = c(0, ac_1_freq, 0.001, max(denovo$AF)),
+                        labels = c("AC=1", "AC 1-0.001", "AF>0.001"))
+
+denovo %>%
+  select(AF_interv, name, SVTYPE) %>%
+  unique() %>%
+  ggplot(aes(x = AF_interv, fill = SVTYPE)) +
+  geom_bar() +
+  # scale_y_log10()+
+  labs(x="Allele Frequency", y="Number de novo SVs", title="Number of De Novo\n SVs by Frequency") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_af_count
+ggsave("per_freq.png", p_af_count, width = 30, height = 35, limitsize = FALSE)
+
+denovo_in_gd <- subset(denovo, in_gd == "True")
+denovo_not_in_gd <- subset(denovo, in_gd == "False")
+
+denovo_in_gd %>%
+  select(AF_interv, name, SVTYPE) %>%
+  unique() %>%
+  ggplot(aes(x = AF_interv, fill = SVTYPE)) +
+  geom_bar() +
+  # scale_y_log10()+
+  labs(x="Allele Frequency", y="Number de novo SVs", title= "Number of De Novo SVs \nin Genomic Disorder Regions by Frequency") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_af_count_in_gd
+ggsave("per_freq_gd.png", p_af_count_in_gd, width = 30, height = 35, limitsize = FALSE)
+
+denovo_not_in_gd %>%
+  select(AF_interv, name, SVTYPE) %>%
+  unique() %>%
+  ggplot(aes(x = AF_interv, fill = SVTYPE)) +
+  geom_bar() +
+  # scale_y_log10()+
+  labs(x="Allele Frequency", y="Number de novo SVs", title= "Number of De Novo SVs not in \n Genomic Disorder Regions by Frequency") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_af_count_not_in_gd
+ggsave("per_freq_not_gd.png", p_af_count_not_in_gd, width = 30, height = 35, limitsize = FALSE) 
+
+# De novo count by size
+denovo$SVLEN <- as.numeric(denovo$SVLEN)  
+#denovo[denovo$SVLEN < 1,]$SVLEN <- 1
+denovo$SVLEN_interv <- cut(denovo$SVLEN, breaks = c(0, 100, 500, 2500, 10000, 50000, max(denovo$SVLEN)),
+                           labels = c("<100bp", "101-500bp", "501bp-2.5kb", "2.5kb-10kb", "10kb-50kb", ">50kb"))
+denovo %>%
+  select(SVLEN_interv, name, SVTYPE) %>%
+  unique() %>%
+  ggplot(aes(x = SVLEN_interv, fill = SVTYPE)) +
+  geom_bar() +
+  # scale_y_log10()+
+  labs(x="Size of SV", y="Number de novo SVs", title= "De Novo SVs by Size") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_size_count
+ggsave("size.png", p_size_count, width = 25, height = 20) 
+
+##Evidence plot
+SR_denovo <- subset(denovo, EVIDENCE_FIX == "SR", select = c(ALGORITHMS, GQ))
+caller_quality_denovos <- subset(denovo, select = c(name,ALGORITHMS, GQ, EVIDENCE))
+
+denovo %>%
+  select(name, SVTYPE, EVIDENCE_FIX) %>%
+  unique() %>%
+  ggplot(aes(x = forcats::fct_infreq(EVIDENCE_FIX), fill = SVTYPE)) +
+  geom_bar() +
+  # scale_y_log10()+
+  labs(x="Evidence", y="Number de novo SVs", title= "De Novo SVs by Evidence") +
+  scale_fill_manual(values = rev(c("DEL"=del_col, "DUP"=dup_col, "INS"=ins_col, "INV"=inv_col, "CPX"=cpx_col, "CTX"=ctx_col))) + 
+  theme_classic() +
+  theme(
+    legend.position = "right",
+    axis.text = element_text(size = 70),
+    axis.title = element_text(size = 70),
+    plot.title = element_text(size=70),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    panel.border = element_blank(),
+    axis.line = element_line(colour = "black"),
+    legend.title = element_text(size=30),
+    legend.text = element_text(size=25)) -> p_evidence
+ggsave("evidence.png", p_evidence, width = 25, height = 20) 
+
+# Annotation plot
+annot <- grep("_", names(denovo), value = T)
+annot <- grep("LINCRNA", annot, value = T, invert = T)
+# annot <- annot[!annot %in% c("PROTEIN_CODING__NEAREST_TSS", "PROTEIN_CODING__INTERGENIC")]
+
+lof <- grep("LOF", annot, value = T)
+cg <- grep("COPY_GAIN", annot, value = T)
+msv <- grep("MSV_EXON_OVR", annot, value = T)
+inv_span <- grep("INV_SPAN", annot, value = T)
+dup_partial <- grep("DUP_PARTIAL", annot, value = T)
+intronic <- grep("INTRONIC", annot, value = T)
+utr <- grep("UTR", annot, value = T)
+promoter<- grep("PROMOTER", annot, value = T)
+nearest_tss<- grep("NEAREST_TSS", annot, value = T)
+intergenic<- grep("INTERGENIC", annot, value = T)
+coding <- grep("CODING", annot, value = T)
+coding <- coding[!coding %in% c('PROTEIN_CODING__NEAREST_TSS', "PROTEIN_CODING__INTERGENIC")]
+
+denovo$is_lof <- apply(denovo[,lof, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_cg <- apply(denovo[,cg, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_msv <- apply(denovo[,msv, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_inv_span <- apply(denovo[,inv_span, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_dup_partial <- apply(denovo[,dup_partial, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_intronic <- apply(denovo[,intronic, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_utr <- apply(denovo[,utr, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_promoter <- apply(denovo[,promoter, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_nearest_tss <- apply(denovo[,nearest_tss, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+denovo$is_intergenic <- apply(denovo[,intergenic, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == '' | r == FALSE))
+denovo$is_coding <- apply(denovo[,coding, drop = FALSE], 1, function(r) !all(is.na(r) | r == 'NA' | r == ''))
+
+keep_cols <- c(grep("SVTYPE", names(denovo)), (ncol(denovo)-9-1):ncol(denovo))
+denovo_cons <- denovo[,keep_cols]
+
+ref_cols <- grep("is_", names(denovo_cons))
+denovo_cons_type <- cbind(denovo_cons[,'SVTYPE', drop = FALSE], (denovo_cons[,ref_cols])*1)
+
+sv_cols <- list(
+  list(query = elements, 
+       params = list("SVTYPE", c("DEL","DUP", "INS", "INV", "CPX", "CTX")), color = del_col, active = T
+       , query.name = "DEL"
+  ), #red
+  list(query = elements, 
+       params = list("SVTYPE", c("DUP", "INS", "INV", "CPX", "CTX")), color = dup_col, active = T
+       , query.name = "DUP"
+  ), #blue
+  list(query = elements, 
+       params = list("SVTYPE", c("INS", "INV", "CPX", "CTX")), color = ins_col, active = T
+       , query.name = "INS"
+  ), #orange
+  list(query = elements, 
+       params = list("SVTYPE", c("INV", "CPX", "CTX")), color = inv_col, active = T
+       , query.name = "INV"
+  ), #green
+  list(query = elements, 
+       params = list("SVTYPE", "CPX", "CTX"), color = cpx_col, active = T
+       , query.name = "CPX" )
+  # ), #purple
+  #list(query = elements, 
+  #params = list("SVTYPE", "CTX"), color = ctx_col, active = T,
+  #query.name = "CTX"
+  #)
+)
+
+denovo_cons_type %>% 
+  upset(sets = grep("is_", names(denovo_cons_type), value = T),
+        #queries = sv_cols,
+        order.by = "freq", 
+        set_size.show = TRUE,
+        # set_size.scale_max = 15000, 
+        text.scale = 4, 
+        query.legend = "top",
+        ) -> p_denovo_upset_all
+
+grob_annotation_upset_plot <- as.grob(p_denovo_upset_all)
+ggsave("annotation.png", grob_annotation_upset_plot, width = 30, height = 20)
+
+# Creating a panel of plots
+lay <- rbind(c(1,1,1,2,2,2,3,3,3),
+             c(1,1,1,2,2,2,3,3,3),
+             c(4,4,4,4,4,4,4,4,4),
+             c(4,4,4,4,4,4,4,4,4),
+             c(5,5,NA,6,6,NA,7,7,NA),
+             c(5,5,NA,6,6,NA,7,7,NA),
+             c(8,8,8,8,NA,9,9,9,9),
+             c(8,8,8,8,NA,9,9,9,9),
+             c(10,10,10,10,10,10,NA,NA,NA),
+             c(10,10,10,10,10,10,NA,NA,NA))
+
+ml <- grid.arrange(sample_boxplot, type_boxplot, p_type_count, p_chr_count, p_af_count, p_af_count_in_gd, p_af_count_not_in_gd, p_size_count, p_evidence, grob_annotation_upset_plot, layout_matrix = lay, top=textGrob("De Novo SV Data", gp=gpar(fontsize=75)))
+ggsave(out_file, ml, width = 80, height = 150, limitsize = FALSE)
diff --git a/src/denovo/denovo_svs.py b/src/denovo/denovo_svs.py
new file mode 100644
index 000000000..3ba87e1b1
--- /dev/null
+++ b/src/denovo/denovo_svs.py
@@ -0,0 +1,833 @@
+"""
+SV de novo filtering script
+"""
+
+import argparse
+import yaml
+import numpy as np
+import pandas as pd
+import pybedtools
+import collections
+import time
+from subprocess import Popen, PIPE
+
+
+pd.options.mode.chained_assignment = None  # default='warn'
+
+
+def verbose_print(msg, verbose):
+    if verbose == "True":
+        print(msg)
+
+
+def get_count(row, items):
+    return sum(samp in items for samp in row['samples'].split(','))
+
+
+def get_parents_frequency(row, items):
+    return sum(s in items for s in row['samples'].split(',')) / len(items)
+
+
+def get_family_count(row, ped):
+    sample = row['sample']
+    father = ped[(ped['IndividualID'] == sample)]['FatherID'].values[0]
+    mother = ped[(ped['IndividualID'] == sample)]['MotherID'].values[0]
+    parents = [father, mother]
+    return (
+        sum(s in parents for s in row['samples'].split(','))
+    )
+
+
+def variant_info(row, field, vcf):
+    row_name = row['name']
+    samp = row['sample']
+
+    filt_pos = vcf[(vcf['ID'] == row_name)]['FORMAT'].str.split(':').tolist()
+
+    if field in filt_pos[0]:
+        idx = filt_pos[0].index(field)
+        if samp in vcf.columns:
+            samp_info = vcf[(vcf['ID'] == row_name)][samp].str.split(':').tolist()
+            samp_info_field = samp_info[0][idx]
+        else:
+            samp_info_field = 'NA'
+    else:
+        samp_info_field = 'NA'
+
+    return samp_info_field
+
+
+def add_family(row, ped):
+    sample = row['sample']
+    fam = ped[(ped['IndividualID'] == sample)]['FamID'].values
+    chr = row['chrom']
+    if fam.size != 0:
+        return '_'.join([str(fam[0]), chr])
+
+
+def get_info_parent(row, ped, vcf, parent, field):
+    row_name = row['name']
+    sample = row['sample']
+    parent_id = ped[(ped['IndividualID'] == sample)][parent].values
+
+    filt_pos = vcf[(vcf['ID'] == row_name)]['FORMAT'].str.split(':').tolist()
+
+    if field in filt_pos[0]:
+        idx = filt_pos[0].index(field)
+        parent_info = vcf[(vcf['ID'] == row_name)][parent_id[0]].str.split(':').tolist()
+        parent_info_field = parent_info[0][idx]
+    else:
+        return 'NA'
+
+    return parent_info_field
+
+
+def get_parents(proband, ped):
+    mother = ped[(ped['IndividualID'] == proband)]['MotherID'].values
+    father = ped[(ped['IndividualID'] == proband)]['FatherID'].values
+    parent_list = [mother, father]
+    return parent_list
+
+
+def get_family_id(row, ped):
+    samp = row['sample']
+    family_id = ped[(ped['IndividualID'] == samp)]['FamID'].values
+    return family_id[0]
+
+
+def get_bincov_matrix_url(sample, sample_batches, batch_bincov):
+    batch = sample_batches.loc[sample_batches['sample'] == sample]['batch'].iloc[0]
+    bincov_gs = batch_bincov.loc[batch_bincov['batch'] == batch]['bincov'].iloc[0]
+    return bincov_gs
+
+
+def write_to_filter_file(filename, header, original_df, filtered_df):
+    filename.write(header)
+    filename.write("\n")
+    original = original_df['name_famid']
+    filtered = filtered_df['name_famid']
+    to_write = list(set(original) - set(filtered))
+    filename.write(str(to_write))
+    filename.write("\n")
+
+
+def write_to_size_file(filename, header, df):
+    filename.write(header)
+    filename.write(str(len(df)))
+    filename.write("\n")
+
+
+def min_gq(row, gq_min):
+    paternal_srgq = row['paternal_srgq']
+    maternal_srgq = row['maternal_srgq']
+    paternal_pegq = row['paternal_pegq']
+    maternal_pegq = row['maternal_pegq']
+    paternal_gq = row['paternal_gq']
+    maternal_gq = row['maternal_gq']
+    minimum = min([paternal_srgq, maternal_srgq, paternal_gq, maternal_gq, paternal_pegq, maternal_pegq])
+    if minimum != '.':
+        if int(min([paternal_srgq, maternal_srgq, paternal_gq, maternal_gq, paternal_pegq, maternal_pegq])) <= gq_min:
+            return 'Remove'
+        else:
+            return 'Keep'
+    else:
+        return 'Keep'
+
+
+def convert_to_bedtool(df, cols_to_keep=None, sort=True):
+    if cols_to_keep is None:
+        string_obj = df.to_string(header=False, index=False)
+        bt_obj = pybedtools.BedTool(string_obj, from_string=True)
+        if sort:
+            string_obj = df.to_string(header=False, index=False)
+            bt_obj = pybedtools.BedTool(string_obj, from_string=True).sort()
+    else:
+        string_obj = df[cols_to_keep].to_string(header=False, index=False)
+        bt_obj = pybedtools.BedTool(string_obj, from_string=True).sort()
+    return bt_obj
+
+
+def tabix_query(filename, chrom, start, end):
+    process = Popen(['tabix', '-h', filename, f'{chrom}:{start}-{end}'], stdout=PIPE)
+    cov = []
+    for line in process.stdout:
+        cov.append(line.decode("utf-8").strip().split())
+    cov_matrix = pd.DataFrame(cov)
+    return cov_matrix
+
+
+def get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member):
+    svtype = row['svtype']
+    start = int(row['start'])
+    end = int(row['end'])
+    if svtype == 'INS':
+        start = start - 100
+        end = end + 100
+    proband = row['sample']
+    proband_matrix = get_bincov_matrix_url(proband, sample_batches, batch_bincov)
+    mom = get_parents(proband, ped)[0][0]
+    mom_matrix = get_bincov_matrix_url(mom, sample_batches, batch_bincov)
+    dad = get_parents(proband, ped)[1][0]
+    dad_matrix = get_bincov_matrix_url(dad, sample_batches, batch_bincov)
+    if family_member == 'proband':
+        return [proband, proband_matrix]
+    if family_member == 'mother':
+        return [mom, mom_matrix]
+    if family_member == 'father':
+        return [dad, dad_matrix]
+
+
+def find_coverage(row, ped, sample_batches, batch_bincov, family_member):
+    chrom = row['chrom']
+    start = int(row['start'])
+    end = int(row['end'])
+    svtype = row['svtype']
+    if svtype == 'INS':
+        start = start - 100
+        end = end + 100
+    if svtype == 'DEL' or svtype == 'DUP' or svtype == 'INS':
+        if family_member == 'mother':
+            matrix = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='mother')[1]
+        if family_member == 'father':
+            matrix = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='father')[1]
+        cov_matrix = tabix_query(matrix, chrom, start, end)
+
+        header = cov_matrix.iloc[0].to_list()  # grab the first row for the header
+        new_header = []
+        for x in header:
+            new_header.append(x)
+        cov_matrix = cov_matrix[1:]  # take the data less the header row
+        cov_matrix.columns = new_header
+        if family_member == 'mother':
+            indv = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='mother')[0]
+        if family_member == 'father':
+            indv = get_per_sample_matrix(row, ped, sample_batches, batch_bincov, family_member='father')[0]
+        sample_cov_matrix = cov_matrix[indv].tolist()
+        sample_cov_matrix_decoded = []
+        for x in sample_cov_matrix:
+            sample_cov_matrix_decoded.append(x)
+        return sample_cov_matrix_decoded
+    else:
+        return 'NA'
+
+
+def get_median_coverage(mom_matrix, dad_matrix, coverage_cutoff):
+    mom_coverage = mom_matrix
+    dad_coverage = dad_matrix
+
+    if mom_coverage == 'NA' and dad_coverage == 'NA':
+        return 'Keep'
+
+    if mom_coverage != 'NA' and dad_coverage != 'NA':
+        coverage_d = {
+            'Mother': mom_coverage,
+            'Father': dad_coverage
+        }
+
+        coverage_df = pd.DataFrame(coverage_d)
+        '''
+        if coverage_df.empty:
+            print('Coverage df is empty!')
+            exit()
+        '''
+        median = coverage_df.median()
+        median_list = median.tolist()
+        median_filtered = [x for x in median_list if x < coverage_cutoff]
+        if len(median_filtered) > 0:
+            return 'Remove'
+        else:
+            return 'Keep'
+    else:
+        return 'Remove'  # TODO: check with Alba what should this return.
+
+
+def get_cnv_intersection_depth(bed, raw, overlap):
+    intersect = bed.coverage(raw).to_dataframe(disable_auto_names=True, header=None)
+    if len(intersect) != 0:
+        names_overlap = intersect[intersect[10] > overlap][6].to_list()
+    else:
+        names_overlap = ['']
+    return names_overlap
+
+
+def get_cnv_intersection_other(bed, raw, overlap):
+    overlap = bed.intersect(raw, wo=True, f=overlap, r=True).to_dataframe(disable_auto_names=True, header=None)
+    if len(overlap) != 0:
+        names_overlap = overlap[6].to_list()
+    else:
+        names_overlap = ['']
+    return names_overlap
+
+
+def get_insertion_intersection(bed, raw):
+    overlap = bed.closest(raw).to_dataframe(disable_auto_names=True, header=None)
+    if len(overlap) != 0:
+        overlap['is_close'] = abs(overlap[8] - overlap[1]) < 100
+        if sum(bool(x) for x in overlap['is_close']) > 0:
+            names_overlap = overlap[(overlap['is_close'])][6].to_list()
+        else:
+            names_overlap = ['']
+    return names_overlap
+
+
+def main():
+    """
+    Parse arguments vcf_metrics
+    """
+    parser = argparse.ArgumentParser(description='Parse arguments')
+    parser.add_argument('--bed', dest='bed', help='Input BED file')
+    parser.add_argument('--ped', dest='ped', help='Ped file')
+    parser.add_argument('--vcf', dest='vcf', help='VCF file')
+    parser.add_argument('--disorder', dest='disorder', help='Genomic disorder regions')
+    parser.add_argument('--out', dest='out', help='Output file with all variants')
+    parser.add_argument('--out_de_novo', dest='out_de_novo', help='Output file with only de novo variants')
+    parser.add_argument('--raw_proband', dest='raw_proband', help='Directory with raw SV calls - output from m04')
+    parser.add_argument('--raw_parents', dest='raw_parents', help='Directory with raw SV calls - output from m04')
+    parser.add_argument('--raw_depth_proband', dest='raw_depth_proband', help='Directory with raw SV depth calls - output from m04')
+    parser.add_argument('--raw_depth_parents', dest='raw_depth_parents', help='Directory with raw depth SV calls - output from m04')
+    parser.add_argument('--config', dest='config', help='Config file')
+    parser.add_argument('--exclude_regions', dest='exclude_regions', help='File containing regions with known somatic mutations')
+    parser.add_argument('--coverage', dest='coverage', help='File with batch in first column respective coverage file in second column')
+    parser.add_argument('--sample_batches', dest='sample_batches', help='File with samples in first column and their respective batch in second column')
+    parser.add_argument('--verbose', dest='verbose', help='Verbosity')
+    args = parser.parse_args()
+
+    bed_file = args.bed
+    ped_file = args.ped
+    vcf_file = args.vcf
+    disorder_file = args.disorder
+    out_file = args.out
+    de_novo_out_file = args.out_de_novo
+    raw_file_proband = args.raw_proband
+    raw_file_parent = args.raw_parents
+    raw_file_depth_proband = args.raw_depth_proband
+    raw_file_depth_parent = args.raw_depth_parents
+    verbose = args.verbose
+    config_file = args.config
+    exclude_regions = args.exclude_regions
+    coverage = args.coverage
+    batches = args.sample_batches
+
+    with open(config_file, "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+
+    large_cnv_size = int(config['large_cnv_size'])
+    gnomad_col = config['gnomad_col']
+    alt_gnomad_col = config['alt_gnomad_col']
+    gnomad_af = float(config['gnomad_AF'])
+    parents_af = float(config['parents_AF'])
+    large_raw_overlap = float(config['large_raw_overlap'])
+    small_raw_overlap = float(config['small_raw_overlap'])
+    cohort_af = float(config['cohort_AF'])
+    coverage_cutoff = float(config['coverage_cutoff'])
+    depth_only_size = float(config['depth_only_size'])
+    parents_overlap = float(config['parents_overlap'])
+    gq_min = float(config['gq_min'])
+
+    # Read files
+    verbose_print('Reading Input Files', verbose)
+    bed = pd.read_csv(bed_file, sep='\t').replace(np.nan, '', regex=True)
+    bed = bed[(bed['samples'] != "")]
+    bed.rename(columns={'#chrom': 'chrom'}, inplace=True)
+    vcf = pd.read_csv(vcf_file, sep='\t')
+    ped = pd.read_csv(ped_file, sep='\t')
+    disorder = pd.read_csv(disorder_file, sep='\t', header=None)
+    raw_bed_colnames = ['ID', 'start', 'end', 'svtype', 'sample']
+    raw_bed_child = pd.read_csv(raw_file_proband, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True)
+    raw_bed_parent = pd.read_csv(raw_file_parent, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True)
+    raw_bed_depth_child = pd.read_csv(raw_file_depth_proband, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True)
+    raw_bed_depth_parent = pd.read_csv(raw_file_depth_parent, sep='\t', names=raw_bed_colnames, header=None).replace(np.nan, '', regex=True)
+    exclude_regions = pd.read_csv(exclude_regions, sep='\t').replace(np.nan, '', regex=True)
+    bincov_colnames = ['batch', 'bincov', 'index']
+    sample_batches_colnames = ['sample', 'batch']
+    bincov = pd.read_csv(coverage, sep='\t', names=bincov_colnames, header=None).replace(np.nan, '', regex=True)
+    sample_batches = pd.read_csv(batches, sep='\t', names=sample_batches_colnames, header=None).replace(np.nan, '', regex=True)
+
+    #########################
+    # REFORMAT AND ANNOTATE #
+    #########################
+    # Exit out of bed file is empty
+    if bed.size == 0:
+        bed.to_csv(path_or_buf=out_file, mode='a', index=False, sep='\t', header=True)
+        bed.to_csv(path_or_buf=de_novo_out_file, mode='a', index=False, sep='\t', header=True)
+        exit()
+    # Get parents and children ids
+    verbose_print('Getting parents/children/affected/unaffected IDs', verbose)
+    start = time.time()
+    list_of_trios = [item for item, count in collections.Counter(ped["FamID"]).items() if count == 3]
+    families = ped[(ped['FamID'].isin(list_of_trios)) &
+                   (ped['FatherID'] != "0") &
+                   (ped['MotherID'] != "0")]['FamID'].values
+    trios = ped[(ped['FamID'].isin(families))]
+    parents = trios[(trios['FatherID'] == "0") & (trios['MotherID'] == "0")]['IndividualID'].values
+    children = trios[(trios['FatherID'] != "0") & (trios['MotherID'] != "0")]['IndividualID'].values
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Get counts at cohort level and number of parents/children
+    verbose_print('Getting counts', verbose)
+    start = time.time()
+    bed['num_children'] = bed.apply(lambda r: get_count(r, children), axis=1)
+    bed['num_parents'] = bed.apply(lambda r: get_count(r, parents), axis=1)
+    bed['AF_parents'] = bed.apply(lambda r: get_parents_frequency(r, parents), axis=1)
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Remove mCNVs, BNDs and SVs in sex chromosomes
+    start = time.time()
+    verbose_print('Remove BND and mCNV', verbose)
+    bed = bed[(~bed['svtype'].isin(['BND', 'CNV']))]
+    # bed = bed[(~bed['svtype'].isin(['BND', 'CNV']) & (~bed['chrom'].isin(["chrY", "chrX"])))]
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Flag if small or large CNV based on large_cnv_size cutoff and flag for removal based on size for depth only calls
+    verbose_print('Flagging calls depending on size', verbose)
+    start = time.time()
+    bed['is_large_cnv'] = (bed['SVLEN'] >= large_cnv_size) & ((bed['svtype'] == 'DEL') | (bed['svtype'] == 'DUP'))
+    bed['is_small_cnv'] = (bed['SVLEN'] < large_cnv_size) & ((bed['svtype'] == 'DEL') | (bed['svtype'] == 'DUP'))
+    bed['is_depth_only'] = (bed['EVIDENCE'] == "RD")
+    bed['is_depth_only_small'] = (bed['svtype'] == "DUP") & (bed['ALGORITHMS'] == "depth") & (bed['SVLEN'] <= depth_only_size)
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Split into one row per sample
+    verbose_print('Split into one row per sample', verbose)
+    start = time.time()
+    bed_split = bed.assign(sample=bed.samples.str.split(",")).explode('sample')
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Sepparate variants in children and parents
+    verbose_print('Sepparate variants in children and parents', verbose)
+    start = time.time()
+    bed_child = bed_split[bed_split['sample'].isin(children)]
+    bed_parents = bed_split[bed_split['sample'].isin(parents)]
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Flag variants in genomic disorder (GD) regions
+    verbose_print('Flagging variants in GD', verbose)
+    start = time.time()
+    bed_child['in_gd'] = bed_child['name'].isin(disorder[0])
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Annotate family information for filtering
+    bed_child['family_id'] = bed_child.apply(lambda r: get_family_id(r, ped), axis=1)
+    bed_child['name_famid'] = bed_child['name'] + "_" + bed_child['family_id'].astype(str).str.strip("[]")
+    bed_parents['family_id'] = bed_parents.apply(lambda r: get_family_id(r, ped), axis=1)
+    bed_parents['name_famid'] = bed_parents['name'] + "_" + bed_parents['family_id'].astype(str).str.strip("[]")
+
+    # Filter out by frequency - AF gnomad < 0.01 OR inGD
+    verbose_print('Filtering by frequency', verbose)
+    start = time.time()
+    bed_child["AF"] = pd.to_numeric(bed_child["AF"])
+    try:
+        bed_child[gnomad_col] = pd.to_numeric(bed_child[gnomad_col])
+        remove_freq = bed_child[~((((bed_child[gnomad_col] <= gnomad_af) & (bed_child['AF'] <= cohort_af)) |
+                                   ((bed_child[gnomad_col].isnull()) & (bed_child['AF'] <= cohort_af)) |
+                                   (bed_child['in_gd'])))]['name_famid'].to_list()
+        bed_child['is_de_novo'] = pd.Series(True, index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), False)
+        bed_child['filter_flag'] = pd.Series('de_novo', index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), 'AF')
+
+    except KeyError:
+        bed_child[alt_gnomad_col] = pd.to_numeric(bed_child[alt_gnomad_col])
+        remove_freq = bed_child[~((((bed_child[alt_gnomad_col] <= gnomad_af) & (bed_child['AF'] <= cohort_af)) |
+                                   ((bed_child[alt_gnomad_col].isnull()) & (bed_child['AF'] <= cohort_af)) |
+                                   (bed_child['in_gd'])))]['name_famid'].to_list()
+        bed_child['is_de_novo'] = pd.Series(True, index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), False)
+        bed_child['filter_flag'] = pd.Series('de_novo', index=bed_child.index).mask(bed_child['name_famid'].isin(remove_freq), 'AF')
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Get counts within family and remove if SV in parents
+    verbose_print('Keep variants in children only', verbose)
+    start = time.time()
+    try:
+        parents_sc = int(config['parents_SC'])
+    except KeyError:
+        parents_sc = len(parents)
+    if len(bed_child.index) > 0:  # Otherwise, if nothing in bed_child will get an error
+        bed_child['num_parents_family'] = bed_child.apply(lambda r: get_family_count(r, ped), axis=1)
+
+        remove_child = bed_child[~((bed_child['num_parents_family'] == 0) &  # there are no parents of the affected child in the samples column
+                                   (bed_child['num_children'] >= 1) &  # for each line there is at least one child in the samples column
+                                   (bed_child['AF_parents'] <= parents_af) &  # for each line, the frequency of parents in the samples column must be < 0.05
+                                   (bed_child['num_parents'] <= parents_sc))]['name_famid'].to_list()  # for each line, the number of parents must be <= parents_SC or the total number of parents in the ped
+
+        bed_child.loc[bed_child['name_famid'].isin(remove_child) & bed_child['is_de_novo'], 'filter_flag'] = 'in_parent'
+        bed_child.loc[bed_child['name_famid'].isin(remove_child) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Extract info from the VCF file
+    verbose_print('Appending FILTER information', verbose)
+    start = time.time()
+    if len(bed_child.index) > 0:
+        bed_child['GT'] = bed_child.apply(lambda r: variant_info(r, 'GT', vcf), axis=1)
+        bed_child['EV'] = bed_child.apply(lambda r: variant_info(r, 'EV', vcf), axis=1)
+        bed_child['GQ'] = bed_child.apply(lambda r: variant_info(r, 'GQ', vcf), axis=1)
+        bed_child['RD_CN'] = bed_child.apply(lambda r: variant_info(r, 'RD_CN', vcf), axis=1)
+        bed_child['RD_GQ'] = bed_child.apply(lambda r: variant_info(r, 'RD_GQ', vcf), axis=1)
+        bed_child['PE_GQ'] = bed_child.apply(lambda r: variant_info(r, 'PE_GQ', vcf), axis=1)
+        bed_child['PE_GT'] = bed_child.apply(lambda r: variant_info(r, 'PE_GT', vcf), axis=1)
+        bed_child['SR_GQ'] = bed_child.apply(lambda r: variant_info(r, 'SR_GQ', vcf), axis=1)
+        bed_child['SR_GT'] = bed_child.apply(lambda r: variant_info(r, 'SR_GT', vcf), axis=1)
+    else:
+        bed_child['GT'] = 'NA'
+        bed_child['EV'] = 'NA'
+        bed_child['GQ'] = 'NA'
+        bed_child['RD_CN'] = 'NA'
+        bed_child['RD_GQ'] = 'NA'
+        bed_child['PE_GQ'] = 'NA'
+        bed_child['PE_GT'] = 'NA'
+        bed_child['SR_GQ'] = 'NA'
+        bed_child['SR_GT'] = 'NA'
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Remove WHAM only and GT = 1
+    verbose_print('Remove wham only and GT=1 calls', verbose)
+    start = time.time()
+    remove_wham = bed_child[(bed_child['ALGORITHMS'] == "wham") & (bed_child['GQ'] == '1')]
+    bed_child.loc[bed_child['name_famid'].isin(remove_wham) & bed_child['is_de_novo'], 'filter_flag'] = 'wham_only'
+    bed_child.loc[bed_child['name_famid'].isin(remove_wham), 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Annotate parental information
+    if len(bed_child.index) > 0:
+        bed_child['paternal_gq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'GQ'), axis=1)
+        bed_child['maternal_gq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'GQ'), axis=1)
+        bed_child['paternal_rdcn'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'RD_CN'), axis=1)
+        bed_child['maternal_rdcn'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'RD_CN'), axis=1)
+        bed_child['paternal_pegq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'PE_GQ'), axis=1)
+        bed_child['maternal_pegq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'PE_GQ'), axis=1)
+        bed_child['paternal_srgq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'FatherID', 'SR_GQ'), axis=1)
+        bed_child['maternal_srgq'] = bed_child.apply(lambda r: get_info_parent(r, ped, vcf, 'MotherID', 'SR_GQ'), axis=1)
+    else:
+        bed_child['paternal_gq'] = 'NA'
+        bed_child['maternal_gq'] = 'NA'
+        bed_child['paternal_rdcn'] = 'NA'
+        bed_child['maternal_rdcn'] = 'NA'
+        bed_child['paternal_pegq'] = 'NA'
+        bed_child['maternal_pegq'] = 'NA'
+        bed_child['paternal_srgq'] = 'NA'
+        bed_child['maternal_srgq'] = 'NA'
+
+    # LARGE CNV: Check for false negative in parents: check depth in parents, independently of the calls
+    verbose_print('Large CNVs check', verbose)
+    start = time.time()
+    # 1. Strip out if same CN in parents and proband if not in chrX
+    remove_large_cnv = bed_child.loc[~(((bed_child['SVLEN'] <= 5000) | ((bed_child['RD_CN'] != bed_child['maternal_rdcn']) & (bed_child['RD_CN'] != bed_child['paternal_rdcn']))) | (bed_child['chrom'] == 'chrX'))]
+    bed_child.loc[bed_child['name_famid'].isin(remove_large_cnv) & bed_child['is_de_novo'], 'filter_flag'] = 'same_cn_as_parents'
+    bed_child.loc[bed_child['name_famid'].isin(remove_large_cnv) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # 2. Check if call in parents with bedtools coverage (332)
+    verbose_print('CNV present in parents check', verbose)
+    start = time.time()
+    cols_keep = ['family_chrom', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid']
+    bed_child_large = bed_child[(bed_child['is_large_cnv'])]
+    bed_parents_large = bed_parents[(bed_parents['is_large_cnv'])]
+    if (len(bed_child_large.index) > 0) & (len(bed_parents_large.index) > 0):
+        bed_child_large['family_chrom'] = bed_child_large.apply(lambda r: add_family(r, ped), axis=1)
+        bed_child_large = bed_child_large[cols_keep].to_string(header=False, index=False)
+        bed_child_large = pybedtools.BedTool(bed_child_large, from_string=True).sort()
+
+        bed_parents_large['family_chrom'] = bed_parents_large.apply(lambda r: add_family(r, ped), axis=1)
+        bed_parents_large = bed_parents_large[cols_keep].to_string(header=False, index=False)
+        bed_parents_large = pybedtools.BedTool(bed_parents_large, from_string=True).sort()
+
+        bed_overlap = bed_child_large.coverage(bed_parents_large).to_dataframe(disable_auto_names=True, header=None)
+        names_overlap = bed_overlap[(bed_overlap[10] >= parents_overlap)][6].to_list()
+    else:
+        names_overlap = ['']
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Add if overlap to bed child table
+    bed_child['overlap_parent'] = (bed_child['name_famid'].isin(names_overlap))
+
+    # Small calls:
+    # If RD,SR and < large_cnv_size, treat RD,SR as SR
+    verbose_print('Small CNVs check', verbose)
+    start = time.time()
+    bed_child['EVIDENCE_FIX'] = bed_child['EVIDENCE']
+    bed_child[(bed_child['SVLEN'] <= large_cnv_size) &
+              (bed_child['EVIDENCE'] == "RD,SR") &
+              ((bed_child['svtype'] == 'DEL') | (bed_child['svtype'] == 'DUP'))]['EVIDENCE_FIX'] = "SR"
+
+    # Add if variant contains RD evidence
+    bed_child['contains_RD'] = bed_child.EVIDENCE_FIX.str.contains('RD')
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    ###################
+    # RAW FILES CHECK #
+    ###################
+    # Keep all CPT and CTX - they are rare and the type is different than in the raw files
+    # Check only DEL,DUP and INS for raw evidence
+    verbose_print('Checking raw files', verbose)
+    # Remove depth raw calls > 1MB
+    raw_bed_depth_parent['SVLEN'] = raw_bed_depth_parent['end'] - raw_bed_depth_parent['start']
+    raw_bed_ref_depth_parent_subset = raw_bed_depth_parent[(raw_bed_depth_parent['SVLEN'] < 1000000)]
+
+    # Reformat raw files
+    raw_bed_ref_child = convert_to_bedtool(raw_bed_child, sort=False)
+    raw_bed_ref_parent = convert_to_bedtool(raw_bed_parent, sort=False)
+    raw_bed_ref_depth_child = convert_to_bedtool(raw_bed_depth_child, sort=False)
+    raw_bed_ref_depth_parent = convert_to_bedtool(raw_bed_ref_depth_parent_subset, sort=False)
+
+    # Reformat de novo filt calls
+    bed_child['chrom_type_sample'] = bed_child['chrom'] + "_" + bed_child['SVTYPE'] + "_" + bed_child['sample']
+    bed_child['chrom_type_family'] = bed_child['chrom'] + "_" + bed_child['SVTYPE'] + "_" + bed_child['family_id'].astype(str)
+
+    cols_keep_child = ['chrom_type_sample', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid']
+    cols_keep_parent = ['chrom_type_family', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid']
+
+    # Subset bed file down to just those where is_de_novo is still = True to make searching faster
+    bed_child_de_novo = bed_child[bed_child['is_de_novo']]
+
+    # INSERTIONS:
+    verbose_print('Checking insertions in raw files', verbose)
+    start = time.time()
+    bed_filt_ins = bed_child_de_novo[bed_child_de_novo['SVTYPE'] == 'INS']
+    if len(bed_filt_ins.index) > 0:
+        verbose_print('Checking if insertion in proband is in raw files', verbose)
+        bed_filt_ins_proband = convert_to_bedtool(bed_filt_ins, cols_to_keep=cols_keep_child, sort=True)
+        ins_names_overlap_proband = get_insertion_intersection(bed_filt_ins_proband, raw_bed_ref_child)
+        verbose_print('Checking if insertion in proband are also in raw files for the parents', verbose)
+        bed_filt_ins_fam = convert_to_bedtool(bed_filt_ins, cols_to_keep=cols_keep_parent, sort=True)
+        ins_names_overlap_parent = get_insertion_intersection(bed_filt_ins_fam, raw_bed_ref_parent)
+        ins_names_overlap = [x for x in ins_names_overlap_proband if x not in ins_names_overlap_parent]
+    else:
+        ins_names_overlap = ['']
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Large CNVS: Reciprocal overlap large_raw_overlap
+    verbose_print('Checking large cnvs in raw files', verbose)
+    start = time.time()
+
+    large_bed_filt_cnv_depth = bed_child_de_novo[(bed_child_de_novo['is_large_cnv']) & (bed_child_de_novo['SVLEN'] >= 5000)]
+    large_bed_filt_cnv_other = bed_child_de_novo[(bed_child_de_novo['is_large_cnv']) & (bed_child_de_novo['SVLEN'] < 5000)]
+
+    if len(large_bed_filt_cnv_other.index) > 0:
+        verbose_print('Checking if intermediate cnv in proband is in raw files', verbose)
+        bed_filt_cnv_proband_other = convert_to_bedtool(large_bed_filt_cnv_other, cols_to_keep=cols_keep_child, sort=True)
+        large_cnv_names_overlap_proband_other = get_cnv_intersection_other(bed_filt_cnv_proband_other, raw_bed_ref_child, large_raw_overlap)
+        verbose_print('Checking if intermediate cnvs in proband are also in raw files for the parents', verbose)
+        bed_filt_cnv_fam_other = convert_to_bedtool(large_bed_filt_cnv_other, cols_to_keep=cols_keep_parent, sort=True)
+        large_cnv_names_overlap_parent_other = get_cnv_intersection_other(bed_filt_cnv_fam_other, raw_bed_ref_parent, large_raw_overlap)
+        large_cnv_names_overlap_other = [x for x in large_cnv_names_overlap_proband_other if x not in large_cnv_names_overlap_parent_other]
+    else:
+        large_cnv_names_overlap_other = ['']
+
+    if len(large_bed_filt_cnv_depth.index) > 0:
+        verbose_print('Checking if large cnv in proband is in raw files', verbose)
+        bed_filt_cnv_proband_depth = convert_to_bedtool(large_bed_filt_cnv_depth, cols_to_keep=cols_keep_child, sort=True)
+        large_cnv_names_overlap_proband_depth = get_cnv_intersection_depth(bed_filt_cnv_proband_depth, raw_bed_ref_depth_child, large_raw_overlap)
+        verbose_print('Checking if large cnv in proband are also in raw files for the parents', verbose)
+        bed_filt_cnv_fam_depth = convert_to_bedtool(large_bed_filt_cnv_depth, cols_to_keep=cols_keep_parent, sort=True)
+        large_cnv_names_overlap_parent_depth = get_cnv_intersection_depth(bed_filt_cnv_fam_depth, raw_bed_ref_depth_parent, large_raw_overlap)
+        large_cnv_names_overlap_depth = [x for x in large_cnv_names_overlap_proband_depth if x not in large_cnv_names_overlap_parent_depth]
+    else:
+        large_cnv_names_overlap_depth = ['']
+
+    large_cnv_names_overlap = large_cnv_names_overlap_other + large_cnv_names_overlap_depth
+
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Small CNVs - Reciprocal overlap small_raw_overlap
+    verbose_print('Checking small cnvs in raw files', verbose)
+    start = time.time()
+    small_bed_filt_cnv = bed_child_de_novo[(bed_child_de_novo['is_small_cnv'])]  # We do not want to check against raw depth files if CNV <= 5kb
+    if len(small_bed_filt_cnv.index) > 0:
+        verbose_print('Checking if small cnv in proband is in raw files', verbose)
+        bed_filt_cnv_proband = convert_to_bedtool(small_bed_filt_cnv, cols_to_keep=cols_keep_child, sort=True)
+        small_cnv_names_overlap_proband = get_cnv_intersection_other(bed_filt_cnv_proband, raw_bed_ref_child, small_raw_overlap)
+        verbose_print('Checking small cnvs in probands are also in raw files for parents', verbose)
+        bed_filt_cnv_fam = convert_to_bedtool(small_bed_filt_cnv, cols_to_keep=cols_keep_parent, sort=True)
+        small_cnv_names_overlap_parent = get_cnv_intersection_other(bed_filt_cnv_fam, raw_bed_ref_parent, small_raw_overlap)
+        small_cnv_names_overlap = [x for x in small_cnv_names_overlap_proband if x not in small_cnv_names_overlap_parent]
+    else:
+        small_cnv_names_overlap = ['']
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    bed_child.loc[~(bed_child['name_famid'].isin(ins_names_overlap + large_cnv_names_overlap + small_cnv_names_overlap)) & bed_child['is_de_novo'], 'filter_flag'] = 'not_in_raw_files_or_in_parents_raw_files'
+    bed_child.loc[~(bed_child['name_famid'].isin(ins_names_overlap + large_cnv_names_overlap + small_cnv_names_overlap)) & bed_child['is_de_novo'], 'is_de_novo'] = False
+
+    #############
+    # FILTERING #
+    #############
+    verbose_print('Filtering out calls', verbose)
+
+    # 1. Filter out calls in exclude regions
+    verbose_print('Filtering out calls in exclude regions', verbose)
+    start = time.time()
+    # Reformat exclude_regions to bedtool
+    exclue_regions_bt = convert_to_bedtool(exclude_regions, sort=True)
+    # convert bed_final to bedtool
+    cols_keep_exclude_regions = ['chrom', 'start', 'end', 'name', 'svtype', 'sample', 'name_famid']
+    if len(bed_child.index) > 0:
+        bed_child_bt = convert_to_bedtool(bed_child, cols_keep_exclude_regions, sort=True)
+        exclude_regions_intersect = bed_child_bt.coverage(exclue_regions_bt).to_dataframe(disable_auto_names=True, header=None)  # HB said to use bedtools coverage, -f and -F give the same SVs to be removed
+        if len(exclude_regions_intersect) != 0:
+            remove_regions = exclude_regions_intersect[exclude_regions_intersect[10] > 0.5][6].to_list()
+        else:
+            remove_regions = ['']
+    else:
+        remove_regions = ['']
+
+    bed_child.loc[bed_child['name_famid'].isin(remove_regions) & bed_child['is_de_novo'], 'filter_flag'] = 'in_blacklist_region'
+    bed_child.loc[bed_child['name_famid'].isin(remove_regions) & bed_child['is_de_novo'], 'is_de_novo'] = False
+
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # 2. Filter by type
+    # Keep any SV type that is not a DEL, DUP, or INS
+    keep_other_sv = bed_child[((~bed_child['SVTYPE'].isin(['DEL', 'DUP', 'INS'])) & (bed_child['filter_flag'] != 'AF') & (bed_child['filter_flag'] != 'in_parent'))]['name_famid'].to_list()
+
+    # 3. Filter on DELs, DUPs, and INS
+    # Filter by size
+    # Filter out if large CNVs have parents overlap
+    verbose_print('Filtering out large CNVs with parents overlap', verbose)
+    start = time.time()
+    remove_large = bed_child[(bed_child['is_large_cnv']) &
+                             (bed_child['overlap_parent'])]['name_famid'].to_list()
+    bed_child.loc[bed_child['name_famid'].isin(remove_large) & bed_child['is_de_novo'], 'filter_flag'] = 'large_cnv_with_parent_overlap'
+    bed_child.loc[bed_child['name_famid'].isin(remove_large) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    bed_child.to_csv(path_or_buf="after_large_cnv.txt", mode='a', index=False, sep='\t', header=True)
+
+    # Filter out if small cnvs that are SR-only don't have BOTHSIDES_SUPPORT
+    verbose_print('Filtering out small CNVs that are SR-only and dont have BOTHSIDES_SUPPORT', verbose)
+    start = time.time()
+    remove_small = bed_child[(bed_child['is_small_cnv']) &
+                             (bed_child['EVIDENCE_FIX'] == 'SR') &
+                             ~(bed_child.FILTER.str.contains('BOTHSIDES_SUPPORT'))]['name_famid'].to_list()
+    bed_child.loc[bed_child['name_famid'].isin(remove_small) & bed_child['is_de_novo'], 'filter_flag'] = 'small_cnv_SR_only'
+    bed_child.loc[bed_child['name_famid'].isin(remove_small) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Filter out calls that are depth only and < depth_only_size
+    verbose_print('Filtering out calls that are depth only and < depth_only_size', verbose)
+    start = time.time()
+    remove_depth_small = bed_child[(bed_child['is_depth_only_small'])]['name_famid'].to_list()
+    bed_child.loc[bed_child['name_famid'].isin(remove_depth_small) & bed_child['is_de_novo'], 'filter_flag'] = 'depth_only_small_variant'
+    bed_child.loc[bed_child['name_famid'].isin(remove_depth_small) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Filter out DELs that are >500bp and RD_CN=2 and PE only envidence
+    verbose_print('Filtering out DELs that are RD_CN=2 and PE only evidence', verbose)
+    start = time.time()
+    remove_dels = bed_child[(bed_child['SVTYPE'] == 'DEL') & ((bed_child['RD_CN'] == '2') | (bed_child['RD_CN'] == '3')) & (bed_child['EVIDENCE'] == 'PE')]['name_famid'].to_list()
+    bed_child.loc[bed_child['name_famid'].isin(remove_dels) & bed_child['is_de_novo'], 'filter_flag'] = 'del_with_rdcn2_pe_only'
+    bed_child.loc[bed_child['name_famid'].isin(remove_dels) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # 4. Filter by quality
+    # Filter out if parents GQ is <= gq_min
+    verbose_print('Filtering if parents GQ <= min_gq', verbose)
+    start = time.time()
+    if len(bed_child.index) > 0:
+        bed_child['keep_gq'] = bed_child.apply(lambda r: min_gq(r, gq_min), axis=1)
+        remove_gq = bed_child[bed_child['keep_gq'] == 'Remove']['name_famid'].to_list()
+        bed_child.loc[bed_child['name_famid'].isin(remove_gq) & bed_child['is_de_novo'], 'filter_flag'] = 'bad_gq_in_parents'
+        bed_child.loc[bed_child['name_famid'].isin(remove_gq) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Filter out INS that are manta or melt only and are SR only, have GQ=0, and FILTER contains 'HIGH_SR_BACKGROUND'
+    verbose_print('Filtering out INS that are manta or melt only and SR only, with GQ=0 and FILTER contains HIGH_SR_BACKGROUND', verbose)
+    start = time.time()
+    remove_ins = bed_child[(bed_child['SVTYPE'] == 'INS') & ((bed_child['ALGORITHMS'] == 'manta') | (bed_child['ALGORITHMS'] == 'melt')) & (bed_child['EVIDENCE_FIX'] == 'SR') & ((bed_child['GQ'] == '0') | (bed_child.FILTER.str.contains('HIGH_SR_BACKGROUND')))]['name_famid'].to_list()
+    bed_child.loc[bed_child['name_famid'].isin(remove_ins) & bed_child['is_de_novo'], 'filter_flag'] = 'ins_filter'
+    bed_child.loc[bed_child['name_famid'].isin(remove_ins) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # Remove if low coverage in parents
+    start = time.time()
+    bed_child_coverage = bed_child[bed_child['is_de_novo']]
+    verbose_print('Removing calls if there is low coverage evidence in parents', verbose)
+    if len(bed_child_coverage.index) > 0:
+        bed_child_coverage['median_coverage'] = bed_child_coverage.apply(
+            lambda r: get_median_coverage(
+                find_coverage(r, ped, sample_batches, bincov, family_member='mother'),
+                find_coverage(r, ped, sample_batches, bincov, family_member='father'),
+                coverage_cutoff),
+            axis=1)
+        remove_coverage = bed_child_coverage[bed_child_coverage['median_coverage'] == 'Remove']['name_famid'].to_list()
+    else:
+        remove_coverage = ['']
+
+    bed_child.loc[bed_child['name_famid'].isin(remove_coverage) & bed_child['is_de_novo'], 'filter_flag'] = 'low_coverage_in_parents'
+    bed_child.loc[bed_child['name_famid'].isin(remove_coverage) & bed_child['is_de_novo'], 'is_de_novo'] = False
+
+    end = time.time()
+    delta = end - start
+    print("Took %f seconds to process" % delta)
+
+    # 5. Clean up and remove duplicated CPX SV
+    # Keep SVs
+    bed_child.loc[bed_child['name_famid'].isin(keep_other_sv), 'is_de_novo'] = True
+    bed_child.loc[bed_child['name_famid'].isin(keep_other_sv), 'filter_flag'] = 'not_del_dup_ins'
+
+    # Remove duplicated CPX events that come from vcf output of module 18
+    bed_child['is_cpx'] = (bed_child['SVTYPE'] == "CPX")
+    bed_child['is_duplicated'] = bed_child.duplicated(keep='first', subset=['start', 'end', 'sample'])
+    remove_duplicated = bed_child[(bed_child['is_duplicated']) & (bed_child['is_cpx'])]['name_famid'].to_list()
+    bed_child.loc[bed_child['name_famid'].isin(remove_duplicated) & bed_child['is_de_novo'], 'is_de_novo'] = False
+    bed_child.loc[bed_child['name_famid'].isin(remove_duplicated) & bed_child['is_de_novo'], 'filter_flag'] = 'duplicated_cpx'
+
+    # Move sample column to column 6
+    bed_final = bed_child
+    col = bed_final.pop("sample")
+    bed_final.insert(loc=5, column='sample', value=col, allow_duplicates=True)
+
+    # Define output files
+    output = bed_final
+    de_novo = bed_final[(bed_final['is_de_novo']) | (bed_final['filter_flag'] == 'ins_filter') | (bed_final['in_gd'])]
+
+    # Write output
+    output.to_csv(path_or_buf=out_file, mode='a', index=False, sep='\t', header=True)
+    de_novo.to_csv(path_or_buf=de_novo_out_file, mode='a', index=False, sep='\t', header=True)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/denovo/reformat_raw_bed.R b/src/denovo/reformat_raw_bed.R
new file mode 100644
index 000000000..5337a6054
--- /dev/null
+++ b/src/denovo/reformat_raw_bed.R
@@ -0,0 +1,49 @@
+# This script reformats the output bed files from module 01 with raw call evidence
+# to match desired input for de novo SV filtering
+
+# Define args and load libraries
+args = commandArgs(trailingOnly=TRUE)
+require(plyr)
+library(data.table)
+library(tidyverse)
+
+input_bed <- args[1]
+input_ped <- args[2]
+out_probands <- args[3]
+out_parents <- args[4]
+
+# Read input bed
+bed <- fread(input_bed)
+ped <- fread(input_ped)
+
+# adding back column names
+colnames(bed) <- c("CHROM", "start", "end", "name", "svtype", "samples", "SVTYPE")
+
+# Split into one sample per row
+bed %>%
+  subset(select = c(CHROM, start, end, SVTYPE, samples)) %>%
+  separate_rows(samples, sep = ",", convert = T) -> bed_split
+
+# split into parents and probands
+bed_probands <- bed_split
+bed_probands <- subset(bed_probands, !(samples %in% ped$MotherID) & !(samples %in% ped$FatherID))
+
+bed_parents <- bed_split
+bed_parents <- subset(bed_parents, samples %in% ped$MotherID | samples %in% ped$FatherID)
+
+# add in a family ID column to bed_parents
+ped_subset <- subset(ped, select = c('FamID', 'IndividualID'))
+colnames(ped_subset) <- c('FamID', 'samples')
+
+bed_parents <- merge(bed_parents,ped_subset, by="samples", all.x = TRUE)
+
+# Reformat chromosome column
+bed_probands$CHROM <- paste0(bed_probands$CHROM, "_", bed_probands$SVTYPE, "_", bed_probands$samples)
+bed_parents$CHROM <- paste0(bed_parents$CHROM, "_", bed_parents$SVTYPE, "_", bed_parents$FamID)
+
+# reorder the columns of bed_parents
+bed_parents2 <- bed_parents[, c(2,3,4,5,1)]
+
+# Write output file
+write.table(bed_probands, out_probands, sep = "\t", quote = F, row.names = F, col.names = F)
+write.table(bed_parents2, out_parents, sep = "\t", quote = F, row.names = F, col.names = F)
diff --git a/wdl/DeNovoSVsScatter.wdl b/wdl/DeNovoSVsScatter.wdl
new file mode 100644
index 000000000..eaedbaf0a
--- /dev/null
+++ b/wdl/DeNovoSVsScatter.wdl
@@ -0,0 +1,204 @@
+version 1.0
+    
+import "Structs.wdl"
+import "Utils.wdl" as Utils
+
+workflow DeNovoSVsScatter {
+
+    input {
+        File ped_input
+        Array[File] vcf_files
+        File disorder_input
+        String chromosome
+        File raw_proband
+        File raw_parents
+        File raw_depth_proband
+        File raw_depth_parents
+        File exclude_regions
+        File sample_batches
+        File batch_bincov_index
+        File python_config
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_denovo
+        RuntimeAttr? runtime_attr_vcf_to_bed
+        RuntimeAttr? runtime_attr_merge_bed
+    }
+    
+    Array[String] coverage_index_files = transpose(read_tsv(batch_bincov_index))[2]
+
+    # Scatter genotyping over shards
+    scatter (shard in vcf_files) {
+        call Utils.VcfToBed as VcfToBed {
+            input:
+                vcf_file=shard,
+                args="--info ALL --include-filters",
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_override = runtime_attr_vcf_to_bed
+        }
+
+        call RunDeNovo {
+            input:
+                bed_input=VcfToBed.bed_output,
+                ped_input=ped_input,
+                vcf_input=shard,
+                disorder_input=disorder_input,
+                chromosome=chromosome,
+                raw_proband=raw_proband,
+                raw_parents=raw_parents,
+                raw_depth_proband=raw_depth_proband,
+                raw_depth_parents=raw_depth_parents,
+                exclude_regions = exclude_regions,
+                coverage_indices = coverage_index_files,
+                sample_batches = sample_batches,
+                batch_bincov_index = batch_bincov_index,
+                python_config=python_config,
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_override = runtime_attr_denovo
+        }   
+    }
+
+    call MergeBedFiles as MergeBedFilesAnnotated {
+        input:
+            bed_files = RunDeNovo.annotation_output,
+            chromosome = chromosome,
+            variant_interpretation_docker = variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_merge_bed
+    }
+
+    call MergeBedFiles as MergeBedFilesFinal {
+        input:
+            bed_files = RunDeNovo.denovo_output,
+            chromosome = chromosome,
+            variant_interpretation_docker = variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_merge_bed
+    }
+
+    output {
+        File per_chromosome_annotation_output_file = MergeBedFilesAnnotated.per_chromosome_denovo_output
+        File per_chromosome_final_output_file = MergeBedFilesFinal.per_chromosome_denovo_output
+    }
+}
+
+task RunDeNovo {
+
+    input {
+        File bed_input
+        File ped_input
+        File vcf_input
+        File disorder_input
+        String chromosome
+        File raw_proband
+        File raw_parents
+        File raw_depth_proband
+        File raw_depth_parents
+        File exclude_regions
+        Array[File] coverage_indices
+        File batch_bincov_index
+        File sample_batches
+        File python_config
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float vcf_size = size(vcf_input, "GB")
+    Float bed_size = size(bed_input, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 16,
+        disk_gb: ceil(15 + vcf_size + bed_size * 1.5),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File denovo_output = "~{basename}.denovo.bed.gz"
+        File annotation_output = "~{basename}.annotation.bed.gz"
+    }
+
+    String basename = basename(vcf_input, ".vcf.gz")
+    command {
+        set -exuo pipefail
+
+        export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token`
+
+        bcftools view ~{vcf_input} | grep -v ^## | bgzip -c > ~{basename}.noheader.vcf.gz
+        python /src/denovo/denovo_svs.py \
+            --bed ~{bed_input} \
+            --ped ~{ped_input} \
+            --vcf ~{basename}.noheader.vcf.gz \
+            --disorder ~{disorder_input} \
+            --out ~{basename}.annotation.bed \
+            --out_de_novo ~{basename}.denovo.bed \
+            --raw_proband ~{raw_proband} \
+            --raw_parents ~{raw_parents} \
+            --raw_depth_proband ~{raw_depth_proband} \
+            --raw_depth_parents ~{raw_depth_parents} \
+            --config ~{python_config} \
+            --exclude_regions ~{exclude_regions} \
+            --coverage ~{batch_bincov_index} \
+            --sample_batches ~{sample_batches} \
+            --verbose True
+
+        bgzip ~{basename}.denovo.bed
+        bgzip ~{basename}.annotation.bed
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task MergeBedFiles {
+
+    input {
+        Array[File] bed_files
+        String chromosome
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float bed_files_size = size(bed_files, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + (bed_files_size) * 2.0),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File per_chromosome_denovo_output = "~{chromosome}.denovo.merged.bed.gz"
+    }
+
+    command {
+        set -exuo pipefail
+
+        zcat ~{bed_files[0]} | awk 'NR==1' > ~{chromosome}.denovo.merged.bed
+        zcat ~{sep=" " bed_files} | grep -v ^chrom >> ~{chromosome}.denovo.merged.bed
+        bgzip ~{chromosome}.denovo.merged.bed
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
\ No newline at end of file
diff --git a/wdl/ReformatRawFiles.wdl b/wdl/ReformatRawFiles.wdl
new file mode 100644
index 000000000..5b6f676c0
--- /dev/null
+++ b/wdl/ReformatRawFiles.wdl
@@ -0,0 +1,223 @@
+version 1.0
+    
+import "Structs.wdl"
+import "Utils.wdl" as Utils
+import "TasksMakeCohortVcf.wdl" as TasksMakeCohortVcf
+
+workflow ReformatRawFiles {
+
+    input {
+        Array[String] contigs
+        File raw_files_list
+        File ped_input
+        String variant_interpretation_docker
+        String sv_base_mini_docker
+        Boolean depth
+        RuntimeAttr? runtime_attr_vcf_to_bed
+        RuntimeAttr? runtime_attr_merge_bed
+        RuntimeAttr? runtime_attr_divide_by_chrom
+        RuntimeAttr? runtime_attr_reformat_bed
+    }
+
+    Array[String] raw_files = transpose(read_tsv(raw_files_list))[1]
+
+    scatter(raw_file in raw_files) {
+        call Utils.VcfToBed as VcfToBed {
+            input:
+                vcf_file=raw_file,
+                args="--info SVTYPE",
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_override = runtime_attr_vcf_to_bed
+        }
+    }
+
+    call TasksMakeCohortVcf.CatUncompressedFiles as MergeBedFiles {
+        input:
+            shards=VcfToBed.bed_output,
+            sv_base_mini_docker=sv_base_mini_docker,
+            runtime_attr_override=runtime_attr_merge_bed
+    }
+
+    scatter (contig in contigs) {
+        call RawDivideByChrom {
+            input:
+                bed_file=MergeBedFiles.outfile,
+                chromosome=contig,
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_override=runtime_attr_divide_by_chrom
+        }
+
+        if (depth) {
+            call RawReformatBedDepth {
+                input:
+                    per_chromosome_bed_file=RawDivideByChrom.per_chromosome_bed_output,
+                    ped_input=ped_input,
+                    chromosome=contig,
+                    variant_interpretation_docker=variant_interpretation_docker,
+                    runtime_attr_override=runtime_attr_reformat_bed
+            }
+        }
+
+        if (!(depth)) {
+            call RawReformatBed {
+                input:
+                    per_chromosome_bed_file=RawDivideByChrom.per_chromosome_bed_output,
+                    ped_input=ped_input,
+                    chromosome=contig,
+                    variant_interpretation_docker=variant_interpretation_docker,
+                    runtime_attr_override=runtime_attr_reformat_bed
+            }
+        }
+        File reformatted_parents_output_ = select_first([RawReformatBed.reformatted_parents_output, RawReformatBedDepth.reformatted_parents_depth_output])
+        File reformatted_proband_output_ = select_first([RawReformatBed.reformatted_proband_output, RawReformatBedDepth.reformatted_proband_depth_output])
+    }
+
+
+    output {
+        Array[File] reformatted_parents_raw_files = reformatted_parents_output_
+        Array[File] reformatted_proband_raw_files = reformatted_proband_output_
+    }
+}
+
+task RawDivideByChrom {
+
+    input {
+        File bed_file
+        String chromosome
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float input_size = size(bed_file, "GB")
+    Float base_mem_gb = 3.75
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + input_size * 1.3),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File per_chromosome_bed_output = "${chromosome}.bed.gz"
+    }
+
+    command {
+        set -exuo pipefail
+        
+        zcat ~{bed_file} | \
+        awk '$1 == "~{chromosome}"' | \
+        bgzip -c > ~{chromosome}.bed.gz
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task RawReformatBed {
+
+    input {
+        File per_chromosome_bed_file
+        File ped_input
+        String chromosome
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float bed_file_size = size(per_chromosome_bed_file, "GB")
+    Float ped_size = size(ped_input, "GB")
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + ped_size + bed_file_size * 2.0),
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+       File reformatted_proband_output = "${chromosome}.proband.reformatted.sorted.bed.gz"
+       File reformatted_parents_output = "${chromosome}.parents.reformatted.sorted.bed.gz"
+    }
+
+    command {
+        set -exuo pipefail
+
+        # Reformat bed file
+        Rscript /src/denovo/reformat_raw_bed.R ~{per_chromosome_bed_file} ~{ped_input} ~{chromosome}.proband.reformatted.bed ~{chromosome}.parents.reformatted.bed
+        bedtools sort -i ~{chromosome}.proband.reformatted.bed | bgzip -c > ~{chromosome}.proband.reformatted.sorted.bed.gz
+        bedtools sort -i ~{chromosome}.parents.reformatted.bed | bgzip -c > ~{chromosome}.parents.reformatted.sorted.bed.gz
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}   
+
+task RawReformatBedDepth {
+    input {
+        File per_chromosome_bed_file
+        File ped_input
+        String chromosome
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float bed_file_size = size(per_chromosome_bed_file, "GB")
+    Float ped_size = size(ped_input, "GB")
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + ped_size + bed_file_size * 2.0),
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File reformatted_proband_depth_output = "${chromosome}.proband.depth.reformatted.sorted.bed.gz"
+        File reformatted_parents_depth_output = "${chromosome}.parents.depth.reformatted.sorted.bed.gz"
+    }
+
+    command {
+        set -exuo pipefail
+
+        # Reformat bed file
+        Rscript /src/denovo/reformat_raw_bed.R ~{per_chromosome_bed_file} ~{ped_input} ~{chromosome}.proband.reformatted.bed ~{chromosome}.parents.reformatted.bed
+        bedtools sort -i ~{chromosome}.proband.reformatted.bed | bgzip > ~{chromosome}.proband.depth.reformatted.sorted.bed.gz
+        bedtools sort -i ~{chromosome}.parents.reformatted.bed | bgzip > ~{chromosome}.parents.depth.reformatted.sorted.bed.gz
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}   
diff --git a/wdl/RunDeNovoSVs.wdl b/wdl/RunDeNovoSVs.wdl
new file mode 100644
index 000000000..2b8dbcfe9
--- /dev/null
+++ b/wdl/RunDeNovoSVs.wdl
@@ -0,0 +1,662 @@
+version 1.0
+    
+import "Structs.wdl"
+import "ReformatRawFiles.wdl" as raw
+import "TasksMakeCohortVcf.wdl" as miniTasks
+import "DeNovoSVsScatter.wdl" as runDeNovo
+import "Utils.wdl" as util
+
+workflow DeNovoSV {
+
+    input {
+        File vcf_file
+        File? vcf_index
+        File ped_input
+        File batch_raw_file
+        File batch_depth_raw_file
+        File batch_bincov_index
+
+        Array[String] contigs
+        String prefix
+
+        File? family_ids_txt
+
+        File python_config
+        File genomic_disorder_input
+
+        File exclude_regions
+        File sample_batches
+
+        Int records_per_shard
+
+        String variant_interpretation_docker
+        String sv_base_mini_docker
+        String sv_pipeline_updates_docker
+        String python_docker
+        RuntimeAttr? runtime_attr_gd
+        RuntimeAttr? runtime_attr_denovo
+        RuntimeAttr? runtime_attr_raw_vcf_to_bed
+        RuntimeAttr? runtime_attr_raw_merge_bed
+        RuntimeAttr? runtime_attr_subset_vcf
+        RuntimeAttr? runtime_attr_vcf_to_bed
+        RuntimeAttr? runtime_attr_raw_divide_by_chrom
+        RuntimeAttr? runtime_attr_raw_reformat_bed
+        RuntimeAttr? runtime_attr_merge_final_bed_files
+        RuntimeAttr? runtime_attr_create_plots
+        RuntimeAttr? runtime_override_shard_vcf
+        RuntimeAttr? runtime_attr_clean_ped
+        RuntimeAttr? runtime_attr_call_outliers
+        RuntimeAttr? runtime_attr_get_batched_files
+        RuntimeAttr? runtime_attr_subset_by_samples
+        RuntimeAttr? runtime_attr_merge_gd
+        RuntimeAttr? runtime_attr_batch_vcf
+        RuntimeAttr? runtime_attr_merge
+    }
+
+    # family_ids_txt is a text file containg one family id per line.
+    # If this file is given, subset all other input files to only include the necessary batches.
+    if (defined(family_ids_txt)) {
+        File family_ids_txt_ = select_first([family_ids_txt])
+        call GetBatchedFiles {
+            input:
+                batch_raw_file = batch_raw_file,
+                batch_depth_raw_file = batch_depth_raw_file,
+                ped_input = ped_input,
+                family_ids_txt = family_ids_txt_,
+                sample_batches = sample_batches,
+                batch_bincov_index = batch_bincov_index,
+                python_docker=python_docker,
+                runtime_attr_override = runtime_attr_get_batched_files
+        }
+
+        call util.SubsetVcfBySamplesList {
+            input:
+                vcf = vcf_file,
+                vcf_idx = vcf_index,
+                list_of_samples = GetBatchedFiles.samples,
+                outfile_name = prefix,
+                keep_af = true,
+                remove_private_sites = false,
+                sv_base_mini_docker = sv_base_mini_docker,
+                runtime_attr_override = runtime_attr_subset_by_samples
+        }
+    }
+
+    # Makes a ped file of singletons, duos, and trios for input into the de novo script (only including families of interest)
+    call CleanPed {
+        input:
+            ped_input = ped_input,
+            vcf_input = select_first([SubsetVcfBySamplesList.vcf_subset, vcf_file]),
+            variant_interpretation_docker=variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_clean_ped
+    }
+
+    # Splits raw files into probands and parents and reformats to have chrom_svtype_sample as the first column for probands and chrom_svtype_famid as the first column for parents
+    call raw.ReformatRawFiles as ReformatRawFiles {
+        input:
+            contigs = contigs,
+            raw_files_list = select_first([GetBatchedFiles.batch_raw_files_list, batch_raw_file]),
+            ped_input = CleanPed.cleaned_ped,
+            depth = false,
+            variant_interpretation_docker = variant_interpretation_docker,
+            sv_base_mini_docker = sv_base_mini_docker,
+            runtime_attr_vcf_to_bed = runtime_attr_raw_vcf_to_bed,
+            runtime_attr_merge_bed = runtime_attr_raw_merge_bed,
+            runtime_attr_divide_by_chrom = runtime_attr_raw_divide_by_chrom,
+            runtime_attr_reformat_bed = runtime_attr_raw_reformat_bed
+    }
+
+    # Splits raw files into probands and parents and reformats to have chrom_svtype_sample as the first column for probands and chrom_svtype_famid as the first column for parents
+    call raw.ReformatRawFiles as ReformatDepthRawFiles {
+        input:
+            contigs = contigs,
+            raw_files_list = select_first([GetBatchedFiles.batch_depth_raw_files_list, batch_depth_raw_file]),
+            ped_input = CleanPed.cleaned_ped,
+            depth = true,
+            variant_interpretation_docker = variant_interpretation_docker,
+            sv_base_mini_docker = sv_base_mini_docker,
+            runtime_attr_vcf_to_bed = runtime_attr_raw_vcf_to_bed,
+            runtime_attr_merge_bed = runtime_attr_raw_merge_bed,
+            runtime_attr_divide_by_chrom = runtime_attr_raw_divide_by_chrom,
+            runtime_attr_reformat_bed = runtime_attr_raw_reformat_bed
+    }
+    
+    scatter (i in range(length(contigs))) {
+        # Generates a list of genomic disorder regions in the vcf input as well as in the depth raw files
+        call GetGenomicDisorders {
+            input:
+                genomic_disorder_input=genomic_disorder_input,
+                ped = ped_input,
+                vcf_file = select_first([SubsetVcfBySamplesList.vcf_subset, vcf_file]),
+                depth_raw_file_proband = ReformatDepthRawFiles.reformatted_proband_raw_files[i],
+                depth_raw_file_parents = ReformatDepthRawFiles.reformatted_parents_raw_files[i],
+                chromosome=contigs[i],
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_override = runtime_attr_gd
+        }
+
+        # Splits vcf by chromosome
+        call SubsetVcf {
+            input:
+                vcf_file = select_first([SubsetVcfBySamplesList.vcf_subset, vcf_file]),
+                chromosome=contigs[i],
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_override = runtime_attr_subset_vcf
+        }
+
+        # Shards vcf
+        call miniTasks.ScatterVcf as ScatterVcf {
+            input:
+                vcf=SubsetVcf.vcf_output,
+                prefix=prefix,
+                records_per_shard=records_per_shard,
+                sv_pipeline_docker=sv_pipeline_updates_docker,
+                runtime_attr_override=runtime_override_shard_vcf
+        }
+    
+        # Runs the de novo calling python script on each shard and outputs a per chromosome list of de novo SVs
+        call runDeNovo.DeNovoSVsScatter as GetDeNovo {
+            input:
+                ped_input=CleanPed.cleaned_ped,
+                vcf_files=ScatterVcf.shards,
+                disorder_input=GetGenomicDisorders.gd_output_for_denovo,
+                chromosome=contigs[i],
+                raw_proband=ReformatRawFiles.reformatted_proband_raw_files[i],
+                raw_parents=ReformatRawFiles.reformatted_parents_raw_files[i],
+                raw_depth_proband=ReformatDepthRawFiles.reformatted_proband_raw_files[i],
+                raw_depth_parents=ReformatDepthRawFiles.reformatted_parents_raw_files[i],
+                exclude_regions = exclude_regions,
+                sample_batches = sample_batches,
+                batch_bincov_index = select_first([GetBatchedFiles.batch_bincov_index_subset, batch_bincov_index]),
+                python_config=python_config,
+                variant_interpretation_docker=variant_interpretation_docker,
+                runtime_attr_denovo = runtime_attr_denovo,
+                runtime_attr_vcf_to_bed = runtime_attr_vcf_to_bed
+        }
+    }
+
+    # Merges the per chromosome final de novo SV outputs
+    call MergeDenovoBedFiles {
+        input:
+            bed_files = GetDeNovo.per_chromosome_final_output_file,
+            variant_interpretation_docker=variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_merge_final_bed_files
+    }
+
+    # Outputs a final callset of de novo SVs as well as outlier de novo SV calls
+    call CallOutliers {
+        input:
+            bed_file = MergeDenovoBedFiles.merged_denovo_output,
+            variant_interpretation_docker=variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_call_outliers
+    }
+
+    # Generates plots for QC
+    call CreatePlots {
+        input:
+            bed_file = CallOutliers.final_denovo_nonOutliers_output,
+            ped_input = ped_input,
+            variant_interpretation_docker=variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_create_plots
+    }
+
+    # Merges the genomic disorder region output from each chromosome to compile a list of genomic disorder regions
+    call MergeGenomicDisorders {
+        input:
+            genomic_disorder_input=GetGenomicDisorders.gd_output_from_depth_raw_files,
+            variant_interpretation_docker=variant_interpretation_docker,
+            runtime_attr_override = runtime_attr_merge_gd
+    }
+
+    output {
+        File cleaned_ped = CleanPed.cleaned_ped
+        File final_denovo_nonOutliers = CallOutliers.final_denovo_nonOutliers_output
+        File final_denovo_outliers = CallOutliers.final_denovo_outliers_output
+        File final_denovo_nonOutliers_plots = CreatePlots.output_plots
+        Array [File] denovo_output_annotated = GetDeNovo.per_chromosome_annotation_output_file
+        File gd_depth = MergeGenomicDisorders.gd_output_from_depth
+        File gd_vcf = GetGenomicDisorders.gd_output_from_final_vcf[1]
+    }
+}
+
+task SubsetVcf {
+
+    input {
+        File vcf_file
+        String chromosome
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float input_size = size(vcf_file, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + input_size * 1.5),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File vcf_output = "~{chromosome}.vcf.gz"
+        File no_header_vcf_output = "~{chromosome}.noheader.vcf.gz"
+    }
+
+    command <<<
+        set -euxo pipefail
+
+        bcftools index ~{vcf_file}
+        bcftools view ~{vcf_file} --regions ~{chromosome} -O z -o  ~{chromosome}.vcf.gz
+        bcftools view ~{chromosome}.vcf.gz | grep -v ^## | bgzip -c > ~{chromosome}.noheader.vcf.gz
+    >>>
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task GetGenomicDisorders {
+
+    input {
+        File vcf_file
+        File ped
+        File depth_raw_file_proband
+        File depth_raw_file_parents
+        String chromosome
+        File genomic_disorder_input
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float input_size = size(select_all([vcf_file, ped, genomic_disorder_input, depth_raw_file_parents, depth_raw_file_proband]), "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + input_size),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File gd_output_from_final_vcf = "gd.variants.from.final.vcf.txt.gz"
+        File gd_output_from_depth_raw_files = "~{chromosome}.gd.variants.in.depth.raw.files.txt.gz"
+        File gd_output_for_denovo = "annotated.gd.variants.names.txt"
+    }
+
+    command <<<
+        set -euxo pipefail
+
+        sort -k1,1 -k2,2n ~{genomic_disorder_input} > sorted.genomic.txt
+        bedtools intersect -wa -wb -f 0.3 -r -a ~{vcf_file} -b ~{genomic_disorder_input} | cut -f 3 |sort -u > annotated.gd.variants.names.txt
+        
+        echo "Done with first line"
+
+        bedtools intersect -wa -wb -f 0.3 -r -a ~{vcf_file} -b ~{genomic_disorder_input} > gd.variants.from.final.vcf.txt
+        bgzip gd.variants.from.final.vcf.txt
+
+        echo "Done with GD from vcf"
+        
+        Rscript /src/denovo/create_per_sample_bed.R ~{genomic_disorder_input} unsorted.gd.per.sample.txt unsorted.gd.per.family.txt ~{ped} ~{chromosome}
+        sort -k1,1 -k2,2n unsorted.gd.per.sample.txt > gd.per.sample.txt
+        sort -k1,1 -k2,2n unsorted.gd.per.family.txt > gd.per.family.txt
+        cat ~{depth_raw_file_parents} | gunzip | sort -k1,1 -k2,2n | bgzip -c > sorted.depth.parents.bed.gz
+        cat ~{depth_raw_file_proband} | gunzip | sort -k1,1 -k2,2n | bgzip -c > sorted.depth.proband.bed.gz
+
+        echo "Done with R script"
+
+        bedtools intersect -wa -wb -f 0.3 -r -sorted -a gd.per.sample.txt -b sorted.depth.proband.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.proband.txt
+        bedtools intersect -wa -wb -f 0.3 -r -sorted -a gd.per.family.txt -b sorted.depth.parents.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.parents.txt
+        
+        echo "done with intersect in depth variants"
+
+        bedtools coverage -wa -wb -sorted -a gd.per.family.txt -b sorted.depth.parents.bed.gz | awk '{if ($NF>=0.30) print }' > ~{chromosome}.coverage.parents.txt
+        bedtools coverage -wa -wb -sorted -a gd.per.sample.txt -b sorted.depth.proband.bed.gz | awk '{if ($NF>=0.30) print }' > ~{chromosome}.coverage.proband.txt
+
+        echo "done with coverage in depth variants"
+
+        cat ~{chromosome}.coverage.parents.txt ~{chromosome}.coverage.proband.txt > ~{chromosome}.coverage.txt
+
+        echo "done with cat"
+
+        bedtools intersect -wa -wb -f 0.3 -sorted -a gd.per.sample.txt -b sorted.depth.proband.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.proband.no.r.txt
+        bedtools intersect -wa -wb -f 0.3 -sorted -a gd.per.family.txt -b sorted.depth.parents.bed.gz > ~{chromosome}.gd.variants.in.depth.raw.file.parents.no.r.txt
+
+        echo "done with intersect no -r"
+
+        cat ~{chromosome}.gd.variants.in.depth.raw.file.proband.no.r.txt ~{chromosome}.gd.variants.in.depth.raw.file.parents.no.r.txt > ~{chromosome}.remove.txt
+
+        echo "done with cat"
+
+        bedtools intersect -v -wb -b ~{chromosome}.remove.txt -a ~{chromosome}.coverage.txt > ~{chromosome}.kept.coverage.txt
+
+        echo "done with grep"
+
+        cat ~{chromosome}.gd.variants.in.depth.raw.file.proband.txt ~{chromosome}.gd.variants.in.depth.raw.file.parents.txt ~{chromosome}.kept.coverage.txt > ~{chromosome}.gd.variants.in.depth.raw.files.txt
+        bgzip ~{chromosome}.gd.variants.in.depth.raw.files.txt
+        echo "done with cat"
+    >>>
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task MergeGenomicDisorders {
+
+    input {
+        Array[File] genomic_disorder_input
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float input_size = size(genomic_disorder_input, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + input_size),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File gd_output_from_depth = "gd.raw.files.output.txt.gz"
+    }
+
+    command {
+        set -euxo pipefail
+
+        zcat ~{sep=" " genomic_disorder_input} > gd.raw.files.output.txt
+        bgzip gd.raw.files.output.txt
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task MergeDenovoBedFiles {
+
+    input {
+        Array[File] bed_files
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float bed_files_size = size(bed_files, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + (bed_files_size) * 2.0),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File merged_denovo_output = "denovo.merged.bed.gz"
+    }
+
+    command {
+        set -exuo pipefail
+
+        zcat ~{bed_files[0]} | awk 'NR<=1' > denovo.merged.bed
+        zcat ~{sep=" " bed_files} | grep -v ^chrom >> denovo.merged.bed
+        bgzip denovo.merged.bed
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task CallOutliers {
+
+    input {
+        File bed_file
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float bed_files_size = size(bed_file, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 16, # 3.75
+        disk_gb: ceil(10 + (bed_files_size) * 2.0),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File final_denovo_nonOutliers_output = "final.denovo.merged.bed.gz"
+        File final_denovo_outliers_output = "final.denovo.merged.outliers.bed.gz"
+        File final_denovo_allSamples_output = "final.denovo.merged.allSamples.bed.gz"
+    }
+
+    command {
+        set -exuo pipefail
+
+        python /src/denovo/denovo_outliers.py --bed ~{bed_file}
+
+        bgzip final.denovo.merged.bed
+        bgzip final.denovo.merged.outliers.bed
+        bgzip final.denovo.merged.allSamples.bed
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task CreatePlots {
+
+    input {
+        File bed_file
+        File ped_input
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float input_size = size(select_all([bed_file, ped_input]), "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 16, # 3.75
+        disk_gb: ceil(10 + input_size * 1.2),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File output_plots = "output_plots.pdf"
+    }
+
+    command {
+        set -exuo pipefail
+
+        Rscript /src/denovo/denovo_sv_plots.R ~{bed_file} ~{ped_input} output_plots.pdf
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task CleanPed {
+
+    input {
+        File ped_input
+        File vcf_input
+        String variant_interpretation_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float ped_size = size(ped_input, "GB")
+    Float vcf_size = size(vcf_input, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + vcf_size + ped_size * 1.5),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File cleaned_ped = "subset_cleaned_ped.txt"
+    }
+
+    command {
+        set -exuo pipefail
+
+        # TODO: this script should get the name of the output file it generates as an input argument.
+        # The output filename is currently hardcoded to be 'clean_ped.txt'.
+        Rscript /src/denovo/clean_ped.R ~{ped_input}
+        cut -f2 cleaned_ped.txt | awk 'NR > 1' > all_samples.txt
+        bcftools query -l ~{vcf_input} > samples_to_include_in_ped.txt
+
+        # Note: grep returns a non-success exit code (i.e., other than `0`) when it cannot find the
+        # match in the following scripts. We do not expect it to find a match for every entry.
+        # Hence, to avoid exit with unsuccessful code, we can either drop pipefail from above or use `|| true`.
+
+        grep -w -v -f samples_to_include_in_ped.txt all_samples.txt > excluded_samples.txt || true
+        grep -w -f excluded_samples.txt cleaned_ped.txt | cut -f1 | sort -u > excluded_families.txt || true
+        grep -w -v -f excluded_families.txt cleaned_ped.txt > subset_cleaned_ped.txt || true
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}
+
+task GetBatchedFiles {
+
+    input {
+        File batch_raw_file
+        File batch_depth_raw_file
+        File family_ids_txt
+        File ped_input
+        File sample_batches
+        File batch_bincov_index
+        String python_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float input_size = size(select_all([batch_raw_file, batch_depth_raw_file, ped_input, sample_batches, batch_bincov_index, family_ids_txt]), "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + input_size),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+    
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File batch_raw_files_list = "batch_raw_files_list.txt"
+        File batch_depth_raw_files_list = "batch_depth_raw_files_list.txt"
+        File batch_bincov_index_subset = "batch_bincov_index.txt"
+        File samples = "samples.txt"
+    }
+
+    command {
+        set -exuo pipefail
+
+        if grep -q -w -f ~{family_ids_txt} ~{ped_input}; then
+            grep -w -f ~{family_ids_txt} ~{ped_input} | cut -f2 | sort -u > samples.txt
+        else
+            echo "No matching family IDs from family_ids_txt found in ped_input file." >&2
+            exit 1
+        fi
+
+        if grep -q -w -f samples.txt ~{sample_batches}; then
+            grep -w -f samples.txt ~{sample_batches} | cut -f2 | sort -u > batches.txt
+        else
+            echo "No matching individual IDs found in the sample_batches file." >&2
+            exit 1
+        fi
+
+        grep -w -f batches.txt ~{batch_bincov_index} > batch_bincov_index.txt
+        grep -w -f batches.txt ~{batch_raw_file} > batch_raw_files_list.txt
+        grep -w -f batches.txt ~{batch_depth_raw_file} > batch_depth_raw_files_list.txt
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: python_docker
+    }
+}
\ No newline at end of file
diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl
index 16efe98b9..d4a343895 100644
--- a/wdl/Utils.wdl
+++ b/wdl/Utils.wdl
@@ -782,3 +782,48 @@ task SubsetVcfBySamplesList {
     maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
   }
 }
+
+task VcfToBed {
+
+    input {
+        File vcf_file
+        String variant_interpretation_docker
+        String? args
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Float vcf_size = size(vcf_file, "GB")
+
+    RuntimeAttr default_attr = object {
+        mem_gb: 3.75,
+        disk_gb: ceil(10 + vcf_size * 1.5),
+        cpu_cores: 1,
+        preemptible_tries: 2,
+        max_retries: 1,
+        boot_disk_gb: 8
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File bed_output = "~{basename}.bed.gz"
+    }
+
+    String basename = basename(vcf_file, ".vcf.gz")
+    command {
+        set -exuo pipefail
+
+        svtk vcf2bed ~{vcf_file} ~{args} ~{basename}.bed
+        bgzip ~{basename}.bed
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: variant_interpretation_docker
+    }
+}