Skip to content

Commit

Permalink
zUMIs2.6.0 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
cziegenhain committed Jan 14, 2020
1 parent 6d528b2 commit 976fd4d
Show file tree
Hide file tree
Showing 14 changed files with 286 additions and 305 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ We provide a script to convert zUMIs output into loom file automatically based o
To convert zUMIs output to loom, simply run `Rscript rds2loom.R myRun.yaml`.

## Changelog
14 Oct 2019: zUMIs2.5.6: small fixes and optimization when using PE data.
14 Jan 2020: [zUMIs2.6.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.6.0). Large update to the output bam file which is now always coordinate sorted for better compatibility with downstream tools. Please review the new [bam tags](https://github.com/sdparekh/zUMIs/wiki/Output#explanation-of-the-bam-tags-zumis-uses). UMI error correction by hamming distance now also gets added to the bam file. Faster processing speeds when downsampling and using UMI hamming distance.

02 Sep 2019: zUMIs2.5.5: new BC detection algorithm using the [inflection](https://cran.r-project.org/web/packages/inflection/index.html) R package.
14 Oct 2019: zUMIs2.5.6: small fixes and optimization when using PE data.

02 Sep 2019: zUMIs2.5.5: new BC detection algorithm using the [inflection](https://cran.r-project.org/web/packages/inflection/index.html) R package.

09 Aug 2019: zUMIs2.5.4: zUMIs is now independent of the version of the Rsubread package. To improve code stability and reduce version-dependency, zUMIs now uses inbuilt precompiled Rsubread::featureCounts code.

Expand Down
236 changes: 92 additions & 144 deletions UMIstuffFUN.R

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion correct_BCtag.pl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ BEGIN
}

print BCBAM $read[0],"\t",$read[1],"\t",$read[2],"\t",$read[3],"\t",$read[4],"\t",$read[5],"\t",$read[6],"\t",$read[7],"\t",$read[8],"\t",
$read[9],"\t",$read[10],"\t","BX:Z:",$thisBC,"\t","BC:Z:",$correctBC,"\t",$read[12],"\n";
$read[9],"\t",$read[10],"\t","BX:Z:",$thisBC,"\t","BC:Z:",$correctBC,"\t",$read[12],"\t",$read[13],"\t",$read[14],"\n";

}
close INBAM;
Expand Down
7 changes: 5 additions & 2 deletions correct_UBtag.pl
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,15 @@ BEGIN
$UXtag = $UXmatches[0];
$UXtag =~ s/UX:Z://g;

my @GEmatches = grep { /XT:Z:/ } @readarr;
my @GEmatches = grep { /GE:Z:/ } @readarr;
if (@GEmatches == 0){ #if not exon, maybe there is an intron gene tag?
my @GEmatches = grep { /GI:Z:/ } @readarr;
}

if (!@GEmatches == 0){
#print "GE found";
$GEtag = $GEmatches[0];
$GEtag =~ s/XT:Z://g;
$GEtag =~ s/G[EI]:Z://g;

if (defined($ubmap{$GEtag})) {
#print "GE found";
Expand Down
6 changes: 3 additions & 3 deletions fqfilter_v2.pl
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,10 @@ BEGIN
$bclist{$bcseq}++;
#print $lay,"\n";
if($lay eq "SE"){
print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\n";
print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
}else{
print BCBAM $ridtmp,"\t77\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\n";
print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\n";
print BCBAM $ridtmp,"\t77\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
}
}
}
Expand Down
9 changes: 5 additions & 4 deletions misc/demultiplex_BC.pl
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
#!/usr/bin/perl
use warnings;

if(@ARGV != 3)
if(@ARGV != 4)
{
print
"\n#####################################################################################
Usage: perl $0 <output_folder> <project_name> <samtools-executable> \n
Usage: perl $0 <output_folder> <project_name> <bam_file> <samtools-executable> \n
Please drop your suggestions and clarifications to <christoph.ziegenhain\@ki.se>\n
######################################################################################\n\n";
exit;
}
BEGIN{
$out=$ARGV[0];
$name=$ARGV[1];
$samtoolsexc=$ARGV[2];
$bam=$ARGV[2];
$samtoolsexc=$ARGV[3];
}


Expand All @@ -25,7 +26,7 @@ BEGIN

#initialize output files with header
$demuxout="${out}/zUMIs_output/demultiplexed/";
$exbam="${out}/${name}.filtered.tagged.Aligned.out.bam.ex.featureCounts.bam";
$exbam="$bam";

unless ( -d $demuxout) {
mkdir $demuxout;
Expand Down
Binary file added misc/fcountsLib2
Binary file not shown.
4 changes: 2 additions & 2 deletions misc/featureCounts.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
featureCounts <- function(files,annot.inbuilt="mm10",annot.ext=NULL,isGTFAnnotationFile=FALSE,GTF.featureType="exon",GTF.attrType="gene_id",GTF.attrType.extra=NULL,chrAliases=NULL,useMetaFeatures=TRUE,allowMultiOverlap=FALSE,minOverlap=1,fracOverlap=0,fracOverlapFeature=0,largestOverlap=FALSE,nonOverlap=NULL,nonOverlapFeature=NULL,readShiftType="upstream",readShiftSize=0,readExtension5=0,readExtension3=0,read2pos=NULL,countMultiMappingReads=TRUE,fraction=FALSE,isLongRead=FALSE,minMQS=0,splitOnly=FALSE,nonSplitOnly=FALSE,primaryOnly=FALSE,ignoreDup=FALSE,strandSpecific=0,juncCounts=FALSE,genome=NULL,isPairedEnd=FALSE,requireBothEndsMapped=FALSE,checkFragLength=FALSE,minFragLength=50,maxFragLength=600,countChimericFragments=TRUE,autosort=TRUE,nthreads=1,byReadGroup=FALSE,reportReads=NULL,reportReadsPath=NULL,maxMOp=10,tmpDir=".",verbose=FALSE, fcounts_clib = NULL)
featureCounts <- function(files,annot.inbuilt="mm10",annot.ext=NULL,isGTFAnnotationFile=FALSE,GTF.featureType="exon",GTF.attrType="gene_id",GTF.attrType.extra=NULL,chrAliases=NULL,useMetaFeatures=TRUE,allowMultiOverlap=FALSE,minOverlap=1,fracOverlap=0,fracOverlapFeature=0,largestOverlap=FALSE,nonOverlap=NULL,nonOverlapFeature=NULL,readShiftType="upstream",readShiftSize=0,readExtension5=0,readExtension3=0,read2pos=NULL,countMultiMappingReads=TRUE,fraction=FALSE,isLongRead=FALSE,minMQS=0,splitOnly=FALSE,nonSplitOnly=FALSE,primaryOnly=FALSE,ignoreDup=FALSE,strandSpecific=0,juncCounts=FALSE,genome=NULL,isPairedEnd=FALSE,requireBothEndsMapped=FALSE,checkFragLength=FALSE,minFragLength=50,maxFragLength=600,countChimericFragments=TRUE,autosort=TRUE,nthreads=1,byReadGroup=FALSE,reportReads=NULL,reportReadsPath=NULL,maxMOp=10,tmpDir=".",verbose=FALSE, fcounts_clib = NULL, isIntronInput)
{
flag <- FALSE
files <- normalizePath(files, mustWork=T)
Expand Down Expand Up @@ -114,7 +114,7 @@ featureCounts <- function(files,annot.inbuilt="mm10",annot.ext=NULL,isGTFAnnotat
if(!is.null(nonOverlapFeature)) max_missing_bases_in_feature <- nonOverlapFeature
if(!is.null(GTF.attrType.extra))GTF.attrType.extra_str <- paste(GTF.attrType.extra, collapse="\t")

cmd <- paste("readSummary",ann,files_C,fout,as.numeric(isPairedEnd),minFragLength,maxFragLength,0,as.numeric(allowMultiOverlap),as.numeric(useMetaFeatures),nthreads,as.numeric(isGTFAnnotationFile),strandSpecific,reportReads_C,as.numeric(requireBothEndsMapped),as.numeric(!countChimericFragments),as.numeric(checkFragLength),GTF.featureType,GTF.attrType,minMQS,as.numeric(countMultiMappingReads),chrAliases_C," ",as.numeric(FALSE),14,readExtension5,readExtension3,minOverlap,split_C,read2pos_C," ",as.numeric(ignoreDup),as.numeric(!autosort),as.numeric(fraction),as.numeric(largestOverlap),PE_orientation,as.numeric(juncCounts),genome_C,maxMOp,0,as.numeric(fracOverlap),as.character(tmpDir),"0",as.numeric(byReadGroup),as.numeric(isLongRead),as.numeric(verbose),as.numeric(fracOverlapFeature), as.numeric(do_detection_calls), as.numeric(max_missing_bases_in_read), as.numeric(max_missing_bases_in_feature), as.numeric(primaryOnly), reportReadsPath, GTF.attrType.extra_str, annot.screen.output, readShiftType,readShiftSize ,sep=",")
cmd <- paste("readSummary",ann,files_C,fout,as.numeric(isPairedEnd),minFragLength,maxFragLength,0,as.numeric(allowMultiOverlap),as.numeric(useMetaFeatures),nthreads,as.numeric(isGTFAnnotationFile),strandSpecific,reportReads_C,as.numeric(requireBothEndsMapped),as.numeric(!countChimericFragments),as.numeric(checkFragLength),GTF.featureType,GTF.attrType,minMQS,as.numeric(countMultiMappingReads),chrAliases_C," ",as.numeric(FALSE),14,readExtension5,readExtension3,minOverlap,split_C,read2pos_C," ",as.numeric(ignoreDup),as.numeric(!autosort),as.numeric(fraction),as.numeric(largestOverlap),PE_orientation,as.numeric(juncCounts),genome_C,maxMOp,0,as.numeric(fracOverlap),as.character(tmpDir),"0",as.numeric(byReadGroup),as.numeric(isLongRead),as.numeric(verbose),as.numeric(fracOverlapFeature), as.numeric(do_detection_calls), as.numeric(max_missing_bases_in_read), as.numeric(max_missing_bases_in_feature), as.numeric(primaryOnly), reportReadsPath, GTF.attrType.extra_str, annot.screen.output, readShiftType,readShiftSize ,isIntronInput,sep=",")
n <- length(unlist(strsplit(cmd,",")))
dyn.load(fcounts_clib)
C_args <- .C("R_readSummary_wrapper",as.integer(n),as.character(cmd))
Expand Down
18 changes: 13 additions & 5 deletions runVelocyto.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,27 @@ if(is.null(opt$mem_limit)){
}else{
mempercpu <- round(opt$mem_limit/opt$num_threads,0)
if(mempercpu==0){
mempercpu <- 1
mempercpu <- 1
}
}

featfile_vector <- c(paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam"),
paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam"))

featfile <- featfile_vector[which(file.exists(featfile_vector))[1]]


##########################

print(Sys.time())
#prepare the bam file for use with velocyto
print("Preparing bam file for velocyto...")
retag_cmd <- paste0(samtoolsexc," view -@ 2 -h ",paste0(opt$out_dir,"/",opt$project,".filtered.tagged.Aligned.out.bam | sed 's/BC:Z:/CB:Z:/'"))
retag_cmd <- paste0(samtoolsexc," view -@ 2 -h ",featfile," | sed 's/BC:Z:/CB:Z:/'")
velobam <- paste0(opt$out_dir,"/",opt$project,".tagged.forVelocyto.bam")
sort_cmd <- paste0(samtoolsexc," sort -m ",mempercpu,"G -O BAM -@ ",opt$num_threads," -o ",velobam )

system(paste(retag_cmd,sort_cmd,sep=" | "))
#sort_cmd <- paste0(samtoolsexc," sort -m ",mempercpu,"G -O BAM -@ ",opt$num_threads," -o ",velobam )
out_cmd <- paste0(samtoolsexc," view -b -@ ",opt$num_threads," -o ",velobam," - " )
#system(paste(retag_cmd,sort_cmd,sep=" | "))
system(paste(retag_cmd,out_cmd,sep=" | "))

print(Sys.time())

Expand Down
3 changes: 2 additions & 1 deletion runfeatureCountFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ suppressWarnings(suppressMessages(require(AnnotationDbi)))
strandSpecific=strand,
isPairedEnd=T,
countChimericFragments=F,
fcounts_clib = fcounts_clib)$stat
fcounts_clib = fcounts_clib,
isIntronInput = ifelse(type == "in", 1, 0))$stat
fn<-paste0(abamfile,".featureCounts.bam")
nfn<-paste0(abamfile,".",type,".featureCounts.bam")

Expand Down
118 changes: 61 additions & 57 deletions statsFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,8 @@ splitRG_stats<-function(bccount,mem){
return(bccount)
}

prep_samtools_stats <- function(featfiles,bccount,cores,samtoolsexc){
prep_samtools_stats <- function(featfile,bccount,inex,cores,samtoolsexc){
print("Extracting reads from bam file(s)...")
nfiles=length(featfiles)
nchunks <- length(unique(bccount$chunkID))
all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")

Expand All @@ -43,43 +42,40 @@ prep_samtools_stats <- function(featfiles,bccount,cores,samtoolsexc){
rgfile <- all_rgfiles[i]
chunks <- bccount[chunkID==i]$XC
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)

}

headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
write(headerXX,"freadHeader")

headercommand <- "cat freadHeader > "
layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
samcommand <- paste(samtoolsexc," view -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x UB",layoutflag," -@")
grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/XS:Z://' | sed 's/XT:Z://' | grep -F -f "
samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x UB -x XS -x UX -x EN -x IS -x IN",layoutflag," -@")

outfiles_ex <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".ex.",1:nchunks,".txt")
system(paste(headercommand,outfiles_ex,collapse = "; "))

if(length(featfiles)==1){
cpusperchunk <- round(cores/nchunks,0)
ex_cmd <- paste(samcommand,cpusperchunk,featfiles[1],grepcommand,all_rgfiles,">>",outfiles_ex," & ",collapse = " ")
outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
system(paste(headercommand,outfiles,collapse = "; "))

system(paste(ex_cmd,"wait"))
}else{
cpusperchunk <- round(cores/(2*nchunks),0)
ex_cmd <- paste(samcommand,cpusperchunk,featfiles[1],grepcommand,all_rgfiles,">>",outfiles_ex," & ",collapse = " ")
cpusperchunk <- round(cores/nchunks,0)

outfiles_in <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".in.",1:nchunks,".txt")
system(paste(headercommand,outfiles_in,collapse = "; "))
if(inex == FALSE){
grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/ES:Z://' | sed 's/GE:Z://' | grep -F -f "
ex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

in_cmd <- paste(samcommand,cpusperchunk,featfiles[2],grepcommand,all_rgfiles,">>",outfiles_in," & ",collapse = " ")
system(paste(ex_cmd,"wait"))
}else{
grepcommand <- " | cut -f12,13,14,15 | sed 's/BC:Z://' | sed 's/ES:Z://' | sed 's/Z://g' | grep -F -f "
inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

system(paste(ex_cmd,in_cmd,"wait"))
system(paste(inex_cmd,"wait"))
}

system("rm freadHeader")
system(paste("rm",all_rgfiles))

return(outfiles_ex)
return(outfiles)
}

sumstatBAM <- function(featfiles,cores,outdir,user_seq,bc,outfile,samtoolsexc){
sumstatBAM <- function(featfile,cores,outdir,user_seq,bc,inex,outfile,samtoolsexc){
require(data.table)
# chunk barcodes
bccount_file <- paste0(opt$out_dir,"/", opt$project, ".BCstats.txt")
Expand All @@ -89,55 +85,63 @@ sumstatBAM <- function(featfiles,cores,outdir,user_seq,bc,outfile,samtoolsexc){
}
bccount <- splitRG_stats(bccount=bccount, mem= opt$mem_limit)

samouts <- prep_samtools_stats(featfiles = featfiles,
bccount = bccount,
cores = opt$num_threads,
samtoolsexc=samtoolsexc)
samouts <- prep_samtools_stats(featfile = featfile,
bccount = bccount,
inex = inex,
cores = opt$num_threads,
samtoolsexc= samtoolsexc)


for(i in unique(bccount$chunkID)){
print(paste("Working on chunk",i))
#rgfile = paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".currentRGgroup.txt")
#chunks = bccount[chunkID==i]$XC
#write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)

## minifunction for string operations
#headerXX<-paste( c(paste0("V",1:3)) ,collapse="\t")
#write(headerXX,paste(outdir,"freadHeader",sep="/"))
#samcommand<-paste("cat freadHeader; ",samtoolsexc," view -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x UB -@",cores)
samfile_ex <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".ex.",i,".txt")
if(grepl(pattern = ".filtered.tagged.Aligned.out.bam.in.featureCounts.bam",featfiles[2])){
samfile_in <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".in.",i,".txt")
}else{
samfile_in <- samfile_ex
}

samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",i,".txt")

#tmp<-data.table::fread(paste(samcommand,featfiles[1],"| cut -f12,13,14 | sed 's/BC:Z://' | sed 's/XS:Z://' | sed 's/XT:Z://' | grep -F -f ",rgfile), na.strings=c(""),
tmp <- data.table::fread(samfile_ex, na.strings=c(""),
if(inex==FALSE){
tmp<-data.table::fread(samfile, na.strings=c(""),
select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","XS","GE") )[
#,"GEin":=fread(paste(samcommand,featfiles[2],"| cut -f12,13,14 | sed 's/BC:Z://' | sed 's/XT:Z://' | grep -F -f ",rgfile),select=3,header=T,fill=T,na.strings=c(""),colClasses = "character")
,"GEin":=fread(samfile_in,select=3,header=T,fill=T,na.strings=c(""),colClasses = "character")
][ ,"ftype":="NA"
][is.na(GEin)==F,ftype:="Intron"
][is.na(GE)==F ,ftype:="Exon"
][is.na(GE) ,GE:=GEin
][ftype!="NA", XS:=ftype
][GE %in% user_seq[,V1], XS:="User"
][,RG:=as.character(RG)
][!(RG %in% bc[,XC]), RG:="bad"
][ ,c("GEin","GE","ftype"):=NULL
][,list(.N),by=c("RG","XS")
][,type:=.rmUnassigned(XS)
][type=="NoFeatures",type:="Intergenic"
][,XS:=NULL]

,"ftype":="NA"
][is.na(GE)==F, ftype:="Exon"
][ftype!="NA", XS:=ftype
][GE %in% user_seq[,V1], XS:="User"
][,RG:=as.character(RG)
][!(RG %in% bc[,XC]), RG:="bad"
][ ,c("GEin","GE","ftype"):=NULL
][,list(.N),by=c("RG","XS")
][,type:=.rmUnassigned(XS)
][type=="NoFeatures",type:="Intergenic"
][,XS:=NULL]
}else{
tmp<-data.table::fread(samfile, na.strings=c(""),
select=c(1,2,3,4),header=T,fill=T,colClasses = "character" , col.names = c("RG","XS","V3","V4") )[
][ , V3_id := substr(V3,1,2)
][ , V4_id := substr(V4,1,2)
][ , V3 := substr(V3,4,nchar(V3))
][ , V4 := substr(V4,4,nchar(V4))
][ V3_id == "GE", GE := V3
][ V3_id == "GI", GEin := V3
][ V4_id == "GI", GEin := V4
][ ,c("V3_id","V4_id","V3","V4") := NULL
][ ,"ftype":="NA"
][is.na(GEin)==F,ftype:="Intron"
][is.na(GE)==F, ftype:="Exon"
][is.na(GE),GE:=GEin
][ftype!="NA", XS:=ftype
][GE %in% user_seq[,V1], XS:="User"
][,RG:=as.character(RG)
][!(RG %in% bc[,XC]), RG:="bad"
][ ,c("GE","ftype"):=NULL
][,list(.N),by=c("RG","XS")
][,type:=.rmUnassigned(XS)
][type=="NoFeatures",type:="Intergenic"
][,XS:=NULL]
}
if(i==1){
mapCount<-tmp
}else{
mapCount<-rbind(mapCount,tmp)
}
system(paste("rm ",samfile_ex,samfile_in))
system(paste("rm ",samfile))
}
saveRDS(mapCount,file=outfile)
#system(paste0("rm ",outdir,"/freadHeader"))
Expand Down
Loading

0 comments on commit 976fd4d

Please sign in to comment.