diff --git a/.gitignore b/.gitignore index f284581c..2401ffb3 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ other_test/ MACS3/IO/BedGraphIO.c MACS3/IO/Parser.c MACS3/IO/PeakIO.c +MACS3/IO/BAM.c MACS3/Signal/BedGraph.c MACS3/Signal/CallPeakUnit.c MACS3/Signal/FixWidthTrack.c @@ -28,6 +29,7 @@ MACS3/Signal/PairedEndTrack.c MACS3/Signal/PeakDetect.c MACS3/Signal/PeakModel.c MACS3/Signal/Pileup.c +MACS3/Signal/PileupV2.c MACS3/Signal/Prob.c MACS3/Signal/RACollection.c MACS3/Signal/ReadAlignment.c @@ -37,6 +39,8 @@ MACS3/Signal/Signal.c MACS3/Signal/SignalProcessing.c MACS3/Signal/UnitigRACollection.c MACS3/Signal/VariantStat.c +MACS3/Signal/PeakVariants.c +MACS3/Signal/PosReadsInfo.c # MacOSX temp .DS_Store diff --git a/MACS3/Signal/PeakVariants.py b/MACS3/Signal/PeakVariants.py new file mode 100644 index 00000000..451b38b0 --- /dev/null +++ b/MACS3/Signal/PeakVariants.py @@ -0,0 +1,402 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 17:12:29 Tao Liu> + +"""Module for SAPPER PeakVariants class. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +from copy import copy +import cython +from cython.cimports.cpython import bool + + +@cython.cclass +class Variant: + v_ref_pos: cython.long + v_ref_allele: str + v_alt_allele: str + v_GQ: cython.int + v_filter: str + v_type: str + v_mutation_type: str + v_top1allele: str + v_top2allele: str + v_DPT: cython.int + v_DPC: cython.int + v_DP1T: cython.int + v_DP2T: cython.int + v_DP1C: cython.int + v_DP2C: cython.int + v_PLUS1T: cython.int + v_PLUS2T: cython.int + v_MINUS1T: cython.int + v_MINUS2T: cython.int + v_deltaBIC: cython.float + v_BIC_homo_major: cython.float + v_BIC_homo_minor: cython.float + v_BIC_heter_noAS: cython.float + v_BIC_heter_AS: cython.float + v_AR: cython.float + v_GT: str + v_DP: cython.int + v_PL_00: cython.int + v_PL_01: cython.int + v_PL_11: cython.int + + def __init__(self, + ref_allele: str, + alt_allele: str, + GQ: cython.int, + filter: str, + type: str, + mutation_type: str, + top1allele: str, + top2allele: str, + DPT: cython.int, + DPC: cython.int, + DP1T: cython.int, + DP2T: cython.int, + DP1C: cython.int, + DP2C: cython.int, + PLUS1T: cython.int, + PLUS2T: cython.int, + MINUS1T: cython.int, + MINUS2T: cython.int, + deltaBIC: cython.float, + BIC_homo_major: cython.float, + BIC_homo_minor: cython.float, + BIC_heter_noAS: cython.float, + BIC_heter_AS: cython.float, + AR: cython.float, + GT: str, + DP: cython.int, + PL_00: cython.int, + PL_01: cython.int, + PL_11: cython.int): + self.v_ref_allele = ref_allele + self.v_alt_allele = alt_allele + self.v_GQ = GQ + self.v_filter = filter + self.v_type = type + self.v_mutation_type = mutation_type + self.v_top1allele = top1allele + self.v_top2allele = top2allele + self.v_DPT = DPT + self.v_DPC = DPC + self.v_DP1T = DP1T + self.v_DP2T = DP2T + self.v_DP1C = DP1C + self.v_DP2C = DP2C + self.v_PLUS1T = PLUS1T + self.v_PLUS2T = PLUS2T + self.v_MINUS1T = MINUS1T + self.v_MINUS2T = MINUS2T + self.v_deltaBIC = deltaBIC + self.v_BIC_homo_major = BIC_homo_major + self.v_BIC_homo_minor = BIC_homo_minor + self.v_BIC_heter_noAS = BIC_heter_noAS + self.v_BIC_heter_AS = BIC_heter_AS + self.v_AR = AR + self.v_GT = GT + self.v_DP = DP + self.v_PL_00 = PL_00 + self.v_PL_01 = PL_01 + self.v_PL_11 = PL_11 + + def __getstate__(self): + # self.v_ref_pos, + return (self.v_ref_allele, + self.v_alt_allele, + self.v_GQ, + self.v_filter, + self.v_type, + self.v_mutation_type, + self.v_top1allele, + self.v_top2allele, + self.v_DPT, + self.v_DPC, + self.v_DP1T, + self.v_DP2T, + self.v_DP1C, + self.v_DP2C, + self.v_PLUS1T, + self.v_PLUS2T, + self.v_MINUS1T, + self.v_MINUS2T, + self.v_deltaBIC, + self.v_BIC_homo_major, + self.v_BIC_homo_minor, + self.v_BIC_heter_noAS, + self.v_BIC_heter_AS, + self.v_AR, + self.v_GT, + self.v_DP, + self.v_PL_00, + self.v_PL_01, + self.v_PL_11) + + def __setstate__(self, state): + # self.v_ref_pos, + (self.v_ref_allele, + self.v_alt_allele, + self.v_GQ, + self.v_filter, + self.v_type, + self.v_mutation_type, + self.v_top1allele, + self.v_top2allele, + self.v_DPT, + self.v_DPC, + self.v_DP1T, + self.v_DP2T, + self.v_DP1C, + self.v_DP2C, + self.v_PLUS1T, + self.v_PLUS2T, + self.v_MINUS1T, + self.v_MINUS2T, + self.v_deltaBIC, + self.v_BIC_homo_major, + self.v_BIC_homo_minor, + self.v_BIC_heter_noAS, + self.v_BIC_heter_AS, + self.v_AR, + self.v_GT, + self.v_DP, + self.v_PL_00, + self.v_PL_01, + self.v_PL_11) = state + + @cython.ccall + def is_indel(self) -> bool: + if self.v_mutation_type.find("Insertion") != -1 or self.v_mutation_type.find("Deletion") != -1: + return True + else: + return False + + @cython.ccall + def is_only_del(self) -> bool: + if self.v_mutation_type == "Deletion": + return True + else: + return False + + @cython.ccall + def is_only_insertion(self) -> bool: + if self.v_mutation_type == "Insertion": + return True + else: + return False + + def __getitem__(self, keyname): + if keyname == "ref_allele": + return self.v_ref_allele + elif keyname == "alt_allele": + return self.v_alt_allele + elif keyname == "top1allele": + return self.v_top1allele + elif keyname == "top2allele": + return self.v_top2allele + elif keyname == "type": + return self.type + elif keyname == "mutation_type": + return self.mutation_type + else: + raise Exception("keyname is not accessible:", keyname) + + def __setitem__(self, keyname, v): + if keyname == "ref_allele": + self.v_ref_allele = v + elif keyname == "alt_allele": + self.v_alt_allele = v + elif keyname == "top1allele": + self.v_top1allele = v + elif keyname == "top2allele": + self.v_top2allele = v + elif keyname == "type": + self.type = v + elif keyname == "mutation_type": + self.mutation_type = v + else: + raise Exception("keyname is not accessible:", keyname) + + @cython.ccall + def is_refer_biased_01(self, + ar: cython.float = 0.85) -> bool: + if self.v_AR >= ar and self.v_ref_allele == self.v_top1allele: + return True + else: + return False + + @cython.ccall + def top1isreference(self) -> bool: + if self.v_ref_allele == self.v_top1allele: + return True + else: + return False + + @cython.ccall + def top2isreference(self) -> bool: + if self.v_ref_allele == self.v_top2allele: + return True + else: + return False + + @cython.ccall + def toVCF(self) -> str: + return "\t".join((self.v_ref_allele, self.v_alt_allele, "%d" % self.v_GQ, self.v_filter, + "M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % + (self.v_type, self.v_mutation_type, self.v_DPT, self.v_DPC, self.v_DP1T, self.v_top1allele, + self.v_DP2T, self.v_top2allele, self.v_DP1C, self.v_top1allele, self.v_DP2C, self.v_top2allele, + self.v_PLUS1T, self.v_PLUS2T, self.v_MINUS1T, self.v_MINUS2T, + self.v_deltaBIC, + self.v_BIC_homo_major, self.v_BIC_homo_minor, self.v_BIC_heter_noAS,self.v_BIC_heter_AS, + self.v_AR + ), + "GT:DP:GQ:PL", + "%s:%d:%d:%d,%d,%d" % (self.v_GT, self.v_DP, self.v_GQ, self.v_PL_00, self.v_PL_01, self.v_PL_11) + )) + + +@cython.cclass +class PeakVariants: + chrom: str + d_Variants: dict + start: cython.long + end: cython.long + refseq: bytes + + def __init__(self, + chrom: str, + start: cython.long, + end: cython.long, + s: bytes): + self.chrom = chrom + self.d_Variants = {} + self.start = start + self.end = end + self.refseq = s + + def __getstate__(self): + return (self.d_Variants, self.chrom) + + def __setstate__(self, state): + (self.d_Variants, self.chrom) = state + + @cython.ccall + def n_variants(self) -> cython.int: + return len(self.d_Variants) + + @cython.ccall + def add_variant(self, p: cython.long, v: Variant): + self.d_Variants[p] = v + + @cython.ccall + def has_indel(self) -> bool: + p: cython.long + + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_indel(): + return True + return False + + @cython.ccall + def has_refer_biased_01(self) -> bool: + p: cython.long + + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_refer_biased_01(): + return True + return False + + @cython.ccall + def get_refer_biased_01s(self) -> list: + ret_poss: list = [] + p: cython.long + + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_refer_biased_01(): + ret_poss.append(p) + return ret_poss + + @cython.ccall + def remove_variant(self, p: cython.long): + assert p in self.d_Variants + self.d_Variants.pop(p) + + @cython.ccall + def replace_variant(self, p: cython.long, v: Variant): + assert p in self.d_Variants + self.d_Variants[p] = v + + @cython.ccall + def fix_indels(self): + p0: cython.long + p1: cython.long + p: cython.long + + # merge continuous deletion + p0 = -1 #start of deletion chunk + p1 = -1 #end of deletion chunk + for p in sorted(self.d_Variants.keys()): + if p == p1+1 and self.d_Variants[p].is_only_del() and self.d_Variants[p0].is_only_del(): + # we keep p0, remove p, and add p's ref_allele to p0, keep other information as in p0 + if self.d_Variants[p0].top1isreference: + if self.d_Variants[p0]["top1allele"] == "*": + self.d_Variants[p0]["top1allele"] = "" + self.d_Variants[p0]["top1allele"] += self.d_Variants[p]["ref_allele"] + elif self.d_Variants[p0].top2isreference: + if self.d_Variants[p0]["top2allele"] == "*": + self.d_Variants[p0]["top2allele"] = "" + self.d_Variants[p0]["top2allele"] += self.d_Variants[p]["ref_allele"] + self.d_Variants[p0]["ref_allele"] += self.d_Variants[p]["ref_allele"] + self.d_Variants.pop(p) + p1 = p + else: + p0 = p + p1 = p + + # fix deletion so that if the preceding base is 0/0 -- i.e. not in d_Variants, the reference base will be added. + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_only_del(): + if not ((p-1) in self.d_Variants): + if p > self.start: # now add the reference base + self.d_Variants[p-1] = copy(self.d_Variants[p]) + rs = str(self.refseq) + self.d_Variants[p-1]["ref_allele"] = rs[p - self.start] + self.d_Variants[p-1]["ref_allele"] + self.d_Variants[p-1]["alt_allele"] = rs[p - self.start] + if self.d_Variants[p].top1isreference: + self.d_Variants[p-1]["top1allele"] = self.d_Variants[p-1]["ref_allele"] + self.d_Variants[p-1]["top2allele"] = self.d_Variants[p-1]["alt_allele"] + elif self.d_Variants[p].top2isreference: + self.d_Variants[p-1]["top1allele"] = self.d_Variants[p-1]["alt_allele"] + self.d_Variants[p-1]["top2allele"] = self.d_Variants[p-1]["ref_allele"] + self.d_Variants.pop(p) + + # remove indel if a deletion is immediately following an + # insertion -- either a third genotype is found which is not + # allowed in this version of sapper, or problem caused by + # assembling in a simple repeat region. + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_only_del(): + if (p-1) in self.d_Variants and self.d_Variants[p-1].is_only_insertion(): + self.d_Variants.pop(p) + self.d_Variants.pop(p - 1) + return + + @cython.ccall + def toVCF(self) -> str: + p: cython.long + res: str + + res = "" + for p in sorted(self.d_Variants.keys()): + res += "\t".join((self.chrom, str(p+1), ".", self.d_Variants[p].toVCF())) + "\n" + return res diff --git a/MACS3/Signal/PeakVariants.pyx b/MACS3/Signal/PeakVariants.pyx deleted file mode 100644 index 485988d1..00000000 --- a/MACS3/Signal/PeakVariants.pyx +++ /dev/null @@ -1,358 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2020-12-04 22:11:09 Tao Liu> - -"""Module for SAPPER PeakVariants class. - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from copy import copy -from cpython cimport bool - -cdef class Variant: - cdef: - long v_ref_pos - str v_ref_allele - str v_alt_allele - int v_GQ - str v_filter - str v_type - str v_mutation_type - str v_top1allele - str v_top2allele - int v_DPT - int v_DPC - int v_DP1T - int v_DP2T - int v_DP1C - int v_DP2C - int v_PLUS1T - int v_PLUS2T - int v_MINUS1T - int v_MINUS2T - float v_deltaBIC - float v_BIC_homo_major - float v_BIC_homo_minor - float v_BIC_heter_noAS - float v_BIC_heter_AS - float v_AR - str v_GT - int v_DP - int v_PL_00 - int v_PL_01 - int v_PL_11 - - def __init__ ( self, str ref_allele, str alt_allele, int GQ, str filter, str type, str mutation_type, - str top1allele, str top2allele, int DPT, int DPC, int DP1T, int DP2T, int DP1C, int DP2C, - int PLUS1T, int PLUS2T, int MINUS1T, int MINUS2T, - float deltaBIC, float BIC_homo_major, float BIC_homo_minor, float BIC_heter_noAS, float BIC_heter_AS, - float AR, str GT, int DP, int PL_00, int PL_01, int PL_11): - self.v_ref_allele = ref_allele - self.v_alt_allele = alt_allele - self.v_GQ = GQ - self.v_filter = filter - self.v_type = type - self.v_mutation_type = mutation_type - self.v_top1allele = top1allele - self.v_top2allele = top2allele - self.v_DPT = DPT - self.v_DPC = DPC - self.v_DP1T = DP1T - self.v_DP2T = DP2T - self.v_DP1C = DP1C - self.v_DP2C = DP2C - self.v_PLUS1T = PLUS1T - self.v_PLUS2T = PLUS2T - self.v_MINUS1T = MINUS1T - self.v_MINUS2T = MINUS2T - self.v_deltaBIC = deltaBIC - self.v_BIC_homo_major = BIC_homo_major - self.v_BIC_homo_minor = BIC_homo_minor - self.v_BIC_heter_noAS = BIC_heter_noAS - self.v_BIC_heter_AS = BIC_heter_AS - self.v_AR = AR - self.v_GT = GT - self.v_DP = DP - self.v_PL_00 = PL_00 - self.v_PL_01 = PL_01 - self.v_PL_11 = PL_11 - - def __getstate__ ( self ): - return ( - #self.v_ref_pos, - self.v_ref_allele, - self.v_alt_allele, - self.v_GQ, - self.v_filter, - self.v_type, - self.v_mutation_type, - self.v_top1allele, - self.v_top2allele, - self.v_DPT, - self.v_DPC, - self.v_DP1T, - self.v_DP2T, - self.v_DP1C, - self.v_DP2C, - self.v_PLUS1T, - self.v_PLUS2T, - self.v_MINUS1T, - self.v_MINUS2T, - self.v_deltaBIC, - self.v_BIC_homo_major, - self.v_BIC_homo_minor, - self.v_BIC_heter_noAS, - self.v_BIC_heter_AS, - self.v_AR, - self.v_GT, - self.v_DP, - self.v_PL_00, - self.v_PL_01, - self.v_PL_11 ) - - def __setstate__ ( self, state ): - ( #self.v_ref_pos, - self.v_ref_allele, - self.v_alt_allele, - self.v_GQ, - self.v_filter, - self.v_type, - self.v_mutation_type, - self.v_top1allele, - self.v_top2allele, - self.v_DPT, - self.v_DPC, - self.v_DP1T, - self.v_DP2T, - self.v_DP1C, - self.v_DP2C, - self.v_PLUS1T, - self.v_PLUS2T, - self.v_MINUS1T, - self.v_MINUS2T, - self.v_deltaBIC, - self.v_BIC_homo_major, - self.v_BIC_homo_minor, - self.v_BIC_heter_noAS, - self.v_BIC_heter_AS, - self.v_AR, - self.v_GT, - self.v_DP, - self.v_PL_00, - self.v_PL_01, - self.v_PL_11 ) = state - - cpdef bool is_indel ( self ): - if self.v_mutation_type.find("Insertion") != -1 or self.v_mutation_type.find("Deletion") != -1: - return True - else: - return False - - cpdef bool is_only_del ( self ): - if self.v_mutation_type == "Deletion": - return True - else: - return False - - cpdef bool is_only_insertion ( self ): - if self.v_mutation_type == "Insertion": - return True - else: - return False - - def __getitem__ ( self, keyname ): - if keyname == "ref_allele": - return self.v_ref_allele - elif keyname == "alt_allele": - return self.v_alt_allele - elif keyname == "top1allele": - return self.v_top1allele - elif keyname == "top2allele": - return self.v_top2allele - elif keyname == "type": - return self.type - elif keyname == "mutation_type": - return self.mutation_type - else: - raise Exception("keyname is not accessible:", keyname) - - def __setitem__ ( self, keyname, v ): - if keyname == "ref_allele": - self.v_ref_allele = v - elif keyname == "alt_allele": - self.v_alt_allele = v - elif keyname == "top1allele": - self.v_top1allele = v - elif keyname == "top2allele": - self.v_top2allele = v - elif keyname == "type": - self.type = v - elif keyname == "mutation_type": - self.mutation_type = v - else: - raise Exception("keyname is not accessible:", keyname) - - cpdef bool is_refer_biased_01 ( self, float ar=0.85 ): - if self.v_AR >= ar and self.v_ref_allele == self.v_top1allele: - return True - else: - return False - - - cpdef bool top1isreference ( self ): - if self.v_ref_allele == self.v_top1allele: - return True - else: - return False - - cpdef bool top2isreference ( self ): - if self.v_ref_allele == self.v_top2allele: - return True - else: - return False - - cpdef str toVCF ( self ): - return "\t".join( ( self.v_ref_allele, self.v_alt_allele, "%d" % self.v_GQ, self.v_filter, - "M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \ - (self.v_type, self.v_mutation_type, self.v_DPT, self.v_DPC, self.v_DP1T, self.v_top1allele, - self.v_DP2T, self.v_top2allele, self.v_DP1C, self.v_top1allele, self.v_DP2C, self.v_top2allele, - self.v_PLUS1T, self.v_PLUS2T, self.v_MINUS1T, self.v_MINUS2T, - self.v_deltaBIC, - self.v_BIC_homo_major, self.v_BIC_homo_minor, self.v_BIC_heter_noAS,self.v_BIC_heter_AS, - self.v_AR - ), - "GT:DP:GQ:PL", - "%s:%d:%d:%d,%d,%d" % (self.v_GT, self.v_DP, self.v_GQ, self.v_PL_00, self.v_PL_01, self.v_PL_11) - ) ) - -cdef class PeakVariants: - cdef: - str chrom - dict d_Variants - long start - long end - bytes refseq - - - def __init__ ( self, str chrom, long start, long end, bytes s ): - self.chrom = chrom - self.d_Variants = {} - self.start = start - self.end = end - self.refseq = s - - def __getstate__ ( self ): - return ( self.d_Variants, self.chrom ) - - def __setstate__ ( self, state ): - ( self.d_Variants, self.chrom ) = state - - cpdef int n_variants ( self ): - return len(self.d_Variants) - - cpdef add_variant ( self, long p, Variant v ): - self.d_Variants[ p ] = v - - cpdef bool has_indel ( self ): - cdef: - long p - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_indel(): - return True - return False - - cpdef bool has_refer_biased_01 ( self ): - cdef: - long p - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_refer_biased_01(): - return True - return False - - cpdef list get_refer_biased_01s ( self ): - cdef: - list ret_poss = [] - long p - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_refer_biased_01(): - ret_poss.append( p ) - return ret_poss - - cpdef remove_variant ( self, long p ): - assert p in self.d_Variants - self.d_Variants.pop( p ) - - cpdef replace_variant ( self, long p, Variant v ): - assert p in self.d_Variants - self.d_Variants[ p ] = v - - cpdef fix_indels ( self ): - cdef: - long p0, p1, p - - # merge continuous deletion - p0 = -1 #start of deletion chunk - p1 = -1 #end of deletion chunk - for p in sorted( self.d_Variants.keys() ): - if p == p1+1 and self.d_Variants[ p ].is_only_del() and self.d_Variants[ p0 ].is_only_del() : - # we keep p0, remove p, and add p's ref_allele to p0, keep other information as in p0 - if self.d_Variants[ p0 ].top1isreference: - if self.d_Variants[ p0 ]["top1allele"] == "*": - self.d_Variants[ p0 ]["top1allele"] = "" - self.d_Variants[ p0 ]["top1allele"] += self.d_Variants[ p ]["ref_allele"] - elif self.d_Variants[ p0 ].top2isreference: - if self.d_Variants[ p0 ]["top2allele"] == "*": - self.d_Variants[ p0 ]["top2allele"] = "" - self.d_Variants[ p0 ]["top2allele"] += self.d_Variants[ p ]["ref_allele"] - self.d_Variants[ p0 ]["ref_allele"] += self.d_Variants[ p ]["ref_allele"] - self.d_Variants.pop ( p ) - p1 = p - else: - p0 = p - p1 = p - - # fix deletion so that if the preceding base is 0/0 -- i.e. not in d_Variants, the reference base will be added. - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_only_del(): - if not( ( p-1 ) in self.d_Variants ): - if p > self.start: # now add the reference base - self.d_Variants[ p-1 ] = copy(self.d_Variants[ p ]) - rs = str(self.refseq) - self.d_Variants[ p-1 ]["ref_allele"] = rs[ p - self.start ] + self.d_Variants[ p-1 ]["ref_allele"] - self.d_Variants[ p-1 ]["alt_allele"] = rs[ p - self.start ] - if self.d_Variants[ p ].top1isreference: - self.d_Variants[ p-1 ]["top1allele"] = self.d_Variants[ p-1 ]["ref_allele"] - self.d_Variants[ p-1 ]["top2allele"] = self.d_Variants[ p-1 ]["alt_allele"] - elif self.d_Variants[ p ].top2isreference: - self.d_Variants[ p-1 ]["top1allele"] = self.d_Variants[ p-1 ]["alt_allele"] - self.d_Variants[ p-1 ]["top2allele"] = self.d_Variants[ p-1 ]["ref_allele"] - self.d_Variants.pop( p ) - - # remove indel if a deletion is immediately following an - # insertion -- either a third genotype is found which is not - # allowed in this version of sapper, or problem caused by - # assembling in a simple repeat region. - - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_only_del(): - if ( p-1 ) in self.d_Variants and self.d_Variants[p-1].is_only_insertion(): - self.d_Variants.pop( p ) - self.d_Variants.pop( p - 1 ) - return - - cpdef str toVCF ( self ): - cdef: - long p - str res - res = "" - for p in sorted( self.d_Variants.keys() ): - res += "\t".join( ( self.chrom, str(p+1), ".", self.d_Variants[ p ].toVCF() ) ) + "\n" - return res - - diff --git a/MACS3/Signal/PosReadsInfo.py b/MACS3/Signal/PosReadsInfo.py new file mode 100644 index 00000000..43b52538 --- /dev/null +++ b/MACS3/Signal/PosReadsInfo.py @@ -0,0 +1,670 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 16:59:53 Tao Liu> + +"""Module for SAPPER PosReadsInfo class. + +Copyright (c) 2017 Tao Liu + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). + +@status: experimental +@version: $Revision$ +@author: Tao Liu +@contact: tliu4@buffalo.edu +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +from MACS3.Signal.VariantStat import (CalModel_Homo, + CalModel_Heter_noAS, + CalModel_Heter_AS) +# calculate_GQ, +# calculate_GQ_heterASsig) +from MACS3.Signal.Prob import binomial_cdf +from MACS3.Signal.PeakVariants import Variant + +import cython +import numpy as np +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool + +LN10 = 2.3025850929940458 + +# ------------------------------------ +# constants +# ------------------------------------ +__version__ = "Parser $Revision$" +__author__ = "Tao Liu " +__doc__ = "All Parser classes" + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class PosReadsInfo: + ref_pos: cython.long + ref_allele: cython.bytes + alt_allele: cython.bytes + filterout: bool # if true, do not output + bq_set_T: dict # {A:[], C:[], G:[], T:[], N:[]} for treatment + bq_set_C: dict + n_reads_T: dict # {A:[], C:[], G:[], T:[], N:[]} for treatment + n_reads_C: dict + n_reads: dict + n_strand: list # [{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only + n_tips: dict # count of nt appearing at tips + top1allele: cython.bytes + top2allele: cython.bytes + top12alleles_ratio: cython.float + lnL_homo_major: cython.double + lnL_heter_AS: cython.double + lnL_heter_noAS: cython.double + lnL_homo_minor: cython.double + BIC_homo_major: cython.double + BIC_heter_AS: cython.double + BIC_heter_noAS: cython.double + BIC_homo_minor: cython.double + PL_00: cython.double + PL_01: cython.double + PL_11: cython.double + deltaBIC: cython.double + heter_noAS_kc: cython.int + heter_noAS_ki: cython.int + heter_AS_kc: cython.int + heter_AS_ki: cython.int + heter_AS_alleleratio: cython.double + GQ_homo_major: cython.int + GQ_heter_noAS: cython.int + GQ_heter_AS: cython.int # phred scale of prob by standard formular + GQ_heter_ASsig: cython.int # phred scale of prob, to measure the difference between AS and noAS + GQ: cython.double + GT: str + type: str + mutation_type: str # SNV or Insertion or Deletion + hasfermiinfor: bool # if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false; + fermiNTs: bytearray + + def __cinit__(self): + self.filterout = False + self.GQ = 0 + self.GT = "unsure" + self.alt_allele = b'.' + + def __init__(self, + ref_pos: cython.long, + ref_allele: cython.bytes): + self.ref_pos = ref_pos + self.ref_allele = ref_allele + self.bq_set_T = {ref_allele: [], b'A': [], b'C': [], b'G': [], b'T': [], b'N': [], b'*': []} + self.bq_set_C = {ref_allele: [], b'A': [], b'C': [], b'G': [], b'T': [], b'N': [], b'*': []} + self.n_reads_T = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + self.n_reads_C = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + self.n_reads = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + self.n_strand = [{ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0}, + {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0}] + self.n_tips = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + + def __getstate__(self): + return (self.ref_pos, + self.ref_allele, + self.alt_allele, + self.filterout, + self.bq_set_T, + self.bq_set_C, + self.n_reads_T, + self.n_reads_C, + self.n_reads, + self.n_strand, + self.n_tips, + self.top1allele, + self.top2allele, + self.top12alleles_ratio, + self.lnL_homo_major, + self.lnL_heter_AS, + self.lnL_heter_noAS, + self.lnL_homo_minor, + self.BIC_homo_major, + self.BIC_heter_AS, + self.BIC_heter_noAS, + self.BIC_homo_minor, + self.heter_noAS_kc, + self.heter_noAS_ki, + self.heter_AS_kc, + self.heter_AS_ki, + self.heter_AS_alleleratio, + self.GQ_homo_major, + self.GQ_heter_noAS, + self.GQ_heter_AS, + self.GQ_heter_ASsig, + self.GQ, + self.GT, + self.type, + self.hasfermiinfor, + self.fermiNTs) + + def __setstate__(self, state): + (self.ref_pos, + self.ref_allele, + self.alt_allele, + self.filterout, + self.bq_set_T, + self.bq_set_C, + self.n_reads_T, + self.n_reads_C, + self.n_reads, + self.n_strand, + self.n_tips, + self.top1allele, + self.top2allele, + self.top12alleles_ratio, + self.lnL_homo_major, + self.lnL_heter_AS, + self.lnL_heter_noAS, + self.lnL_homo_minor, + self.BIC_homo_major, + self.BIC_heter_AS, + self.BIC_heter_noAS, + self.BIC_homo_minor, + self.heter_noAS_kc, + self.heter_noAS_ki, + self.heter_AS_kc, + self.heter_AS_ki, + self.heter_AS_alleleratio, + self.GQ_homo_major, + self.GQ_heter_noAS, + self.GQ_heter_AS, + self.GQ_heter_ASsig, + self.GQ, + self.GT, + self.type, + self.hasfermiinfor, + self.fermiNTs) = state + + @cython.ccall + def filterflag(self) -> bool: + return self.filterout + + @cython.ccall + def apply_GQ_cutoff(self, + min_homo_GQ: cython.int = 50, + min_heter_GQ: cython.int = 100): + if self.filterout: + return + if self.type.startswith('homo') and self.GQ < min_homo_GQ: + self.filterout = True + elif self.type.startswith('heter') and self.GQ < min_heter_GQ: + self.filterout = True + return + + @cython.ccall + def apply_deltaBIC_cutoff(self, + min_delta_BIC: cython.float = 10): + if self.filterout: + return + if self.deltaBIC < min_delta_BIC: + self.filterout = True + return + + @cython.ccall + def add_T(self, + read_index: cython.int, + read_allele: cython.bytes, + read_bq: cython.int, + strand: cython.int, + tip: bool, + Q: cython.int = 20): + """ Strand 0: plus, 1: minus + + Q is the quality cutoff. By default, only consider Q20 or read_bq > 20. + """ + if read_bq <= Q: + return + if not self.n_reads.has_key(read_allele): + self.bq_set_T[read_allele] = [] + self.bq_set_C[read_allele] = [] + self.n_reads_T[read_allele] = 0 + self.n_reads_C[read_allele] = 0 + self.n_reads[read_allele] = 0 + self.n_strand[0][read_allele] = 0 + self.n_strand[1][read_allele] = 0 + self.n_tips[read_allele] = 0 + self.bq_set_T[read_allele].append(read_bq) + self.n_reads_T[read_allele] += 1 + self.n_reads[read_allele] += 1 + self.n_strand[strand][read_allele] += 1 + if tip: + self.n_tips[read_allele] += 1 + + @cython.ccall + def add_C(self, + read_index: cython.int, + read_allele: cython.bytes, + read_bq: cython.int, + strand: cython.int, + Q: cython.int = 20): + if read_bq <= Q: + return + if not self.n_reads.has_key(read_allele): + self.bq_set_T[read_allele] = [] + self.bq_set_C[read_allele] = [] + self.n_reads_T[read_allele] = 0 + self.n_reads_C[read_allele] = 0 + self.n_reads[read_allele] = 0 + self.n_strand[0][read_allele] = 0 + self.n_strand[1][read_allele] = 0 + self.n_tips[read_allele] = 0 + self.bq_set_C[read_allele].append(read_bq) + self.n_reads_C[read_allele] += 1 + self.n_reads[read_allele] += 1 + + @cython.ccall + def raw_read_depth(self, + opt: str = "all") -> cython.int: + if opt == "all": + return sum(self.n_reads.values()) + elif opt == "T": + return sum(self.n_reads_T.values()) + elif opt == "C": + return sum(self.n_reads_C.values()) + else: + raise Exception("opt should be either 'all', 'T' or 'C'.") + + @cython.ccall + def update_top_alleles(self, + min_top12alleles_ratio: cython.float = 0.8, + min_altallele_count: cython.int = 2, + max_allowed_ar: cython.float = 0.95): + """Identify top1 and top2 NT. the ratio of (top1+top2)/total + """ + [self.top1allele, self.top2allele] = sorted(self.n_reads, + key=self.n_reads_T.get, + reverse=True)[:2] + + # if top2 allele count in ChIP is lower than + # min_altallele_count, or when allele ratio top1/(top1+top2) + # is larger than max_allowed_ar in ChIP, we won't consider + # this allele at all. we set values of top2 allele in + # dictionaries to [] and ignore top2 allele entirely. + # max(self.n_strand[0][self.top2allele], self.n_strand[1][self.top2allele]) < min_altallele_count + # if self.ref_pos == 52608504: + # prself: cython.int.ref_pos, self.n_reads_T[self.top1allele], self.n_reads_T[self.top2allele], self.n_reads_C[self.top1allele], self.n_reads_C[self.top2allele] + if self.n_reads_T[self.top1allele] + self.n_reads_T[self.top2allele] == 0: + self.filterout = True + return + + if (len(self.top1allele) == 1 and len(self.top2allele) == 1) and (self.top2allele != self.ref_allele and ((self.n_reads_T[self.top2allele] - self.n_tips[self.top2allele]) < min_altallele_count) or self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele] + self.n_reads_T[self.top2allele]) > max_allowed_ar): + self.bq_set_T[self.top2allele] = [] + self.bq_set_C[self.top2allele] = [] + self.n_reads_T[self.top2allele] = 0 + self.n_reads_C[self.top2allele] = 0 + self.n_reads[self.top2allele] = 0 + self.n_tips[self.top2allele] = 0 + if (self.top1allele != self.ref_allele and (self.n_reads_T[self.top1allele] - self.n_tips[self.top1allele]) < min_altallele_count): + self.bq_set_T[self.top1allele] = [] + self.bq_set_C[self.top1allele] = [] + self.n_reads_T[self.top1allele] = 0 + self.n_reads_C[self.top1allele] = 0 + self.n_reads[self.top1allele] = 0 + self.n_tips[self.top1allele] = 0 + + if self.n_reads_T[self.top1allele] + self.n_reads_T[self.top2allele] == 0: + self.filterout = True + return + + self.top12alleles_ratio = (self.n_reads[self.top1allele] + self.n_reads[self.top2allele]) / sum(self.n_reads.values()) + if self.top12alleles_ratio < min_top12alleles_ratio: + self.filterout = True + return + + if self.top1allele == self.ref_allele and self.n_reads[self.top2allele] == 0: + # This means this position only contains top1allele which is the ref_allele. So the GT must be 0/0 + self.type = "homo_ref" + self.filterout = True + return + return + + @cython.ccall + def top12alleles(self): + print(self.ref_pos, self.ref_allele) + print("Top1allele", self.top1allele, "Treatment", + self.bq_set_T[self.top1allele], "Control", + self.bq_set_C[self.top1allele]) + print("Top2allele", self.top2allele, "Treatment", + self.bq_set_T[self.top2allele], "Control", + self.bq_set_C[self.top2allele]) + + @cython.ccall + def call_GT(self, max_allowed_ar: cython.float = 0.99): + """Require update_top_alleles being called. + """ + top1_bq_T: cnp.ndarray(cython.int, ndim=1) + top2_bq_T: cnp.ndarray(cython.int, ndim=1) + top1_bq_C: cnp.ndarray(cython.int, ndim=1) + top2_bq_C: cnp.ndarray(cython.int, ndim=1) + tmp_mutation_type: list + tmp_alt: cython.bytes + + if self.filterout: + return + + top1_bq_T = np.array(self.bq_set_T[self.top1allele], dtype="i4") + top2_bq_T = np.array(self.bq_set_T[self.top2allele], dtype="i4") + top1_bq_C = np.array(self.bq_set_C[self.top1allele], dtype="i4") + top2_bq_C = np.array(self.bq_set_C[self.top2allele], dtype="i4") + (self.lnL_homo_major, self.BIC_homo_major) = CalModel_Homo(top1_bq_T, + top1_bq_C, + top2_bq_T, + top2_bq_C) + (self.lnL_homo_minor, self.BIC_homo_minor) = CalModel_Homo(top2_bq_T, + top2_bq_C, + top1_bq_T, + top1_bq_C) + (self.lnL_heter_noAS, self.BIC_heter_noAS) = CalModel_Heter_noAS(top1_bq_T, + top1_bq_C, + top2_bq_T, + top2_bq_C) + (self.lnL_heter_AS, self.BIC_heter_AS) = CalModel_Heter_AS(top1_bq_T, + top1_bq_C, + top2_bq_T, + top2_bq_C, + max_allowed_ar) + + # if self.ref_pos == 71078525: + # print "---" + # prlen: cython.int(top1_bq_T), len(top1_bq_C), len(top2_bq_T), len(top2_bq_C) + # prself: cython.int.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS + # prself: cython.int.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS + + if self.top1allele != self.ref_allele and self.n_reads[self.top2allele] == 0: + # in this case, there is no top2 nt (or socalled minor + # allele) in either treatment or control, we should assume + # it's a 1/1 genotype. We will take 1/1 if it passes BIC + # test (deltaBIC >=2), and will skip this loci if it can't + # pass the test. + + self.deltaBIC = min(self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor) - self.BIC_homo_major + if self.deltaBIC < 2: + self.filterout = True + return + + self.type = "homo" + self.GT = "1/1" + + self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 + self.PL_01 = -10.0 * max(self.lnL_heter_noAS, self.lnL_heter_AS) / LN10 + self.PL_11 = -10.0 * self.lnL_homo_major / LN10 + + self.PL_00 = max(0, self.PL_00 - self.PL_11) + self.PL_01 = max(0, self.PL_01 - self.PL_11) + self.PL_11 = 0 + + self.GQ = min(self.PL_00, self.PL_01) + self.alt_allele = self.top1allele + else: + # assign GQ, GT, and type + if self.ref_allele != self.top1allele and self.BIC_homo_major + 2 <= self.BIC_homo_minor and self.BIC_homo_major + 2 <= self.BIC_heter_noAS and self.BIC_homo_major + 2 <= self.BIC_heter_AS: + self.type = "homo" + self.deltaBIC = min(self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor) - self.BIC_homo_major + self.GT = "1/1" + self.alt_allele = self.top1allele + + self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 + self.PL_01 = -10.0 * max(self.lnL_heter_noAS, self.lnL_heter_AS) / LN10 + self.PL_11 = -10.0 * self.lnL_homo_major / LN10 + + self.PL_00 = self.PL_00 - self.PL_11 + self.PL_01 = self.PL_01 - self.PL_11 + self.PL_11 = 0 + + self.GQ = min(self.PL_00, self.PL_01) + + elif self.BIC_heter_noAS + 2 <= self.BIC_homo_major and self.BIC_heter_noAS + 2 <= self.BIC_homo_minor and self.BIC_heter_noAS + 2 <= self.BIC_heter_AS: + self.type = "heter_noAS" + self.deltaBIC = min(self.BIC_homo_major, self.BIC_homo_minor) - self.BIC_heter_noAS + + self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 + self.PL_01 = -10.0 * self.lnL_heter_noAS / LN10 + self.PL_11 = -10.0 * self.lnL_homo_major / LN10 + + self.PL_00 = self.PL_00 - self.PL_01 + self.PL_11 = self.PL_11 - self.PL_01 + self.PL_01 = 0 + + self.GQ = min(self.PL_00, self.PL_11) + + elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor and self.BIC_heter_AS + 2 <= self.BIC_heter_noAS: + self.type = "heter_AS" + self.deltaBIC = min(self.BIC_homo_major, self.BIC_homo_minor) - self.BIC_heter_AS + + self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 + self.PL_01 = -10.0 * self.lnL_heter_AS / LN10 + self.PL_11 = -10.0 * self.lnL_homo_major / LN10 + + self.PL_00 = self.PL_00 - self.PL_01 + self.PL_11 = self.PL_11 - self.PL_01 + self.PL_01 = 0 + + self.GQ = min(self.PL_00, self.PL_11) + + elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor: + # can't decide if it's noAS or AS + self.type = "heter_unsure" + self.deltaBIC = min(self.BIC_homo_major, self.BIC_homo_minor) - max(self.BIC_heter_AS, self.BIC_heter_noAS) + + self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 + self.PL_01 = -10.0 * max(self.lnL_heter_noAS, self.lnL_heter_AS) / LN10 + self.PL_11 = -10.0 * self.lnL_homo_major / LN10 + + self.PL_00 = self.PL_00 - self.PL_01 + self.PL_11 = self.PL_11 - self.PL_01 + self.PL_01 = 0 + + self.GQ = min(self.PL_00, self.PL_11) + + elif self.ref_allele == self.top1allele and self.BIC_homo_major < self.BIC_homo_minor and self.BIC_homo_major < self.BIC_heter_noAS and self.BIC_homo_major < self.BIC_heter_AS: + self.type = "homo_ref" + # we do not calculate GQ if type is homo_ref + self.GT = "0/0" + self.filterout = True + else: + self.type = "unsure" + self.filterout = True + + if self.type.startswith("heter"): + if self.ref_allele == self.top1allele: + self.alt_allele = self.top2allele + self.GT = "0/1" + elif self.ref_allele == self.top2allele: + self.alt_allele = self.top1allele + self.GT = "0/1" + else: + self.alt_allele = self.top1allele+b','+self.top2allele + self.GT = "1/2" + + tmp_mutation_type = [] + for tmp_alt in self.alt_allele.split(b','): + if tmp_alt == b'*': + tmp_mutation_type.append("Deletion") + elif len(tmp_alt) > 1: + tmp_mutation_type.append("Insertion") + else: + tmp_mutation_type.append("SNV") + self.mutation_type = ",".join(tmp_mutation_type) + return + + @cython.cfunc + def SB_score_ChIP(self, + a: cython.int, + b: cython.int, + c: cython.int, + d: cython.int) -> cython.float: + """ calculate score for filtering variants with strange strand biases. + + a: top1/major allele plus strand + b: top2/minor allele plus strand + c: top1/major allele minus strand + d: top2/minor allele minus strand + + Return a value: cython.float so that if this value >= 1, the variant will be filtered out. + """ + p1_l: cython.double + p1_r: cython.double + p2_l: cython.double + p2_r: cython.double + + if a + b == 0 or c + d == 0: + # if major allele and minor allele both bias to the same strand, allow it + return 0.0 + + # Rule: + # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it. + # or there is no significant bias (0.5) in top2 allele. + + # pra: cython.int, b, c, d + p1_l = binomial_cdf(a, (a+c), 0.5, lower=True) # alternative: less than 0.5 + p1_r = binomial_cdf(c, (a+c), 0.5, lower=True) # greater than 0.5 + p2_l = binomial_cdf(b, (b+d), 0.5, lower=True) # alternative: less than 0.5 + p2_r = binomial_cdf(d, (b+d), 0.5, lower=True) # greater than 0.5 + # prp1_l: cython.int, p1_r, p2_l, p2_r + + if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05): + # we reject loci where the significant biases are inconsistent between top1 and top2 alleles. + return 1.0 + else: + # if b<=2 and d=0 or b=0 and d<=2 -- highly possible FPs + # if (b<=2 and d==0 or b==0 and d<=2): + # return 1 + # can't decide + return 0.0 + + @cython.cfunc + def SB_score_ATAC(self, + a: cython.int, + b: cython.int, + c: cython.int, + d: cython.int) -> cython.float: + """ calculate score for filtering variants with strange strand biases. + + ATAC-seq version + + a: top1/major allele plus strand + b: top2/minor allele plus strand + c: top1/major allele minus strand + d: top2/minor allele minus strand + + Return a value: cython.float so that if this value >= 1, the variant will be filtered out. + """ + p1_l: cython.double + p1_r: cython.double + p2_l: cython.double + p2_r: cython.double + + if a+b == 0 or c+d == 0: + # if major allele and minor allele both bias to the same strand, allow it + return 0.0 + + # Rule: + # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it. + # or there is no significant bias (0.5) in top2 allele. + # pra: cython.int, b, c, d + p1_l = binomial_cdf(a, (a+c), 0.5, lower=True) # alternative: less than 0.5 + p1_r = binomial_cdf(c, (a+c), 0.5, lower=True) # greater than 0.5 + p2_l = binomial_cdf(b, (b+d), 0.5, lower=True) # alternative: less than 0.5 + p2_r = binomial_cdf(d, (b+d), 0.5, lower=True) # greater than 0.5 + # prp1_l: cython.int, p1_r, p2_l, p2_r + + if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05): + # we reject loci where the significant biases are inconsistent between top1 and top2 alleles. + return 1.0 + else: + # can't decide + return 0.0 + + @cython.ccall + def to_vcf(self) -> str: + """Output REF,ALT,QUAL,FILTER,INFO,FORMAT, SAMPLE columns. + """ + vcf_ref: str + vcf_alt: str + vcf_qual: str + vcf_filter: str + vcf_info: str + vcf_format: str + vcf_sample: str + + vcf_ref = self.ref_allele.decode() + vcf_alt = self.alt_allele.decode() + vcf_qual = "%d" % self.GQ + vcf_filter = "." + vcf_info = (b"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % + (self.type.encode(), + self.mutation_type.encode(), + sum(self.n_reads_T.values()), + sum(self.n_reads_C.values()), + self.n_reads_T[self.top1allele], + self.top1allele, + self.n_reads_T[self.top2allele], + self.top2allele, + self.n_reads_C[self.top1allele], + self.top1allele, + self.n_reads_C[self.top2allele], + self.top2allele, + self.n_strand[0][self.top1allele], + self.n_strand[0][self.top2allele], + self.n_strand[1][self.top1allele], + self.n_strand[1][self.top2allele], + self.deltaBIC, + self.BIC_homo_major, + self.BIC_homo_minor, + self.BIC_heter_noAS, + self.BIC_heter_AS, + self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]) + )).decode() + vcf_format = "GT:DP:GQ:PL" + vcf_sample = "%s:%d:%d:%d,%d,%d" % (self.GT, self.raw_read_depth(opt="all"), self.GQ, self.PL_00, self.PL_01, self.PL_11) + return "\t".join((vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample)) + + @cython.ccall + def toVariant(self): + v: Variant + + v = Variant(self.ref_allele.decode(), + self.alt_allele.decode(), + self.GQ, + '.', + self.type, + self.mutation_type, + self.top1allele.decode(), + self.top2allele.decode(), + sum(self.n_reads_T.values()), + sum(self.n_reads_C.values()), + self.n_reads_T[self.top1allele], + self.n_reads_T[self.top2allele], + self.n_reads_C[self.top1allele], + self.n_reads_C[self.top2allele], + self.n_strand[0][self.top1allele], + self.n_strand[0][self.top2allele], + self.n_strand[1][self.top1allele], + self.n_strand[1][self.top2allele], + self.deltaBIC, + self.BIC_homo_major, + self.BIC_homo_minor, + self.BIC_heter_noAS, + self.BIC_heter_AS, + self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]), + self.GT, + self.raw_read_depth(opt="all"), + self.PL_00, + self.PL_01, + self.PL_11) + return v diff --git a/MACS3/Signal/PosReadsInfo.pyx b/MACS3/Signal/PosReadsInfo.pyx deleted file mode 100644 index 93d38348..00000000 --- a/MACS3/Signal/PosReadsInfo.pyx +++ /dev/null @@ -1,600 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2020-12-04 23:10:35 Tao Liu> - -"""Module for SAPPER PosReadsInfo class. - -Copyright (c) 2017 Tao Liu - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). - -@status: experimental -@version: $Revision$ -@author: Tao Liu -@contact: tliu4@buffalo.edu -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from MACS3.Signal.VariantStat import CalModel_Homo, CalModel_Heter_noAS, CalModel_Heter_AS, calculate_GQ, calculate_GQ_heterASsig -from MACS3.Signal.Prob import binomial_cdf -from MACS3.Signal.PeakVariants import Variant - -from cpython cimport bool - -import numpy as np -cimport numpy as np -from numpy cimport uint32_t, uint64_t, int32_t, float32_t - -LN10 = 2.3025850929940458 - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - void *malloc(size_t size) - void *calloc(size_t n, size_t size) - void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "Parser $Revision$" -__author__ = "Tao Liu " -__doc__ = "All Parser classes" - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class PosReadsInfo: - cdef: - long ref_pos - bytes ref_allele - bytes alt_allele - bool filterout # if true, do not output - - dict bq_set_T #{A:[], C:[], G:[], T:[], N:[]} for treatment - dict bq_set_C - dict n_reads_T #{A:[], C:[], G:[], T:[], N:[]} for treatment - dict n_reads_C - dict n_reads - - list n_strand #[{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only - dict n_tips # count of nt appearing at tips - - bytes top1allele - bytes top2allele - float top12alleles_ratio - - double lnL_homo_major,lnL_heter_AS,lnL_heter_noAS,lnL_homo_minor - double BIC_homo_major,BIC_heter_AS,BIC_heter_noAS,BIC_homo_minor - double PL_00, PL_01, PL_11 - double deltaBIC - int heter_noAS_kc, heter_noAS_ki - int heter_AS_kc, heter_AS_ki - double heter_AS_alleleratio - - int GQ_homo_major,GQ_heter_noAS,GQ_heter_AS #phred scale of prob by standard formular - int GQ_heter_ASsig #phred scale of prob, to measure the difference between AS and noAS - - double GQ - - str GT - str type - str mutation_type # SNV or Insertion or Deletion - - bool hasfermiinfor #if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false; - bytearray fermiNTs # - - def __cinit__ ( self ): - self.filterout = False - self.GQ = 0 - self.GT = "unsure" - self.alt_allele = b'.' - - def __init__ ( self, long ref_pos, bytes ref_allele ): - self.ref_pos = ref_pos - self.ref_allele = ref_allele - self.bq_set_T = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] } - self.bq_set_C = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] } - self.n_reads_T = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - self.n_reads_C = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - self.n_reads = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - self.n_strand = [ { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }, { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } ] - self.n_tips = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - - - #cpdef void merge ( self, PosReadsInfo PRI2 ): - # """Merge two PRIs. No check available. - # - # """ - # assert self.ref_pos == PRI2.ref_pos - # assert self.ref_allele == PRI2.ref_allele - # for b in set( self.n_reads.keys() ).union( set( PRI2.n_reads.keys() ) ): - # self.bq_set_T[ b ] = self.bq_set_T.get( b, []).extend( PRI2.bq_set_T.get( b, [] ) ) - # self.bq_set_C[ b ] = self.bq_set_C.get( b, []).extend( PRI2.bq_set_C.get( b, [] ) ) - # self.n_reads_T[ b ] = self.n_reads_T.get( b, 0) + PRI2.n_reads_T.get( b, 0 ) - # self.n_reads_C[ b ] = self.n_reads_C.get( b, 0) + PRI2.n_reads_C.get( b, 0 ) - # self.n_reads[ b ] = self.n_reads.get( b, 0) + PRI2.n_reads.get( b, 0 ) - # return - - def __getstate__ ( self ): - return ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout, - self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips, - self.top1allele, self.top2allele, self.top12alleles_ratio, - self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor, - self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor, - self.heter_noAS_kc, self.heter_noAS_ki, - self.heter_AS_kc, self.heter_AS_ki, - self.heter_AS_alleleratio, - self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS, - self.GQ_heter_ASsig, - self.GQ, - self.GT, - self.type, - self.hasfermiinfor, - self.fermiNTs ) - - def __setstate__ ( self, state ): - ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout, - self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips, - self.top1allele, self.top2allele, self.top12alleles_ratio, - self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor, - self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor, - self.heter_noAS_kc, self.heter_noAS_ki, - self.heter_AS_kc, self.heter_AS_ki, - self.heter_AS_alleleratio, - self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS, - self.GQ_heter_ASsig, - self.GQ, - self.GT, - self.type, - self.hasfermiinfor, - self.fermiNTs ) = state - - cpdef bool filterflag ( self ): - return self.filterout - - cpdef void apply_GQ_cutoff ( self, int min_homo_GQ = 50, int min_heter_GQ = 100 ): - if self.filterout: - return - if self.type.startswith('homo') and self.GQ < min_homo_GQ: - self.filterout = True - elif self.type.startswith('heter') and self.GQ < min_heter_GQ: - self.filterout = True - return - - cpdef void apply_deltaBIC_cutoff ( self, float min_delta_BIC = 10 ): - if self.filterout: - return - if self.deltaBIC < min_delta_BIC: - self.filterout = True - return - - cpdef void add_T ( self, int read_index, bytes read_allele, int read_bq, int strand, bool tip, int Q=20 ): - """ Strand 0: plus, 1: minus - - Q is the quality cutoff. By default, only consider Q20 or read_bq > 20. - """ - if read_bq <= Q: - return - if not self.n_reads.has_key( read_allele ): - self.bq_set_T[read_allele] = [] - self.bq_set_C[read_allele] = [] - self.n_reads_T[read_allele] = 0 - self.n_reads_C[read_allele] = 0 - self.n_reads[read_allele] = 0 - self.n_strand[ 0 ][ read_allele ] = 0 - self.n_strand[ 1 ][ read_allele ] = 0 - self.n_tips[read_allele] = 0 - self.bq_set_T[read_allele].append( read_bq ) - self.n_reads_T[ read_allele ] += 1 - self.n_reads[ read_allele ] += 1 - self.n_strand[ strand ][ read_allele ] += 1 - if tip: self.n_tips[ read_allele ] += 1 - - cpdef void add_C ( self, int read_index, bytes read_allele, int read_bq, int strand, int Q=20 ): - if read_bq <= Q: - return - if not self.n_reads.has_key( read_allele ): - self.bq_set_T[read_allele] = [] - self.bq_set_C[read_allele] = [] - self.n_reads_T[read_allele] = 0 - self.n_reads_C[read_allele] = 0 - self.n_reads[read_allele] = 0 - self.n_strand[ 0 ][ read_allele ] = 0 - self.n_strand[ 1 ][ read_allele ] = 0 - self.n_tips[read_allele] = 0 - self.bq_set_C[read_allele].append( read_bq ) - self.n_reads_C[ read_allele ] += 1 - self.n_reads[ read_allele ] += 1 - #self.n_strand[ strand ][ read_allele ] += 1 - - cpdef int raw_read_depth ( self, str opt = "all" ): - if opt == "all": - return sum( self.n_reads.values() ) - elif opt == "T": - return sum( self.n_reads_T.values() ) - elif opt == "C": - return sum( self.n_reads_C.values() ) - else: - raise Exception( "opt should be either 'all', 'T' or 'C'." ) - - cpdef void update_top_alleles ( self, float min_top12alleles_ratio = 0.8, int min_altallele_count = 2, float max_allowed_ar = 0.95 ): - #cpdef update_top_alleles ( self, float min_top12alleles_ratio = 0.8 ): - """Identify top1 and top2 NT. the ratio of (top1+top2)/total - """ - cdef: - float r - - [self.top1allele, self.top2allele] = sorted(self.n_reads, key=self.n_reads_T.get, reverse=True)[:2] - - # if top2 allele count in ChIP is lower than - # min_altallele_count, or when allele ratio top1/(top1+top2) - # is larger than max_allowed_ar in ChIP, we won't consider - # this allele at all. we set values of top2 allele in - # dictionaries to [] and ignore top2 allele entirely. - - # max(self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top2allele ]) < min_altallele_count - #if self.ref_pos == 52608504: - # print self.ref_pos, self.n_reads_T[ self.top1allele ], self.n_reads_T[ self.top2allele ], self.n_reads_C[ self.top1allele ], self.n_reads_C[ self.top2allele ] - if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0: - self.filterout = True - return - - if (len(self.top1allele)==1 and len(self.top2allele)==1) and ( self.top2allele != self.ref_allele and ( ( self.n_reads_T[ self.top2allele ] - self.n_tips[ self.top2allele ] ) < min_altallele_count ) or \ - self.n_reads_T[ self.top1allele ]/(self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ]) > max_allowed_ar ): - self.bq_set_T[ self.top2allele ] = [] - self.bq_set_C[ self.top2allele ] = [] - self.n_reads_T[ self.top2allele ] = 0 - self.n_reads_C[ self.top2allele ] = 0 - self.n_reads[ self.top2allele ] = 0 - self.n_tips[ self.top2allele ] = 0 - if ( self.top1allele != self.ref_allele and ( self.n_reads_T[ self.top1allele ] - self.n_tips[ self.top1allele ] ) < min_altallele_count ): - self.bq_set_T[ self.top1allele ] = [] - self.bq_set_C[ self.top1allele ] = [] - self.n_reads_T[ self.top1allele ] = 0 - self.n_reads_C[ self.top1allele ] = 0 - self.n_reads[ self.top1allele ] = 0 - self.n_tips[ self.top1allele ] = 0 - - if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0: - self.filterout = True - return - - self.top12alleles_ratio = ( self.n_reads[ self.top1allele ] + self.n_reads[ self.top2allele ] ) / sum( self.n_reads.values() ) - if self.top12alleles_ratio < min_top12alleles_ratio: - self.filterout = True - return - - if self.top1allele == self.ref_allele and self.n_reads[ self.top2allele ] == 0: - # This means this position only contains top1allele which is the ref_allele. So the GT must be 0/0 - self.type = "homo_ref" - self.filterout = True - return - return - - cpdef void top12alleles ( self ): - print ( self.ref_pos, self.ref_allele) - print ("Top1allele",self.top1allele, "Treatment", self.bq_set_T[self.top1allele], "Control", self.bq_set_C[self.top1allele]) - print ("Top2allele",self.top2allele, "Treatment", self.bq_set_T[self.top2allele], "Control", self.bq_set_C[self.top2allele]) - - cpdef void call_GT ( self, float max_allowed_ar = 0.99 ): - """Require update_top_alleles being called. - """ - cdef: - np.ndarray[np.int32_t, ndim=1] top1_bq_T - np.ndarray[np.int32_t, ndim=1] top2_bq_T - np.ndarray[np.int32_t, ndim=1] top1_bq_C - np.ndarray[np.int32_t, ndim=1] top2_bq_C - int i - list top1_bq_T_l - list top2_bq_T_l - list top1_bq_C_l - list top2_bq_C_l - list tmp_mutation_type - bytes tmp_alt - - if self.filterout: - return - - top1_bq_T = np.array( self.bq_set_T[ self.top1allele ], dtype="int32" ) - top2_bq_T = np.array( self.bq_set_T[ self.top2allele ], dtype="int32" ) - top1_bq_C = np.array( self.bq_set_C[ self.top1allele ], dtype="int32" ) - top2_bq_C = np.array( self.bq_set_C[ self.top2allele ], dtype="int32" ) - (self.lnL_homo_major, self.BIC_homo_major) = CalModel_Homo( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C ) - (self.lnL_homo_minor, self.BIC_homo_minor) = CalModel_Homo( top2_bq_T, top2_bq_C, top1_bq_T, top1_bq_C ) - (self.lnL_heter_noAS, self.BIC_heter_noAS) = CalModel_Heter_noAS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C ) - (self.lnL_heter_AS, self.BIC_heter_AS) = CalModel_Heter_AS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C, max_allowed_ar ) - - #if self.ref_pos == 71078525: - # print "---" - # print len( top1_bq_T ), len( top1_bq_C ), len( top2_bq_T ), len( top2_bq_C ) - # print self.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS - # print self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS - - if self.top1allele != self.ref_allele and self.n_reads[ self.top2allele ] == 0: - # in this case, there is no top2 nt (or socalled minor - # allele) in either treatment or control, we should assume - # it's a 1/1 genotype. We will take 1/1 if it passes BIC - # test (deltaBIC >=2), and will skip this loci if it can't - # pass the test. - - self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major - if self.deltaBIC < 2: - self.filterout = True - return - - self.type = "homo" - self.GT = "1/1" - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = max( 0, self.PL_00 - self.PL_11 ) - self.PL_01 = max( 0, self.PL_01 - self.PL_11 ) - self.PL_11 = 0 - - self.GQ = min( self.PL_00, self.PL_01 ) - self.alt_allele = self.top1allele - else: - # assign GQ, GT, and type - if self.ref_allele != self.top1allele and self.BIC_homo_major + 2 <= self.BIC_homo_minor and self.BIC_homo_major + 2 <= self.BIC_heter_noAS and self.BIC_homo_major + 2 <= self.BIC_heter_AS: - self.type = "homo" - self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major - self.GT = "1/1" - self.alt_allele = self.top1allele - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_11 - self.PL_01 = self.PL_01 - self.PL_11 - self.PL_11 = 0 - - self.GQ = min( self.PL_00, self.PL_01 ) - - elif self.BIC_heter_noAS + 2 <= self.BIC_homo_major and self.BIC_heter_noAS + 2 <= self.BIC_homo_minor and self.BIC_heter_noAS + 2 <= self.BIC_heter_AS : - self.type = "heter_noAS" - self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_noAS - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * self.lnL_heter_noAS / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_01 - self.PL_11 = self.PL_11 - self.PL_01 - self.PL_01 = 0 - - self.GQ = min( self.PL_00, self.PL_11 ) - - elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor and self.BIC_heter_AS + 2 <= self.BIC_heter_noAS: - self.type = "heter_AS" - self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_AS - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * self.lnL_heter_AS / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_01 - self.PL_11 = self.PL_11 - self.PL_01 - self.PL_01 = 0 - - self.GQ = min( self.PL_00, self.PL_11 ) - - elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor: - # can't decide if it's noAS or AS - self.type = "heter_unsure" - self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - max( self.BIC_heter_AS, self.BIC_heter_noAS ) - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_01 - self.PL_11 = self.PL_11 - self.PL_01 - self.PL_01 = 0 - - self.GQ = min( self.PL_00, self.PL_11 ) - - elif self.ref_allele == self.top1allele and self.BIC_homo_major < self.BIC_homo_minor and self.BIC_homo_major < self.BIC_heter_noAS and self.BIC_homo_major < self.BIC_heter_AS: - self.type = "homo_ref" - # we do not calculate GQ if type is homo_ref - self.GT = "0/0" - self.filterout = True - else: - self.type="unsure" - self.filterout = True - - if self.type.startswith( "heter" ): - if self.ref_allele == self.top1allele: - self.alt_allele = self.top2allele - self.GT = "0/1" - elif self.ref_allele == self.top2allele: - self.alt_allele = self.top1allele - self.GT = "0/1" - else: - self.alt_allele = self.top1allele+b','+self.top2allele - self.GT = "1/2" - # strand bias filter, uncomment following if wish to debug - # calculate SB score - #print "calculate SB score for ", self.ref_pos, "a/b/c/d:", self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] - #SBscore = self.SB_score_ChIP( self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] ) - #SBscore = 0 - #if SBscore >= 1: - # print "disgard variant at", self.ref_pos, "type", self.type - # self.filterout = True - - # if self.ref_allele == self.top1allele: - # self.n_strand[ 0 ][ self.top1allele ] + self.n_strand[ 1 ][ self.top1allele ] - # if and self.n_strand[ 0 ][ self.top2allele ] == 0 or self.n_strand[ 1 ][ self.top2allele ] == 0: - # self.filterout = True - # print self.ref_pos - - - # self.deltaBIC = self.deltaBIC - - tmp_mutation_type = [] - for tmp_alt in self.alt_allele.split(b','): - if tmp_alt == b'*': - tmp_mutation_type.append( "Deletion" ) - elif len( tmp_alt ) > 1: - tmp_mutation_type.append( "Insertion" ) - else: - tmp_mutation_type.append( "SNV" ) - self.mutation_type = ",".join( tmp_mutation_type ) - return - - cdef float SB_score_ChIP( self, int a, int b, int c, int d ): - """ calculate score for filtering variants with strange strand biases. - - a: top1/major allele plus strand - b: top2/minor allele plus strand - c: top1/major allele minus strand - d: top2/minor allele minus strand - - Return a float value so that if this value >= 1, the variant will be filtered out. - """ - cdef: - float score - double p - double p1_l, p1_r - double p2_l, p2_r - double top2_sb, top1_sb - - if a+b == 0 or c+d == 0: - # if major allele and minor allele both bias to the same strand, allow it - return 0.0 - - # Rule: - # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it. - # or there is no significant bias (0.5) in top2 allele. - - #print a, b, c, d - p1_l = binomial_cdf( a, (a+c), 0.5, lower=True ) # alternative: less than 0.5 - p1_r = binomial_cdf( c, (a+c), 0.5, lower=True ) # greater than 0.5 - p2_l = binomial_cdf( b, (b+d), 0.5, lower=True ) # alternative: less than 0.5 - p2_r = binomial_cdf( d, (b+d), 0.5, lower=True ) # greater than 0.5 - #print p1_l, p1_r, p2_l, p2_r - - if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05): - # we reject loci where the significant biases are inconsistent between top1 and top2 alleles. - return 1.0 - else: - # if b<=2 and d=0 or b=0 and d<=2 -- highly possible FPs - #if ( b<=2 and d==0 or b==0 and d<=2 ): - # return 1 - # can't decide - return 0.0 - - cdef float SB_score_ATAC( self, int a, int b, int c, int d ): - """ calculate score for filtering variants with strange strand biases. - - ATAC-seq version - - a: top1/major allele plus strand - b: top2/minor allele plus strand - c: top1/major allele minus strand - d: top2/minor allele minus strand - - Return a float value so that if this value >= 1, the variant will be filtered out. - """ - cdef: - float score - double p - double p1_l, p1_r - double p2_l, p2_r - double top2_sb, top1_sb - - if a+b == 0 or c+d == 0: - # if major allele and minor allele both bias to the same strand, allow it - return 0.0 - - # Rule: - # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it. - # or there is no significant bias (0.5) in top2 allele. - - #print a, b, c, d - p1_l = binomial_cdf( a, (a+c), 0.5, lower=True ) # alternative: less than 0.5 - p1_r = binomial_cdf( c, (a+c), 0.5, lower=True ) # greater than 0.5 - p2_l = binomial_cdf( b, (b+d), 0.5, lower=True ) # alternative: less than 0.5 - p2_r = binomial_cdf( d, (b+d), 0.5, lower=True ) # greater than 0.5 - #print p1_l, p1_r, p2_l, p2_r - - if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05): - # we reject loci where the significant biases are inconsistent between top1 and top2 alleles. - return 1.0 - else: - # can't decide - return 0.0 - - cpdef str to_vcf ( self ): - """Output REF,ALT,QUAL,FILTER,INFO,FORMAT, SAMPLE columns. - """ - cdef: - str vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample - - vcf_ref = self.ref_allele.decode() - vcf_alt = self.alt_allele.decode() - vcf_qual = "%d" % self.GQ - vcf_filter = "." - vcf_info = (b"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \ - (self.type.encode(), self.mutation_type.encode(), sum( self.n_reads_T.values() ), sum( self.n_reads_C.values() ), - self.n_reads_T[self.top1allele], self.top1allele, self.n_reads_T[self.top2allele], self.top2allele, - self.n_reads_C[self.top1allele], self.top1allele, self.n_reads_C[self.top2allele], self.top2allele, - self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ], - self.deltaBIC, - self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS,self.BIC_heter_AS, - self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]) - )).decode() - vcf_format = "GT:DP:GQ:PL" - vcf_sample = "%s:%d:%d:%d,%d,%d" % (self.GT, self.raw_read_depth( opt = "all" ), self.GQ, self.PL_00, self.PL_01, self.PL_11) - return "\t".join( ( vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample ) ) - - cpdef toVariant ( self ): - cdef: - object v - v = Variant( - self.ref_allele.decode(), - self.alt_allele.decode(), - self.GQ, - '.', - self.type, - self.mutation_type, - self.top1allele.decode(), - self.top2allele.decode(), - sum( self.n_reads_T.values() ), - sum( self.n_reads_C.values() ), - self.n_reads_T[self.top1allele], - self.n_reads_T[self.top2allele], - self.n_reads_C[self.top1allele], - self.n_reads_C[self.top2allele], - self.n_strand[ 0 ][ self.top1allele ], - self.n_strand[ 0 ][ self.top2allele ], - self.n_strand[ 1 ][ self.top1allele ], - self.n_strand[ 1 ][ self.top2allele ], - self.deltaBIC, - self.BIC_homo_major, - self.BIC_homo_minor, - self.BIC_heter_noAS, - self.BIC_heter_AS, - self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]), - self.GT, - self.raw_read_depth( opt = "all" ), - self.PL_00, - self.PL_01, - self.PL_11 ) - return v diff --git a/MACS3/Signal/RACollection.pxd b/MACS3/Signal/RACollection.pxd new file mode 100644 index 00000000..ce14c068 --- /dev/null +++ b/MACS3/Signal/RACollection.pxd @@ -0,0 +1,61 @@ +cdef extern from "fml.h": + ctypedef struct bseq1_t: + int l_seq + char *seq + char *qual # NULL-terminated strings; length expected to match $l_seq + + ctypedef struct magopt_t: + int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth + float min_dratio1, max_bcov, max_bfrac + + ctypedef struct fml_opt_t: + int n_threads # number of threads; don't use multi-threading for small data sets + int ec_k # k-mer length for error correction; 0 for auto estimate + int min_cnt, max_cnt # both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] + int min_asm_ovlp # min overlap length during assembly + int min_merge_len # during assembly, don't explicitly merge an overlap if shorter than this value + magopt_t mag_opt # graph cleaning options + + ctypedef struct fml_ovlp_t: + unsigned int len_, from_, id_, to_ + #unit32_t from # $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end + #unsigned int id + #unsigned int to # $id: unitig number + + ctypedef struct fml_utg_t: + int len # length of sequence + int nsr # number of supporting reads + char *seq # unitig sequence + char *cov # cov[i]-33 gives per-base coverage at i + int n_ovlp[2] # number of 5'-end [0] and 3'-end [1] overlaps + fml_ovlp_t *ovlp # overlaps, of size n_ovlp[0]+n_ovlp[1] + + void fml_opt_init(fml_opt_t *opt) + fml_utg_t* fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg) + void fml_utg_destroy(int n_utg, fml_utg_t *utg) + void fml_utg_print(int n_utgs, const fml_utg_t *utg) + bseq1_t *bseq_read(const char *fn, int *n) + +# --- end of fermi-lite functions --- + +# --- smith-waterman alignment functions --- + +cdef extern from "swalign.h": + ctypedef struct seq_pair_t: + char *a + unsigned int alen + char *b + unsigned int blen + ctypedef struct align_t: + seq_pair_t *seqs + char *markup; + int start_a + int start_b + int end_a + int end_b + int matches + int gaps + double score + align_t *smith_waterman(seq_pair_t *problem) + void destroy_seq_pair(seq_pair_t *pair) + void destroy_align(align_t *ali) diff --git a/MACS3/Signal/RACollection.py b/MACS3/Signal/RACollection.py new file mode 100644 index 00000000..6f8ca04b --- /dev/null +++ b/MACS3/Signal/RACollection.py @@ -0,0 +1,906 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 16:26:57 Tao Liu> + +"""Module for ReadAlignment collection + +Copyright (c) 2024 Tao Liu + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). + +@status: experimental +@version: $Revision$ +@author: Tao Liu +@contact: vladimir.liu@gmail.com +""" +# ------------------------------------ +# python modules +# ------------------------------------ +from collections import Counter +from operator import itemgetter +from copy import copy + +from MACS3.Signal.ReadAlignment import ReadAlignment +from MACS3.Signal.PosReadsInfo import PosReadsInfo +from MACS3.Signal.UnitigRACollection import UnitigRAs, UnitigCollection +from MACS3.IO.PeakIO import PeakIO + +import cython +from cython.cimports.cpython import bool +# from cython.cimports.cpython.mem import PyMem_Malloc, PyMem_Free + +from cython.cimports.libc.stdlib import malloc, free + +# ------------------------------------ +# constants +# ------------------------------------ +__version__ = "Parser $Revision$" +__author__ = "Tao Liu " +__doc__ = "All Parser classes" + +__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. + +__CIGARCODE__ = "MIDNSHP=X" + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class RACollection: + """A collection of ReadAlignment objects and the corresponding + PeakIO. + + """ + chrom: bytes + peak: PeakIO # A PeakIO object + RAlists: list # contain ReadAlignment lists for treatment (0) and control (1) + left: cython.long # left position of peak + right: cython.long # right position of peak + length: cython.long # length of peak + RAs_left: cython.long # left position of all RAs in the collection + RAs_right: cython.long # right position of all RAs in the collection + sorted: bool # if sorted by lpos + peak_refseq: bytes # reference sequence in peak region b/w left and right + peak_refseq_ext: bytes # reference sequence in peak region with extension on both sides b/w RAs_left and RAs_right + + def __init__(self, chrom: bytes, peak: PeakIO, RAlist_T: list, RAlist_C: list = []): + """Create RACollection by: object taking: + + 1. peak: a PeakIO indicating: object the peak region. + + 2. RAlist: a python of: list ReadAlignment objects containing + all the reads overlapping the peak region. If no RAlist_C + given, it will be []. + + """ + if len(RAlist_T) == 0: + # no reads, return None + raise Exception("No reads from ChIP sample to construct RAcollection!") + self.chrom = chrom + self.peak = peak + # print(len(RAlist_T),"\n") + # print(len(RAlist_C),"\n") + self.RAlists = [RAlist_T, RAlist_C] + self.left = peak["start"] + self.right = peak["end"] + self.length = self.right - self.left + if RAlist_T: + self.RAs_left = RAlist_T[0]["lpos"] # initial assignment of RAs_left + self.RAs_right = RAlist_T[-1]["rpos"] # initial assignment of RAs_right + self.sort() # it will set self.sorted = True + else: + self.RAs_left = -1 + self.RAs_right = -1 + # check RAs_left and RAs_right + for ra in RAlist_T: + if ra["lpos"] < self.RAs_left: + self.RAs_left = ra["lpos"] + if ra["rpos"] > self.RAs_right: + self.RAs_right = ra["rpos"] + + for ra in RAlist_C: + if ra["lpos"] < self.RAs_left: + self.RAs_left = ra["lpos"] + if ra["rpos"] > self.RAs_right: + self.RAs_right = ra["rpos"] + (self.peak_refseq, self.peak_refseq_ext) = self.__get_peak_REFSEQ() + + def __getitem__(self, keyname): + if keyname == "chrom": + return self.chrom + elif keyname == "left": + return self.left + elif keyname == "right": + return self.right + elif keyname == "RAs_left": + return self.RAs_left + elif keyname == "RAs_right": + return self.RAs_right + elif keyname == "length": + return self.length + elif keyname == "count": + return len(self.RAlists[0]) + len(self.RAlists[1]) + elif keyname == "count_T": + return len(self.RAlists[0]) + elif keyname == "count_C": + return len(self.RAlists[1]) + elif keyname == "peak_refseq": + return self.peak_refseq + elif keyname == "peak_refseq_ext": + return self.peak_refseq_ext + else: + raise KeyError("Unavailable key:", keyname) + + def __getstate__(self): + #return {"chrom":self.chrom, "peak":self.peak, "RAlists":self.RAlists, + # "left":self.left, "right":self.right, "length": self.length, + # "RAs_left":self.RAs_left, "RAs_right":self.RAs_right} + return (self.chrom, self.peak, self.RAlists, self.left, self.right, + self.length, self.RAs_left, self.RAs_right, self.peak_refseq, + self.peak_refseq_ext) + + def __setstate__(self, state): + (self.chrom, self.peak, self.RAlists, self.left, self.right, + self.length, self.RAs_left, self.RAs_right, self.peak_refseq, + self.peak_refseq_ext) = state + + @cython.ccall + def sort(self): + """Sort RAs according to lpos. Should be used after realignment. + + """ + if self.RAlists[0]: + self.RAlists[0].sort(key=itemgetter("lpos")) + if self.RAlists[1]: + self.RAlists[1].sort(key=itemgetter("lpos")) + self.sorted = True + return + + @cython.ccall + def remove_outliers(self, percent: cython.int = 5): + """ Remove outliers with too many n_edits. The outliers with + n_edits in top p% will be removed. + + Default: remove top 5% of reads that have too many differences + with reference genome. + """ + n_edits_list: list + ralist: list + read: ReadAlignment # ReadAlignment object + highest_n_edits: cython.int + new_RAlist: list + i: cython.int + + n_edits_list = [] + for ralist in self.RAlists: + for read in ralist: + n_edits_list.append(read["n_edits"]) + n_edits_list.sort() + highest_n_edits = n_edits_list[int(len(n_edits_list) * (1 - percent * .01))] + + for i in (range(len(self.RAlists))): + new_RAlist = [] + for read in self.RAlists[i]: + if read["n_edits"] <= highest_n_edits: + new_RAlist.append(read) + self.RAlists[i] = new_RAlist + + return + + @cython.ccall + def n_edits_sum(self) -> cython.int: + """ + """ + n_edits_list: list + ralist: list + read: ReadAlignment + c: cython.int + # highest_n_edits: cython.int + + n_edits_list = [] + + for ralist in self.RAlists: + for read in ralist: + n_edits_list.append(read["n_edits"]) + + n_edits_list.sort() + # print (n_edits_list) + c = Counter(n_edits_list) + return c + # print(c) + + @cython.cfunc + def __get_peak_REFSEQ(self) -> tuple: + """Get the reference sequence within the peak region. + + """ + peak_refseq: bytearray + # i: cython.int + # prev_r: cython.long #remember the previous filled right end + start: cython.long + end: cython.long + # ind: cython.long + # ind_r: cython.long + # read: ReadAlignment + # read_refseq_ext: bytearray + # read_refseq: bytearray + + start = min(self.RAs_left, self.left) + end = max(self.RAs_right, self.right) + # print ("left",start,"right",end) + peak_refseq_ext = bytearray(b'N' * (end - start)) + + # for treatment. + peak_refseq_ext = self.__fill_refseq(peak_refseq_ext, + self.RAlists[0]) + # and control if available. + if self.RAlists[1]: + peak_refseq_ext = self.__fill_refseq(peak_refseq_ext, + self.RAlists[1]) + + # trim + peak_refseq = peak_refseq_ext[self.left - start: self.right - start] + return (bytes(peak_refseq), bytes(peak_refseq_ext)) + + @cython.cfunc + def __fill_refseq(self, + seq: bytearray, + ralist: list) -> bytearray: + """Fill refseq sequence of whole peak with refseq sequences of + each read in ralist. + + """ + prev_r: cython.long # previous right position of last + # filled + ind: cython.long + ind_r: cython.long + start: cython.long + # end: cython.long + read: ReadAlignment + read_refseq: bytearray + + start = min(self.RAs_left, self.left) + + # print(len(ralist),"\n") + prev_r = ralist[0]["lpos"] + + for i in range(len(ralist)): + read = ralist[i] + if read["lpos"] > prev_r: + read = ralist[i - 1] + read_refseq = read.get_REFSEQ() + ind = read["lpos"] - start + ind_r = ind + read["rpos"] - read["lpos"] + seq[ind: ind_r] = read_refseq + prev_r = read["rpos"] + # last + read = ralist[-1] + read_refseq = read.get_REFSEQ() + ind = read["lpos"] - start + ind_r = ind + read["rpos"] - read["lpos"] + seq[ind: ind_r] = read_refseq + return seq + + @cython.ccall + def get_PosReadsInfo_ref_pos(self, + ref_pos: cython.long, + ref_nt: bytes, + Q: cython.int = 20): + """Generate a PosReadsInfo for: object a given reference genome + position. + + Return a PosReadsInfo object. + + """ + s: bytearray + bq: bytearray + strand: cython.int + ra: ReadAlignment + # bq_list_t: list = [] + # bq_list_c: list = [] + i: cython.int + pos: cython.int + tip: bool + posreadsinfo_p: PosReadsInfo + + posreadsinfo_p = PosReadsInfo(ref_pos, ref_nt) + + # Treatment group + for i in range(len(self.RAlists[0])): + ra = self.RAlists[0][i] + if ra["lpos"] <= ref_pos and ra["rpos"] > ref_pos: + (s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos(ref_pos) + posreadsinfo_p.add_T(i, bytes(s), bq[0], strand, tip, Q=Q) + + # Control group + for i in range(len(self.RAlists[1])): + ra = self.RAlists[1][i] + if ra["lpos"] <= ref_pos and ra["rpos"] > ref_pos: + (s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos(ref_pos) + posreadsinfo_p.add_C(i, bytes(s), bq[0], strand, Q=Q) + + return posreadsinfo_p + + @cython.ccall + def get_FASTQ(self) -> bytearray: + """Get FASTQ file for all reads in RACollection. + + """ + ra: ReadAlignment + fastq_text: bytearray + + fastq_text = bytearray(b"") + + for ra in self.RAlists[0]: + fastq_text += ra.get_FASTQ() + + for ra in self.RAlists[1]: + fastq_text += ra.get_FASTQ() + + return fastq_text + + @cython.cfunc + def fermi_assemble(self, + fermiMinOverlap: cython.int, + opt_flag: cython.int = 0x80) -> list: + """A wrapper function to call Fermi unitig building functions. + """ + opt: cython.pointer(fml_opt_t) + # c: cython.int + n_seqs: cython.int + n_utg: cython.pointer(cython.int) + seqs: cython.pointer(bseq1_t) + utg: cython.pointer(fml_utg_t) + p: fml_utg_t + + # unitig_k: cython.int + # merge_min_len: cython.int + tmps: bytes + tmpq: bytes + # ec_k: cython.int = -1 + l: cython.long + cseq: cython.pointer(cython.char) + cqual: cython.pointer(cython.char) + i: cython.int + j: cython.int + # tmpunitig: bytes + # unitig: bytes # final unitig + unitig_list: list # contain of: list sequences in format: bytes + # n: cython.pointer(cython.int) + + n_seqs = len(self.RAlists[0]) + len(self.RAlists[1]) + + # prn_seqs: cython.int + + # prepare seq and qual, note, we only extract SEQ according to the + + # strand of reference sequence. + seqs = cython.cast(cython.pointer(bseq1_t), + malloc(n_seqs * cython.sizeof(bseq1_t))) # we rely on fermi-lite to free this mem + + i = 0 + for ra in self.RAlists[0]: + tmps = ra["SEQ"] + tmpq = ra["QUAL"] + l = len(tmps) + # we rely on fermi-lite to free this mem + cseq = cython.cast(cython.pointer(cython.char), + malloc((l+1)*cython.sizeof(cython.char))) + # we rely on fermi-lite to free this mem + cqual = cython.cast(cython.pointer(cython.char), + malloc((l+1)*cython.sizeof(cython.char))) + for j in range(l): + cseq[j] = tmps[j] + cqual[j] = tmpq[j] + 33 + cseq[l] = b'\x00' + cqual[l] = b'\x00' + + seqs[i].seq = cseq + seqs[i].qual = cqual + seqs[i].l_seq = len(tmps) + i += 1 + + # print "@",ra["readname"].decode() + # prcseq: cython.int.decode() + # print "+" + # prcqual: cython.int.decode() + + for ra in self.RAlists[1]: + tmps = ra["SEQ"] + tmpq = ra["QUAL"] + l = len(tmps) + # we rely on fermi-lite to free this mem + cseq = cython.cast(cython.pointer(cython.char), + malloc((l+1)*cython.sizeof(cython.char))) + # we rely on fermi-lite to free this mem + cqual = cython.cast(cython.pointer(cython.char), + malloc((l+1)*cython.sizeof(cython.char))) + for j in range(l): + cseq[j] = tmps[j] + cqual[j] = tmpq[j] + 33 + cseq[l] = b'\x00' + cqual[l] = b'\x00' + + seqs[i].seq = cseq + seqs[i].qual = cqual + seqs[i].l_seq = len(tmps) + i += 1 + # print "@",ra["readname"].decode() + # prcseq: cython.int.decode() + # print "+" + # prcqual: cython.int.decode() + + # if self.RAlists[1]: + # unitig_k=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*fermiOverlapMinRatio) + + # merge_min_len=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*0.5) + # else: + # unitig_k = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio) + + # merge_min_len=int(self.RAlists[0][0]["l"]*0.5) + #fermiMinOverlap = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio) + + # minimum overlap to merge, default 0 + # merge_min_len= max(25, int(self.RAlists[0][0]["l"]*0.5)) + # merge_min_len= int(self.RAlists[0][0]["l"]*0.5) + + # opt = cython.cast(cython.pointer(fml_opt_t), + # PyMem_Malloc(cython.sizeof(fml_opt_t))) + # n_utg = cython.cast(cython.pointer(cython.int), + # PyMem_Malloc(cython.sizeof(int))) + + opt = cython.cast(cython.pointer(fml_opt_t), + malloc(cython.sizeof(fml_opt_t))) + n_utg = cython.cast(cython.pointer(cython.int), + malloc(cython.sizeof(int))) + + fml_opt_init(opt) + # k-mer length for error correction (0 for auto; -1 to disable) + # opt.ec_k = 0 + + # min overlap length during initial assembly + opt.min_asm_ovlp = fermiMinOverlap + + # minimum length to merge, during assembly, don't explicitly merge an overlap if shorter than this value + # opt.min_merge_len = merge_min_len + + # there are more 'options' for mag clean: + # flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth, min_dratio1, max_bcov, max_bfrac + # min_elen (300) will be adjusted + # min_ensr (4), min_insr (3) will be computed + # min_merge_len (0) will be updated using opt.min_merge_len + + # We can adjust: flag (0x40|0x80), min_ovlp (0), min_dratio1 (0.7), max_bdiff (50), max_bdist (512), max_bvtx (64), trim_len (0), trim_depth (6), max_bcov (10.), max_bfrac (0.15) + + # 0x20: MAG_F_AGGRESSIVE pop variant bubbles + # 0x40: MAG_F_POPOPEN aggressive tip trimming + # 0x80: MAG_F_NO_SIMPL skip bubble simplification + opt.mag_opt.flag = opt_flag + + # mag_opt.min_ovlp + #opt.mag_opt.min_ovlp = fermiMinOverlap + + # drop an overlap if its length is below maxOvlpLen*FLOAT + #opt.mag_opt.min_dratio1 = 0.5 + + # retain a bubble if one side is longer than the other side by >INT-bp + #opt.mag_opt.max_bdiff = 10#merge_min_len + + # trim_len: + # trim_depth: Parameter used to trim the open end/tip. If trim_len == 0, do nothing + + # max_bdist: + # max_bvtx: Parameter used to simply bubble while 0x80 flag is set. + #opt.mag_opt.max_bdist = 1024 + #opt.mag_opt.max_bvtx = 128 + + # max_bcov: + # max_bfrac: Parameter used when aggressive bubble removal is not used. Bubble will be removed if its average coverage lower than max_bcov and fraction (cov1/(cov1+cov2)) is lower than max_bfrac + #opt.mag_opt.max_bcov = 10. + #opt.mag_opt.max_bfrac = 0.01 + + utg = fml_assemble(opt, n_seqs, seqs, n_utg) + # get results + unitig_list = [] + for i in range(n_utg[0]): + p = utg[i] + if (p.len < 0): + continue + # unitig = b'' + # for j in range(p.len): + # unitig += [b'A',b'C',b'G',b'T',b'N'][int(p.seq[j]) - 1] + # unitig_list.append(unitig) + unitig_list.append(p.seq) + + fml_utg_destroy(n_utg[0], utg) + + # PyMem_Free(opt) + # PyMem_Free(n_utg) + free(opt) + free(n_utg) + + return unitig_list + + @cython.cfunc + def align_unitig_to_REFSEQ(self, unitig_list: list) -> tuple: + """Note: we use smith waterman, but we don't use linear gap + penalty at this time. + + Also, if unitig is mapped to - strand, we will revcomp the + unitig. So the unitig_will: list be changed in this case. + """ + unitig: bytes + problem: seq_pair_t + results: cython.pointer(align_t) + # tmp: cython.pointer(cython.char) + target: bytes + reference: bytes + target_aln_f: bytes + target_aln_r: bytes + reference_aln_f: bytes + reference_aln_r: bytes + markup_aln_f: bytes + markup_aln_r: bytes + score_f: cython.double + score_r: cython.double + target_alns: list = [] + reference_alns: list = [] + markup_alns: list = [] + aln_scores: list = [] + i: cython.int + + reference = copy(self.peak_refseq_ext+b'\x00') + + for i in range(len(unitig_list)): + unitig = unitig_list[i] + target = copy(unitig + b'\x00') + # we use swalign.c for local alignment (without affine gap + # penalty). Will revise later. + problem.a = target + problem.alen = len(unitig) + problem.b = reference + problem.blen = len(self.peak_refseq_ext) + results = smith_waterman(cython.address(problem)) + target_aln_f = results.seqs.a + reference_aln_f = results.seqs.b + markup_aln_f = results.markup + score_f = results.score + free(results.seqs.a) + free(results.seqs.b) + free(results.markup) + free(results) + # end of local alignment + + # try reverse complement + target = copy(unitig[::-1] + b'\x00') + target = target.translate(__DNACOMPLEMENT__) + problem.a = target + problem.alen = len(unitig) + problem.b = reference + problem.blen = len(self.peak_refseq_ext) + results = smith_waterman(cython.address(problem)) + target_aln_r = results.seqs.a + reference_aln_r = results.seqs.b + markup_aln_r = results.markup + score_r = results.score + free(results.seqs.a) + free(results.seqs.b) + free(results.markup) + free(results) + # end of local alignment + + if score_f > score_r: + target_alns.append(target_aln_f) + reference_alns.append(reference_aln_f) + markup_alns.append(markup_aln_f) + aln_scores.append(score_f) + else: + target_alns.append(target_aln_r) + reference_alns.append(reference_aln_r) + markup_alns.append(markup_aln_r) + aln_scores.append(score_r) + # we will revcomp unitig + unitig = unitig[::-1] + unitig_list[i] = unitig.translate(__DNACOMPLEMENT__) + + return (target_alns, reference_alns, aln_scores, markup_alns) + + @cython.cfunc + def verify_alns(self, unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns, min_score_100: cython.float = 150): + """Remove aln/unitig if it contains too many edits in a small region + + default min score is 150, which means under 2/-3/-5/-2 scoring schema, there are 10 mismatches within 100bps region. + """ + i: cython.int + + for i in range(len(unitig_list)-1, -1, -1): + # pri: cython.int, aln_scores[i] + # prunitig_alns: cython.int[i] + # prmarkup_alns: cython.int[i] + # prreference_alns: cython.int[i] + if aln_scores[i] * 100 / len(markup_alns[i]) < min_score_100: + unitig_list.pop(i) + unitig_alns.pop(i) + reference_alns.pop(i) + aln_scores.pop(i) + markup_alns.pop(i) + return + + @cython.cfunc + def filter_unitig_with_bad_aln(self, unitig_list: list, + target_alns: list, + reference_alns: list, + gratio: float = 0.25) -> tuple: + """Remove unitigs that has too much gaps (both on target and + reference) during alignments. + + """ + pass + + @cython.cfunc + def remap_RAs_w_unitigs(self, unitig_list: list) -> list: + """Remap RAs to unitigs, requiring perfect match. + + Return RAlists_T, RAlists_C, unmapped_racollection. + """ + RAlists_T: list = [] # lists of of: list RAs of ChIP mapped to each unitig + RAlists_C: list = [] + unmapped_RAlist_T: list = [] # of: list RAs of ChIP unmappable to unitigs + unmapped_RAlist_C: list = [] + # RACollection unmapped_ra_collection + flag: cython.int = 0 + i: cython.int + tmp_ra: ReadAlignment + tmp_ra_seq: bytes + unitig: bytes + + for i in range(len(unitig_list)): + RAlists_T.append([]) # for each unitig, there is another of: list RAs + RAlists_C.append([]) + + # assign RAs to unitigs + + for tmp_ra in self.RAlists[0]: + flag = 0 + tmp_ra_seq = tmp_ra["SEQ"] + for i in range(len(unitig_list)): + unitig = unitig_list[i] + if tmp_ra_seq in unitig: + flag = 1 + RAlists_T[i].append(tmp_ra) + break + if flag == 0: + unmapped_RAlist_T.append(tmp_ra) + # print "unmapped:", tmp_ra["SEQ"] + + for tmp_ra in self.RAlists[1]: + flag = 0 + tmp_ra_seq = tmp_ra["SEQ"] + for i in range(len(unitig_list)): + unitig = unitig_list[i] + if tmp_ra_seq in unitig: + flag = 1 + RAlists_C[i].append(tmp_ra) + break + if flag == 0: + unmapped_RAlist_C.append(tmp_ra) + # print "unmapped:", tmp_ra["SEQ"] + + # if unmapped_RAlist_T: + # unmapped_ra_collection = RACollection(self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C) + return [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] + + @cython.cfunc + def add_to_unitig_list(self, unitig_list, unitigs_2nd) -> list: + """ + """ + i: cython.int + j: cython.int + flag: cython.int + u0: bytes + u1: bytes + new_unitig_list: list + + new_unitig_list = [] + + for i in range(len(unitigs_2nd)): + # initial value: can't be found in unitig_list + flag = 0 + u0 = unitigs_2nd[i] + for j in range(len(unitig_list)): + u1 = unitig_list[j] + if u1.find(u0) != -1: + flag = 1 + break + u1 = u1[::-1].translate(__DNACOMPLEMENT__) + if u1.find(u0) != -1: + flag = 1 + break + if not flag: + new_unitig_list.append(u0) + new_unitig_list.extend(unitig_list) + return new_unitig_list + + @cython.ccall + def build_unitig_collection(self, fermiMinOverlap): + """unitig_and: list tuple_alns are in the same order! + + return UnitigCollection object. + + """ + start: cython.long + end: cython.long + unitigs_2nd: list + # u: bytes + # target_alns: list + reference_alns: list + aln_scores: list + markup_alns: list + # target_alns_2nd: list + # reference_alns_2nd: list + # aln_scores_2nd: list + RAlists_T: list = [] # lists of of: list RAs of ChIP mapped to each unitig + RAlists_C: list = [] + unmapped_RAlist_T: list = [] + unmapped_RAlist_C: list = [] + # tmp_unitig_seq: bytes + tmp_reference_seq: bytes + tmp_unitig_aln: bytes + tmp_reference_aln: bytes + + i: cython.int + j: cython.int + left_padding_ref: cython.long + right_padding_ref: cython.long + left_padding_unitig: cython.long + right_padding_unitig: cython.long + ura_list: list = [] + unmapped_ra_collection: RACollection + # flag: cython.int = 0 + # n_unmapped: cython.int + n_unitigs_0: cython.int + n_unitigs_1: cython.int + + # first round of assembly + # print (" First round to assemble unitigs") + unitig_list = self.fermi_assemble(fermiMinOverlap, opt_flag=0x80) + if len(unitig_list) == 0: + return 0 + + n_unitigs_0 = -1 + n_unitigs_1 = len(unitig_list) + # print " # of Unitigs:", n_unitigs_1 + # print " Map reads to unitigs" + (unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ(unitig_list) + + self.verify_alns(unitig_list, + unitig_alns, + reference_alns, + aln_scores, + markup_alns) + if len(unitig_list) == 0: + # if stop here, it raises a flag that the region may + # contain too many mismapped reads, we return -1 + return -1 + # print (" # of Unitigs:", n_unitigs_1) + + # assign RAs to unitigs + [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] = self.remap_RAs_w_unitigs(unitig_list) + # prunmapped_ra_collection: cython.int.get_FASTQ().decode() + + # n_unmapped = len(unmapped_RAlist_T) + len(unmapped_RAlist_C) + + while len(unmapped_RAlist_T) > 0 and n_unitigs_1 != n_unitigs_0: + # if there are unmapped reads AND we can get more unitigs + # from last round of assembly, do assembly again + + # print (" # of RAs not mapped, will be assembled again:", n_unmapped) + n_unitigs_0 = n_unitigs_1 + # another round of assembly + unmapped_ra_collection = RACollection(self.chrom, + self.peak, + unmapped_RAlist_T, + unmapped_RAlist_C) + unitigs_2nd = unmapped_ra_collection.fermi_assemble(fermiMinOverlap, + opt_flag=0x80) + + if unitigs_2nd: + unitig_list = self.add_to_unitig_list(unitig_list, unitigs_2nd) + n_unitigs_1 = len(unitig_list) + # print " # of Unitigs:", n_unitigs_1 + # print " Map reads to unitigs" + (unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ(unitig_list) + self.verify_alns(unitig_list, + unitig_alns, + reference_alns, + aln_scores, + markup_alns) + [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] = self.remap_RAs_w_unitigs(unitig_list) + # n_unmapped = len(unmapped_RAlist_T) + len(unmapped_RAlist_C) + # else: + # for r in unmapped_RAlist_T: + # prr: cython.int.get_FASTQ().decode().lstrip() + + # print (" # of RAs not mapped, will be assembled again with 1/2 of fermiMinOverlap:", n_unmapped) + # another round of assembly + unmapped_ra_collection = RACollection(self.chrom, + self.peak, + unmapped_RAlist_T, + unmapped_RAlist_C) + unitigs_2nd = unmapped_ra_collection.fermi_assemble(fermiMinOverlap/2, + opt_flag=0x80) + + if unitigs_2nd: + unitig_list = self.add_to_unitig_list(unitig_list, unitigs_2nd) + n_unitigs_1 = len(unitig_list) + # print " # of Unitigs:", n_unitigs_1 + # print " Map reads to unitigs" + (unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ(unitig_list) + self.verify_alns(unitig_list, + unitig_alns, + reference_alns, + aln_scores, + markup_alns) + [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] = self.remap_RAs_w_unitigs(unitig_list) + # n_unmapped = len(unmapped_RAlist_T) + len(unmapped_RAlist_C) + # else: + # for r in unmapped_RAlist_T: + # prr: cython.int.get_FASTQ().decode().lstrip() + if len(unitig_list) == 0: + raise Exception("Shouldn't reach here") + # print (" # of Unitigs:", n_unitigs_1) + + if len(unitig_list) == 0: + return None + # print (" Final round: # of Unitigs:", len(unitig_list)) + # print (" Final round: # of RAs not mapped:", n_unmapped) + + start = min(self.left, self.RAs_left) + end = max(self.right, self.RAs_right) + + # create UnitigCollection + for i in range(len(unitig_list)): + #b'---------------------------AAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAA--------------' + #b'GATATAAATAGGATGTTATGAGTTTTCAAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAAAATACAAATGTATT' + tmp_unitig_aln = unitig_alns[i] + tmp_reference_aln = reference_alns[i] + # tmp_unitig_seq = tmp_unitig_aln.replace(b'-',b'') + tmp_reference_seq = tmp_reference_aln.replace(b'-', b'') + + # prtmp_unitig_aln: cython.int + # prtmp_reference_aln: cython.int + # prtmp_unitig_seq: cython.int + # prtmp_reference_aln: cython.int + + # find the position on self.peak_refseq_ext + left_padding_ref = self.peak_refseq_ext.find(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from left + right_padding_ref = len(self.peak_refseq_ext) - left_padding_ref - len(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from right + + # now, decide the lpos and rpos on reference of this unitig + # first, trim left padding '-' + left_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.lstrip(b'-')) + right_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.rstrip(b'-')) + + tmp_lpos = start + left_padding_ref + tmp_rpos = end - right_padding_ref + + for j in range(left_padding_unitig): + if tmp_reference_aln[j] != b'-': + tmp_lpos += 1 + for j in range(1, right_padding_unitig + 1): + if tmp_reference_aln[-j] != b'-': + tmp_rpos -= 1 + + tmp_unitig_aln = tmp_unitig_aln[left_padding_unitig:(len(tmp_unitig_aln)-right_padding_unitig)] + tmp_reference_aln = tmp_reference_aln[left_padding_unitig:(len(tmp_reference_aln)-right_padding_unitig)] + + ura_list.append(UnitigRAs(self.chrom, tmp_lpos, tmp_rpos, tmp_unitig_aln, tmp_reference_aln, [RAlists_T[i], RAlists_C[i]])) + + return UnitigCollection(self.chrom, self.peak, ura_list) diff --git a/MACS3/Signal/RACollection.pyx b/MACS3/Signal/RACollection.pyx deleted file mode 100644 index c0650dee..00000000 --- a/MACS3/Signal/RACollection.pyx +++ /dev/null @@ -1,898 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2021-03-10 23:39:52 Tao Liu> - -"""Module for SAPPER BAMParser class - -Copyright (c) 2017 Tao Liu - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). - -@status: experimental -@version: $Revision$ -@author: Tao Liu -@contact: tliu4@buffalo.edu -""" -# ------------------------------------ -# python modules -# ------------------------------------ -import struct -from collections import Counter -from operator import itemgetter -from copy import copy - -from MACS3.Signal.ReadAlignment import ReadAlignment -from MACS3.Signal.PosReadsInfo import PosReadsInfo -from MACS3.Signal.UnitigRACollection import UnitigRAs, UnitigCollection -from MACS3.IO.PeakIO import PeakIO - -from cpython cimport bool -from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free - -import numpy as np -cimport numpy as np -from numpy cimport uint32_t, uint64_t, int32_t, int64_t - -from libc.stdlib cimport malloc, free, realloc - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - #void *malloc(size_t size) - void *calloc(size_t n, size_t size) - #void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) - - -# --- fermi-lite functions --- -#define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default) -#define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default) -#define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default) - -cdef extern from "fml.h": - ctypedef struct bseq1_t: - int32_t l_seq - char *seq - char *qual # NULL-terminated strings; length expected to match $l_seq - - ctypedef struct magopt_t: - int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth - float min_dratio1, max_bcov, max_bfrac - - ctypedef struct fml_opt_t: - int n_threads # number of threads; don't use multi-threading for small data sets - int ec_k # k-mer length for error correction; 0 for auto estimate - int min_cnt, max_cnt # both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] - int min_asm_ovlp # min overlap length during assembly - int min_merge_len # during assembly, don't explicitly merge an overlap if shorter than this value - magopt_t mag_opt # graph cleaning options - - ctypedef struct fml_ovlp_t: - uint32_t len_, from_, id_, to_ - #unit32_t from # $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end - #uint32_t id - #uint32_t to # $id: unitig number - - ctypedef struct fml_utg_t: - int32_t len # length of sequence - int32_t nsr # number of supporting reads - char *seq # unitig sequence - char *cov # cov[i]-33 gives per-base coverage at i - int n_ovlp[2] # number of 5'-end [0] and 3'-end [1] overlaps - fml_ovlp_t *ovlp # overlaps, of size n_ovlp[0]+n_ovlp[1] - - void fml_opt_init(fml_opt_t *opt) - fml_utg_t* fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg) - void fml_utg_destroy(int n_utg, fml_utg_t *utg) - void fml_utg_print(int n_utgs, const fml_utg_t *utg) - bseq1_t *bseq_read(const char *fn, int *n) - -# --- end of fermi-lite functions --- - -# --- smith-waterman alignment functions --- - -cdef extern from "swalign.h": - ctypedef struct seq_pair_t: - char *a - unsigned int alen - char *b - unsigned int blen - ctypedef struct align_t: - seq_pair_t *seqs - char *markup; - int start_a - int start_b - int end_a - int end_b - int matches - int gaps - double score - align_t *smith_waterman(seq_pair_t *problem) - void destroy_seq_pair(seq_pair_t *pair) - void destroy_align(align_t *ali) - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "Parser $Revision$" -__author__ = "Tao Liu " -__doc__ = "All Parser classes" - -__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. - -__CIGARCODE__ = "MIDNSHP=X" - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class RACollection: - """A collection of ReadAlignment objects and the corresponding - PeakIO. - - """ - cdef: - bytes chrom - object peak # A PeakIO object - list RAlists # contain ReadAlignment lists for treatment (0) and control (1) - long left # left position of peak - long right # right position of peak - long length # length of peak - long RAs_left # left position of all RAs in the collection - long RAs_right # right position of all RAs in the collection - bool sorted # if sorted by lpos - bytes peak_refseq # reference sequence in peak region b/w left and right - bytes peak_refseq_ext # reference sequence in peak region with extension on both sides b/w RAs_left and RAs_right - - def __init__ ( self, chrom, peak, RAlist_T, RAlist_C=[] ): - """Create RACollection object by taking: - - 1. peak: a PeakIO object indicating the peak region. - - 2. RAlist: a python list of ReadAlignment objects containing - all the reads overlapping the peak region. If no RAlist_C - given, it will be []. - - """ - if len(RAlist_T) == 0: - # no reads, return None - raise Exception("No reads from ChIP sample to construct RAcollection!") - self.chrom = chrom - self.peak = peak - #print(len(RAlist_T),"\n") - #print(len(RAlist_C),"\n") - self.RAlists = [ RAlist_T, RAlist_C ] - self.left = peak["start"] - self.right = peak["end"] - self.length = self.right - self.left - if RAlist_T: - self.RAs_left = RAlist_T[0]["lpos"] # initial assignment of RAs_left - self.RAs_right = RAlist_T[-1]["rpos"] # initial assignment of RAs_right - self.sort() # it will set self.sorted = True - else: - self.RAs_left = -1 - self.RAs_right = -1 - # check RAs_left and RAs_right - for ra in RAlist_T: - if ra[ "lpos" ] < self.RAs_left: - self.RAs_left = ra[ "lpos" ] - if ra[ "rpos" ] > self.RAs_right: - self.RAs_right = ra[ "rpos" ] - - for ra in RAlist_C: - if ra[ "lpos" ] < self.RAs_left: - self.RAs_left = ra[ "lpos" ] - if ra[ "rpos" ] > self.RAs_right: - self.RAs_right = ra[ "rpos" ] - (self.peak_refseq, self.peak_refseq_ext) = self.__get_peak_REFSEQ() - - def __getitem__ ( self, keyname ): - if keyname == "chrom": - return self.chrom - elif keyname == "left": - return self.left - elif keyname == "right": - return self.right - elif keyname == "RAs_left": - return self.RAs_left - elif keyname == "RAs_right": - return self.RAs_right - elif keyname == "length": - return self.length - elif keyname == "count": - return len( self.RAlists[ 0 ] )+ len( self.RAlists[ 1 ] ) - elif keyname == "count_T": - return len( self.RAlists[ 0 ] ) - elif keyname == "count_C": - return len( self.RAlists[ 1 ] ) - elif keyname == "peak_refseq": - return self.peak_refseq - elif keyname == "peak_refseq_ext": - return self.peak_refseq_ext - else: - raise KeyError("Unavailable key:", keyname) - - def __getstate__ ( self ): - #return {"chrom":self.chrom, "peak":self.peak, "RAlists":self.RAlists, - # "left":self.left, "right":self.right, "length": self.length, - # "RAs_left":self.RAs_left, "RAs_right":self.RAs_right} - return (self.chrom, self.peak, self.RAlists, self.left, self.right, self.length, self.RAs_left, self.RAs_right, self.peak_refseq, self.peak_refseq_ext) - - def __setstate__ ( self, state ): - (self.chrom, self.peak, self.RAlists, self.left, self.right, self.length, self.RAs_left, self.RAs_right, self.peak_refseq, self.peak_refseq_ext) = state - - cpdef sort ( self ): - """Sort RAs according to lpos. Should be used after realignment. - - """ - if self.RAlists[ 0 ]: - self.RAlists[ 0 ].sort(key=itemgetter("lpos")) - if self.RAlists[ 1 ]: - self.RAlists[ 1 ].sort(key=itemgetter("lpos")) - self.sorted = True - return - - cpdef remove_outliers ( self, int percent = 5 ): - """ Remove outliers with too many n_edits. The outliers with - n_edits in top p% will be removed. - - Default: remove top 5% of reads that have too many differences - with reference genome. - """ - cdef: - list n_edits_list - object read # ReadAlignment object - int highest_n_edits - list new_RAlist - int i - - n_edits_list = [] - for ralist in self.RAlists: - for read in ralist: - n_edits_list.append( read["n_edits"] ) - n_edits_list.sort() - highest_n_edits = n_edits_list[ int( len( n_edits_list ) * (1 - percent * .01) ) ] - - for i in ( range(len(self.RAlists)) ): - new_RAlist = [] - for read in self.RAlists[ i ]: - if read["n_edits"] <= highest_n_edits: - new_RAlist.append( read ) - self.RAlists[ i ] = new_RAlist - - return - - cpdef n_edits_sum ( self ): - """ - """ - cdef: - list n_edits_list - object read - int highest_n_edits - - n_edits_list = [] - - for ralist in self.RAlists: - for read in ralist: - n_edits_list.append( read["n_edits"] ) - - n_edits_list.sort() - # print ( n_edits_list ) - c = Counter( n_edits_list ) - #print( c ) - - cdef tuple __get_peak_REFSEQ ( self ): - """Get the reference sequence within the peak region. - - """ - cdef: - bytearray peak_refseq - int i - long prev_r #remember the previous filled right end - long start - long end - long ind, ind_r - object read - bytearray read_refseq_ext - bytearray read_refseq - - start = min( self.RAs_left, self.left ) - end = max( self.RAs_right, self.right ) - #print ("left",start,"right",end) - peak_refseq_ext = bytearray( b'N' * ( end - start ) ) - - # for treatment. - peak_refseq_ext = self.__fill_refseq ( peak_refseq_ext, self.RAlists[0] ) - # and control if available. - if self.RAlists[1]: - peak_refseq_ext = self.__fill_refseq ( peak_refseq_ext, self.RAlists[1] ) - - # trim - peak_refseq = peak_refseq_ext[ self.left - start: self.right - start ] - return ( bytes( peak_refseq ), bytes( peak_refseq_ext ) ) - - cdef bytearray __fill_refseq ( self, bytearray seq, list ralist ): - """Fill refseq sequence of whole peak with refseq sequences of - each read in ralist. - - """ - cdef: - long prev_r # previous right position of last - # filled - long ind, ind_r - long start, end - object read - bytearray read_refseq - - start = min( self.RAs_left, self.left ) - - #print(len(ralist),"\n") - prev_r = ralist[0]["lpos"] - - for i in range( len( ralist ) ): - read = ralist[ i ] - if read[ "lpos" ] > prev_r: - read = ralist[ i - 1 ] - read_refseq = read.get_REFSEQ() - ind = read["lpos"] - start - ind_r = ind + read["rpos"] - read["lpos"] - seq[ ind: ind_r ] = read_refseq - prev_r = read[ "rpos" ] - # last - read = ralist[ -1 ] - read_refseq = read.get_REFSEQ() - ind = read["lpos"] - start - ind_r = ind + read["rpos"] - read["lpos"] - seq[ ind: ind_r ] = read_refseq - return seq - - cpdef get_PosReadsInfo_ref_pos ( self, long ref_pos, bytes ref_nt, int Q=20 ): - """Generate a PosReadsInfo object for a given reference genome - position. - - Return a PosReadsInfo object. - - """ - cdef: - bytearray s - bytearray bq - int strand - object ra - list bq_list_t = [] - list bq_list_c = [] - int i - int pos - bool tip - - posreadsinfo_p = PosReadsInfo( ref_pos, ref_nt ) - - #Treatment group - for i in range( len( self.RAlists[ 0 ] ) ): - ra = self.RAlists[ 0 ][ i ] - if ra[ "lpos" ] <= ref_pos and ra[ "rpos" ] > ref_pos: - ( s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos( ref_pos ) - posreadsinfo_p.add_T( i, bytes( s ), bq[ 0 ], strand, tip, Q=Q ) - - #Control group - for i in range( len( self.RAlists[ 1 ] ) ): - ra = self.RAlists[ 1 ][ i ] - if ra[ "lpos" ] <= ref_pos and ra[ "rpos" ] > ref_pos: - ( s, bq, strand, tip, pos ) = ra.get_variant_bq_by_ref_pos( ref_pos ) - posreadsinfo_p.add_C( i, bytes( s ), bq[ 0 ], strand, Q=Q ) - - return posreadsinfo_p - - cpdef bytearray get_FASTQ ( self ): - """Get FASTQ file for all reads in RACollection. - - """ - cdef: - object ra - bytearray fastq_text - - fastq_text = bytearray(b"") - - for ra in self.RAlists[0]: - fastq_text += ra.get_FASTQ() - - for ra in self.RAlists[1]: - fastq_text += ra.get_FASTQ() - - return fastq_text - - cdef list fermi_assemble( self, int fermiMinOverlap, int opt_flag = 0x80 ): - """A wrapper function to call Fermi unitig building functions. - """ - cdef: - fml_opt_t *opt - int c, n_seqs - int * n_utg - bseq1_t *seqs - fml_utg_t *utg - fml_utg_t p - - int unitig_k, merge_min_len - bytes tmps - bytes tmpq - int ec_k = -1 - int64_t l - char * cseq - char * cqual - int i, j - bytes tmpunitig - bytes unitig #final unitig - list unitig_list # contain list of sequences in bytes format - int * n - - n_seqs = len(self.RAlists[0]) + len(self.RAlists[1]) - - # print n_seqs - - # prepare seq and qual, note, we only extract SEQ according to the + - # strand of reference sequence. - seqs = malloc( n_seqs * sizeof(bseq1_t) ) # we rely on fermi-lite to free this mem - - i = 0 - for ra in self.RAlists[0]: - tmps = ra["SEQ"] - tmpq = ra["QUAL"] - l = len(tmps) - cseq = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - cqual = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - for j in range(l): - cseq[ j ] = tmps[ j ] - cqual[ j ] = tmpq[ j ] + 33 - cseq[ l ] = b'\x00' - cqual[ l ]= b'\x00' - - seqs[ i ].seq = cseq - seqs[ i ].qual = cqual - seqs[ i ].l_seq = len(tmps) - i += 1 - - # print "@",ra["readname"].decode() - # print cseq.decode() - # print "+" - # print cqual.decode() - - for ra in self.RAlists[1]: - tmps = ra["SEQ"] - tmpq = ra["QUAL"] - l = len(tmps) - cseq = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - cqual = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - for j in range(l): - cseq[ j ] = tmps[ j ] - cqual[ j ] = tmpq[ j ] + 33 - cseq[ l ] = b'\x00' - cqual[ l ]= b'\x00' - - seqs[ i ].seq = cseq - seqs[ i ].qual = cqual - seqs[ i ].l_seq = len(tmps) - i += 1 - # print "@",ra["readname"].decode() - # print cseq.decode() - # print "+" - # print cqual.decode() - - # if self.RAlists[1]: - # unitig_k=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*fermiOverlapMinRatio) - - # merge_min_len=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*0.5) - # else: - # unitig_k = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio) - - # merge_min_len=int(self.RAlists[0][0]["l"]*0.5) - #fermiMinOverlap = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio) - - # minimum overlap to merge, default 0 - # merge_min_len= max( 25, int(self.RAlists[0][0]["l"]*0.5) ) - # merge_min_len= int(self.RAlists[0][0]["l"]*0.5) - - opt = PyMem_Malloc( sizeof(fml_opt_t) ) - n_utg = PyMem_Malloc( sizeof(int) ) - - fml_opt_init(opt) - # k-mer length for error correction (0 for auto; -1 to disable) - #opt.ec_k = 0 - - # min overlap length during initial assembly - opt.min_asm_ovlp = fermiMinOverlap - - # minimum length to merge, during assembly, don't explicitly merge an overlap if shorter than this value - # opt.min_merge_len = merge_min_len - - # there are more 'options' for mag clean: - # flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth, min_dratio1, max_bcov, max_bfrac - # min_elen (300) will be adjusted - # min_ensr (4), min_insr (3) will be computed - # min_merge_len (0) will be updated using opt.min_merge_len - - # We can adjust: flag (0x40|0x80), min_ovlp (0), min_dratio1 (0.7), max_bdiff (50), max_bdist (512), max_bvtx (64), trim_len (0), trim_depth (6), max_bcov (10.), max_bfrac (0.15) - - # 0x20: MAG_F_AGGRESSIVE pop variant bubbles - # 0x40: MAG_F_POPOPEN aggressive tip trimming - # 0x80: MAG_F_NO_SIMPL skip bubble simplification - opt.mag_opt.flag = opt_flag - - # mag_opt.min_ovlp - #opt.mag_opt.min_ovlp = fermiMinOverlap - - # drop an overlap if its length is below maxOvlpLen*FLOAT - #opt.mag_opt.min_dratio1 = 0.5 - - # retain a bubble if one side is longer than the other side by >INT-bp - #opt.mag_opt.max_bdiff = 10#merge_min_len - - # trim_len: - # trim_depth: Parameter used to trim the open end/tip. If trim_len == 0, do nothing - - # max_bdist: - # max_bvtx: Parameter used to simply bubble while 0x80 flag is set. - #opt.mag_opt.max_bdist = 1024 - #opt.mag_opt.max_bvtx = 128 - - # max_bcov: - # max_bfrac: Parameter used when aggressive bubble removal is not used. Bubble will be removed if its average coverage lower than max_bcov and fraction (cov1/(cov1+cov2)) is lower than max_bfrac - #opt.mag_opt.max_bcov = 10. - #opt.mag_opt.max_bfrac = 0.01 - - utg = fml_assemble(opt, n_seqs, seqs, n_utg) - # get results - unitig_list = [] - for i in range( n_utg[0] ): - p = utg[ i ] - if (p.len < 0): - continue - #unitig = b'' - #for j in range( p.len ): - # unitig += [b'A',b'C',b'G',b'T',b'N'][int(p.seq[j]) - 1] - #unitig_list.append( unitig ) - unitig_list.append( p.seq ) - - fml_utg_destroy(n_utg[0], utg) - - PyMem_Free( opt ) - PyMem_Free( n_utg ) - - return unitig_list - - cdef tuple align_unitig_to_REFSEQ ( self, list unitig_list ): - """Note: we use smith waterman, but we don't use linear gap - penalty at this time. - - Also, if unitig is mapped to - strand, we will revcomp the - unitig. So the unitig_list will be changed in this case. - """ - - cdef: - bytes unitig - seq_pair_t problem - align_t * results - char * tmp - bytes target - bytes reference - bytes target_aln_f, target_aln_r - bytes reference_aln_f, reference_aln_r - bytes markup_aln_f, markup_aln_r - double score_f, score_r - list target_alns = [] - list reference_alns = [] - list markup_alns = [] - list aln_scores = [] - int i - - reference = copy(self.peak_refseq_ext+b'\x00') - - for i in range( len(unitig_list) ): - unitig = unitig_list[ i ] - target = copy(unitig + b'\x00') - # we use swalign.c for local alignment (without affine gap - # penalty). Will revise later. - problem.a = target - problem.alen = len( unitig ) - problem.b = reference - problem.blen = len( self.peak_refseq_ext ) - results = smith_waterman( &problem ) - target_aln_f = results.seqs.a - reference_aln_f = results.seqs.b - markup_aln_f = results.markup - score_f = results.score - free( results.seqs.a ) - free( results.seqs.b ) - free( results.markup ) - free( results ) - # end of local alignment - - # try reverse complement - target = copy(unitig[::-1] + b'\x00') - target = target.translate( __DNACOMPLEMENT__ ) - problem.a = target - problem.alen = len( unitig ) - problem.b = reference - problem.blen = len( self.peak_refseq_ext ) - results = smith_waterman( &problem ) - target_aln_r = results.seqs.a - reference_aln_r = results.seqs.b - markup_aln_r = results.markup - score_r = results.score - free( results.seqs.a ) - free( results.seqs.b ) - free( results.markup ) - free( results ) - # end of local alignment - - if score_f > score_r: - target_alns.append( target_aln_f ) - reference_alns.append( reference_aln_f ) - markup_alns.append( markup_aln_f ) - aln_scores.append( score_f ) - else: - target_alns.append( target_aln_r ) - reference_alns.append( reference_aln_r ) - markup_alns.append( markup_aln_r ) - aln_scores.append( score_r ) - # we will revcomp unitig - unitig = unitig[::-1] - unitig_list[ i ] = unitig.translate( __DNACOMPLEMENT__ ) - - return ( target_alns, reference_alns, aln_scores, markup_alns ) - - cdef verify_alns( self, unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns, float min_score_100 = 150 ): - """Remove aln/unitig if it contains too many edits in a small region - - default min score is 150, which means under 2/-3/-5/-2 scoring schema, there are 10 mismatches within 100bps region. - """ - cdef: - int i - for i in range( len( unitig_list )-1, -1, -1 ): - #print i, aln_scores[ i ] - #print unitig_alns[ i ] - #print markup_alns[ i ] - #print reference_alns[ i ] - if aln_scores[ i ] * 100 /len( markup_alns[ i ] ) < min_score_100: - unitig_list.pop( i ) - unitig_alns.pop( i ) - reference_alns.pop( i ) - aln_scores.pop( i ) - markup_alns.pop( i ) - return - - - cdef tuple filter_unitig_with_bad_aln ( self, list unitig_list, list target_alns, list reference_alns, float gratio = 0.25 ): - """Remove unitigs that has too much gaps (both on target and reference) during alignments. - """ - pass - - cdef list remap_RAs_w_unitigs ( self, list unitig_list ): - """Remap RAs to unitigs, requiring perfect match. - - Return RAlists_T, RAlists_C, unmapped_racollection. - """ - cdef: - list RAlists_T = [] # lists of list of RAs of ChIP mapped to each unitig - list RAlists_C = [] - list unmapped_RAlist_T = [] # list of RAs of ChIP unmappable to unitigs - list unmapped_RAlist_C = [] - #RACollection unmapped_ra_collection - int flag = 0 - int i - object tmp_ra - bytes tmp_ra_seq, unitig - - for i in range( len(unitig_list) ): - RAlists_T.append([]) # for each unitig, there is another list of RAs - RAlists_C.append([]) - - # assign RAs to unitigs - - for tmp_ra in self.RAlists[0]: - flag = 0 - tmp_ra_seq = tmp_ra["SEQ"] - for i in range( len(unitig_list) ): - unitig = unitig_list[ i ] - if tmp_ra_seq in unitig: - flag = 1 - RAlists_T[ i ].append( tmp_ra ) - break - if flag == 0: - unmapped_RAlist_T.append( tmp_ra ) - #print "unmapped:", tmp_ra["SEQ"] - - for tmp_ra in self.RAlists[1]: - flag = 0 - tmp_ra_seq = tmp_ra["SEQ"] - for i in range( len(unitig_list) ): - unitig = unitig_list[ i ] - if tmp_ra_seq in unitig: - flag = 1 - RAlists_C[ i ].append( tmp_ra ) - break - if flag == 0: - unmapped_RAlist_C.append( tmp_ra ) - #print "unmapped:", tmp_ra["SEQ"] - - #if unmapped_RAlist_T: - #unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C ) - return [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] - - cdef list add_to_unitig_list ( self, unitig_list, unitigs_2nd ): - """ - """ - cdef: - int i,j - int flag - bytes u0, u1 - list new_unitig_list - - new_unitig_list = [] - - for i in range( len(unitigs_2nd) ): - flag = 0 # initial value: can't be found in unitig_list - u0 = unitigs_2nd[ i ] - for j in range( len( unitig_list ) ): - u1 = unitig_list[ j ] - if u1.find( u0 ) != -1: - flag = 1 - break - u1 = u1[::-1].translate(__DNACOMPLEMENT__) - if u1.find( u0 ) != -1: - flag = 1 - break - if not flag: - new_unitig_list.append( u0 ) - new_unitig_list.extend( unitig_list ) - return new_unitig_list - - - cpdef object build_unitig_collection ( self, fermiMinOverlap ): - """unitig_list and tuple_alns are in the same order! - - return UnitigCollection object. - - """ - cdef: - long start, end - list unitigs_2nd - bytes u - list target_alns, reference_alns, aln_scores, markup_alns - list target_alns_2nd, reference_alns_2nd, aln_scores_2nd - list RAlists_T = [] # lists of list of RAs of ChIP mapped to each unitig - list RAlists_C = [] - list unmapped_RAlist_T = [] - list unmapped_RAlist_C = [] - bytes tmp_unitig_seq, tmp_reference_seq - bytes tmp_unitig_aln, tmp_reference_aln, - int i, j - long left_padding_ref, right_padding_ref - long left_padding_unitig, right_padding_unitig - list ura_list = [] - RACollection unmapped_ra_collection - int flag = 0 - int n_unmapped, n_unitigs_0, n_unitigs_1 - - # first round of assembly - # print (" First round to assemble unitigs") - unitig_list = self.fermi_assemble( fermiMinOverlap, opt_flag = 0x80 ) - if len(unitig_list) == 0: - return 0 - - n_unitigs_0 = -1 - n_unitigs_1 = len( unitig_list ) - #print " # of Unitigs:", n_unitigs_1 - #print " Map reads to unitigs" - ( unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ( unitig_list ) - - self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns ) - if len(unitig_list) == 0: - # if stop here, it raises a flag that the region may contain too many mismapped reads, we return -1 - return -1 - # print (" # of Unitigs:", n_unitigs_1) - - # assign RAs to unitigs - [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list ) - #print unmapped_ra_collection.get_FASTQ().decode() - - n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C ) - - while len( unmapped_RAlist_T ) > 0 and n_unitigs_1 != n_unitigs_0: - # if there are unmapped reads AND we can get more unitigs - # from last round of assembly, do assembly again - - # print (" # of RAs not mapped, will be assembled again:", n_unmapped) - n_unitigs_0 = n_unitigs_1 - # another round of assembly - unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C ) - unitigs_2nd = unmapped_ra_collection.fermi_assemble( fermiMinOverlap, opt_flag = 0x80 ) - - if unitigs_2nd: - unitig_list = self.add_to_unitig_list ( unitig_list, unitigs_2nd ) - n_unitigs_1 = len( unitig_list ) - #print " # of Unitigs:", n_unitigs_1 - #print " Map reads to unitigs" - ( unitig_alns, reference_alns, aln_scores, markup_alns ) = self.align_unitig_to_REFSEQ( unitig_list ) - self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns ) - [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list ) - n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C ) - #else: - # for r in unmapped_RAlist_T: - # print r.get_FASTQ().decode().lstrip() - - # print (" # of RAs not mapped, will be assembled again with 1/2 of fermiMinOverlap:", n_unmapped) - # another round of assembly - unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C ) - unitigs_2nd = unmapped_ra_collection.fermi_assemble( fermiMinOverlap/2, opt_flag = 0x80 ) - - if unitigs_2nd: - unitig_list = self.add_to_unitig_list ( unitig_list, unitigs_2nd ) - n_unitigs_1 = len( unitig_list ) - #print " # of Unitigs:", n_unitigs_1 - #print " Map reads to unitigs" - ( unitig_alns, reference_alns, aln_scores, markup_alns ) = self.align_unitig_to_REFSEQ( unitig_list ) - self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns ) - [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list ) - n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C ) - #else: - # for r in unmapped_RAlist_T: - # print r.get_FASTQ().decode().lstrip() - if len(unitig_list) == 0: - raise Exception("Shouldn't reach here") - # print (" # of Unitigs:", n_unitigs_1) - - if len(unitig_list) == 0: - return None - #print (" Final round: # of Unitigs:", len(unitig_list)) - #print (" Final round: # of RAs not mapped:", n_unmapped) - - start = min( self.left, self.RAs_left ) - end = max( self.right, self.RAs_right ) - - # create UnitigCollection - for i in range( len( unitig_list ) ): - #b'---------------------------AAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAA--------------' - #b'GATATAAATAGGATGTTATGAGTTTTCAAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAAAATACAAATGTATT' - tmp_unitig_aln = unitig_alns[ i ] - tmp_reference_aln = reference_alns[ i ] - tmp_unitig_seq = tmp_unitig_aln.replace(b'-',b'') - tmp_reference_seq = tmp_reference_aln.replace(b'-',b'') - - # print tmp_unitig_aln - # print tmp_reference_aln - # print tmp_unitig_seq - # print tmp_reference_aln - - # find the position on self.peak_refseq_ext - left_padding_ref = self.peak_refseq_ext.find( tmp_reference_seq ) # this number of nts should be skipped on refseq_ext from left - right_padding_ref = len(self.peak_refseq_ext) - left_padding_ref - len(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from right - - #now, decide the lpos and rpos on reference of this unitig - #first, trim left padding '-' - left_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.lstrip(b'-')) - right_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.rstrip(b'-')) - - tmp_lpos = start + left_padding_ref - tmp_rpos = end - right_padding_ref - - for j in range( left_padding_unitig ): - if tmp_reference_aln[ j ] != b'-': - tmp_lpos += 1 - for j in range( 1, right_padding_unitig + 1 ): - if tmp_reference_aln[ -j ] != b'-': - tmp_rpos -= 1 - - tmp_unitig_aln = tmp_unitig_aln[ left_padding_unitig:(len(tmp_unitig_aln)-right_padding_unitig)] - tmp_reference_aln = tmp_reference_aln[ left_padding_unitig:(len(tmp_reference_aln)-right_padding_unitig)] - - ura_list.append( UnitigRAs( self.chrom, tmp_lpos, tmp_rpos, tmp_unitig_aln, tmp_reference_aln, [RAlists_T[i], RAlists_C[i]] ) ) - - return UnitigCollection( self.chrom, self.peak, ura_list ) diff --git a/MACS3/Signal/ReadAlignment.pyx b/MACS3/Signal/ReadAlignment.py similarity index 54% rename from MACS3/Signal/ReadAlignment.pyx rename to MACS3/Signal/ReadAlignment.py index d5376350..b174ba9e 100644 --- a/MACS3/Signal/ReadAlignment.pyx +++ b/MACS3/Signal/ReadAlignment.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2021-03-10 16:21:51 Tao Liu> +# Time-stamp: <2024-10-22 15:19:55 Tao Liu> """Module for SAPPER ReadAlignment class @@ -12,18 +12,19 @@ # ------------------------------------ # python modules # ------------------------------------ -from cpython cimport bool - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - void *malloc(size_t size) - void *calloc(size_t n, size_t size) - void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) +import cython +from cython.cimports.cpython import bool + +# cdef extern from "stdlib.h": +# ctypedef unsigned int size_t +# size_t strlen(char *s) +# void *malloc(size_t size) +# void *calloc(size_t n, size_t size) +# void free(void *ptr) +# int strcmp(char *a, char *b) +# char * strcpy(char *a, char *b) +# long atol(char *bytes) +# int atoi(char *bytes) # ------------------------------------ # constants @@ -53,31 +54,35 @@ # ------------------------------------ # Classes # ------------------------------------ -cdef class ReadAlignment: - cdef: - bytes readname - bytes chrom - int lpos - int rpos - int strand # strand information. 0 means forward strand, 1 means reverse strand. - bytes binaryseq - bytes binaryqual - int l # length of read - tuple cigar # each item contains op_l|op - bytes MD - int n_edits # number of edits; higher the number, - # more differences with reference - bytes SEQ # sequence of read regarding to + strand - bytes QUAL # quality of read regarding to + strand - - def __init__ ( self, - bytes readname, - bytes chrom, int lpos, int rpos, - int strand, - bytes binaryseq, - bytes binaryqual, - tuple cigar, - bytes MD ): + + +@cython.cclass +class ReadAlignment: + readname: bytes + chrom: bytes + lpos: cython.int + rpos: cython.int + # strand information. 0 means forward strand, 1 means reverse strand. + strand: cython.int + binaryseq: bytes + binaryqual: bytes + l: cython.int # length of read + cigar: tuple # each item contains op_l|op + MD: bytes + # number of edits; higher the number, more differences with reference + n_edits: cython.int + SEQ: bytes # sequence of read regarding to + strand + QUAL: bytes # quality of read regarding to + strand + + def __init__(self, + readname: bytes, + chrom: bytes, + lpos: cython.int, rpos: cython.int, + strand: cython.int, + binaryseq: bytes, + binaryqual: bytes, + cigar: tuple, + MD: bytes): self.readname = readname self.chrom = chrom self.lpos = lpos @@ -85,34 +90,36 @@ def __init__ ( self, self.strand = strand self.binaryseq = binaryseq self.binaryqual = binaryqual - self.l = len( binaryqual ) + self.l = len(binaryqual) self.cigar = cigar self.MD = MD self.n_edits = self.get_n_edits() (self.SEQ, self.QUAL) = self.__get_SEQ_QUAL() - cdef int get_n_edits( self ): + @cython.cfunc + def get_n_edits(self) -> cython.int: """The number is from self.cigar and self.MD. """ - cdef: - int n_edits - int i, cigar_op, cigar_op_l - char c - + n_edits: cython.int + i: cython.int + cigar_op: cython.int + cigar_op_l: cython.int + c: cython.char + n_edits = 0 for i in self.cigar: # only count insertion or softclip cigar_op = i & 15 cigar_op_l = i >> 4 - if cigar_op in [ 1, 4 ]: # count Insertion or Softclip + if cigar_op in [1, 4]: # count Insertion or Softclip n_edits += cigar_op_l - + for c in self.MD: - if (c > 64 and c < 91): # either deletion in query or mismatch + if (c > 64 and c < 91): # either deletion in query or mismatch n_edits += 1 return n_edits - def __str__ ( self ): + def __str__(self): c = self.chrom.decode() n = self.readname.decode() if self.strand: @@ -121,7 +128,7 @@ def __str__ ( self ): s = "+" return f"{c}\t{self.lpos}\t{self.rpos}\t{n}\t{self.l}\t{s}" - def __getitem__ ( self, keyname ): + def __getitem__(self, keyname): if keyname == "readname": return self.readname elif keyname == "chrom": @@ -151,142 +158,128 @@ def __getitem__ ( self, keyname ): else: raise KeyError("No such key", keyname) - def __getstate__ ( self ): - return ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL ) - - def __setstate__ ( self, state ): - ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL ) = state - - # cpdef bytearray get_SEQ ( self ): - # """Convert binary seq to ascii seq. - - # Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15] + def __getstate__(self): + return (self.readname, self.chrom, self.lpos, self.rpos, self.strand, + self.binaryseq, self.binaryqual, self.l, self.cigar, + self.MD, self.n_edits, self.SEQ, self.QUAL) - # Note: In BAM, if a sequence is mapped to reverse strand, the - # reverse complement seq is written in SEQ field. So the return - # value of this function will not be the original one if the - # read is mapped to - strand. - # """ - # cdef: - # char c - # bytearray seq + def __setstate__(self, state): + (self.readname, self.chrom, self.lpos, self.rpos, self.strand, + self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, + self.n_edits, self.SEQ, self.QUAL) = state - # seq = bytearray(b"") - # for c in self.binaryseq: - # # high - # seq.append( __BAMDNACODE__[c >> 4 & 15] ) - # # low - # seq.append( __BAMDNACODE__[c & 15] ) - # if seq[-1] == b"=": - # # trim the last '=' if it exists - # seq = seq[:-1] - # return seq + @cython.cfunc + def __get_SEQ_QUAL(self) -> tuple: + """Get of: tuple (SEQ, QUAL). - cdef tuple __get_SEQ_QUAL ( self ): - """Get tuple of (SEQ, QUAL). - - Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15] + Rule: for each byte, 1st base in the highest 4bit; 2nd in the + lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15] Note: In BAM, if a sequence is mapped to reverse strand, the reverse complement seq is written in SEQ field. So the return value of this function will not be the original one if the read is mapped to - strand. If you need to original one, do reversecomp for SEQ and reverse QUAL. + """ - cdef: - int i - char c - bytearray seq - bytearray qual + i: cython.int + c: cython.char + seq: bytearray + qual: bytearray seq = bytearray(b"") qual = bytearray(b"") - for i in range( len(self.binaryseq) ): - c = self.binaryseq[ i ] + for i in range(len(self.binaryseq)): + c = self.binaryseq[i] # high - seq.append( __BAMDNACODE__[c >> 4 & 15] ) + seq.append(__BAMDNACODE__[c >> 4 & 15]) # low - seq.append( __BAMDNACODE__[c & 15] ) + seq.append(__BAMDNACODE__[c & 15]) + + for i in range(len(self.binaryqual)): + # qual is the -10log10 p or phred score. + qual.append(self.binaryqual[i]) - for i in range( len( self.binaryqual ) ): - # qual is the -10log10 p or phred score. - qual.append( self.binaryqual[i] ) - if seq[-1] == b"=": # trim the last '=' if it exists seq = seq[:-1] - assert len( seq ) == len( qual ), Exception("Lengths of seq and qual are not consistent!") + assert len(seq) == len(qual), Exception("Lengths of seq and qual are not consistent!") # Example on how to get original SEQ and QUAL: - #if self.strand: + # if self.strand: # seq.reverse() # #compliment - # seq = seq.translate( __DNACOMPLEMENT__ ) + # seq = seq.translate(__DNACOMPLEMENT__) # qual.reverse() - return ( bytes(seq), bytes(qual) ) + return (bytes(seq), bytes(qual)) - - cpdef bytes get_FASTQ ( self ): + @cython.ccall + def get_FASTQ(self) -> bytes: """Get FASTQ format text. """ - cdef: - bytes seq - bytearray qual + seq: bytes + qual: bytearray seq = self.SEQ qual = bytearray(self.QUAL) - for i in range( len( self.QUAL ) ): - # qual is the -10log10 p or phred score, to make FASTQ, we have to add 33 - qual[ i ] += 33 - + for i in range(len(self.QUAL)): + # qual is the -10log10 p or phred score, to make FASTQ, we + # have to add 33 + qual[i] += 33 + # reverse while necessary if self.strand: seq = self.SEQ[::-1] - #compliment - seq = seq.translate( __DNACOMPLEMENT__ ) + # compliment + seq = seq.translate(__DNACOMPLEMENT__) qual = qual[::-1] else: seq = self.SEQ return b"@" + self.readname + b"\n" + seq + b"\n+\n" + qual + b"\n" - cpdef bytearray get_REFSEQ ( self ): + @cython.ccall + def get_REFSEQ(self) -> bytearray: """Fetch reference sequence, using self.MD and self.cigar """ - cdef: - char c - bytearray seq, refseq - int i, cigar_op, cigar_op_l - bytearray MD_op - int ind - bool flag_del # flag for deletion event in query - - seq = bytearray(self.SEQ) # we start with read seq then make modifications + c: cython.char + seq: bytearray + i: cython.int + cigar_op: cython.int + cigar_op_l: cython.int + MD_op: bytearray + ind: cython.int + flag_del: bool # flag for deletion event in query + + # we start with read seq then make modifications + seq = bytearray(self.SEQ) # 2-step proces - # First step: use CIGAR to edit SEQ to remove S (softclip) and I (insert) + # First step: use CIGAR to edit SEQ to remove S (softclip) and + # I (insert) # __CIGARCODE__ = "MIDNSHP=X" # let ind be the index in SEQ ind = 0 for i in self.cigar: cigar_op = i & 15 cigar_op_l = i >> 4 - if cigar_op in [2, 5, 6]: # do nothing for Deletion (we will - # put sequence back in step 2), - # Hardclip and Padding + if cigar_op in [2, 5, 6]: + # do nothing for Deletion (we will + # put sequence back in step 2), + # Hardclip and Padding pass - elif cigar_op in [0, 7, 8]: # M = X alignment match (match or - # mismatch) - # do nothing and move ind + elif cigar_op in [0, 7, 8]: + # M = X alignment match (match or mismatch) do nothing + # and move ind ind += cigar_op_l - elif cigar_op in [ 1, 4 ]: # Remove for Insertion or Softclip - seq[ ind : ind + cigar_op_l ] = b'' + elif cigar_op in [1, 4]: # Remove for Insertion or Softclip + seq[ind: ind + cigar_op_l] = b'' - # now the seq should be at the same length as rpos-lpos + # now the seq should be at the same length as rpos-lpos # Second step: use MD string to edit SEQ to put back 'deleted # seqs' and modify mismatches @@ -305,12 +298,12 @@ def __setstate__ ( self, state ): # right, a mismatch should only be 1 letter surrounded # by digits. ind += int(MD_op) - seq[ ind ] = c + seq[ind] = c ind += 1 # reset MD_op MD_op = bytearray(b'') elif (c > 64 and c < 91) and flag_del: - seq[ ind:ind ] = [c,] + seq[ind:ind] = [c,] ind += 1 elif c == 94: # means Deletion in query. Now, insert a sequnce into @@ -321,62 +314,74 @@ def __setstate__ ( self, state ): MD_op = bytearray(b'') else: raise Exception("Don't understand this operator in MD: %c" % c) - #print( seq.decode() ) + # print(seq.decode()) return seq - - cpdef get_base_by_ref_pos ( self, long ref_pos ): + + @cython.ccall + def get_base_by_ref_pos(self, ref_pos: cython.long): """Get base by ref position. """ - cdef: - int relative_pos, p + relative_pos: cython.int + p: cython.int + assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") relative_pos = ref_pos - self.lpos - p = self.relative_ref_pos_to_relative_query_pos( relative_pos ) + p = self.relative_ref_pos_to_relative_query_pos(relative_pos) if p == -1: # located in a region deleted in query return None else: - return __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ] + return __BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15] - cpdef get_bq_by_ref_pos ( self, long ref_pos ): + @cython.ccall + def get_bq_by_ref_pos(self, ref_pos: cython.long): """Get base quality by ref position. Base quality is in Phred scale. Returned value is the raw Phred-scaled base quality. """ - cdef: - int relative_pos, p + relative_pos: cython.int + p: cython.int + assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") relative_pos = ref_pos - self.lpos - p = self.relative_ref_pos_to_relative_query_pos( relative_pos ) + p = self.relative_ref_pos_to_relative_query_pos(relative_pos) if p == -1: # located in a region deleted in query return None else: return self.binaryqual[p] - cpdef tuple get_base_bq_by_ref_pos ( self, long ref_pos ): - """Get base and base quality by ref position. Base quality is in Phred scale. + @cython.ccall + def get_base_bq_by_ref_pos(self, ref_pos: cython.long) -> tuple: + """Get base and base quality by ref position. Base quality is + in Phred scale. Returned bq is the raw Phred-scaled base quality. + """ - cdef: - int relative_pos, p + relative_pos: cython.int + p: cython.int + assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") relative_pos = ref_pos - self.lpos - p = self.relative_ref_pos_to_relative_query_pos( relative_pos ) + p = self.relative_ref_pos_to_relative_query_pos(relative_pos) if p == -1: # located in a region deleted in query return None else: - return ( __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ], self.binaryqual[p] ) + return (__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15], + self.binaryqual[p]) - cpdef tuple get_variant_bq_by_ref_pos ( self, long ref_pos ): - """Get any variants (different with reference) and base quality by ref position. + @cython.ccall + def get_variant_bq_by_ref_pos(self, + ref_pos: cython.long) -> tuple: + """Get any variants (different with reference) and base + quality by ref position. - variants will be + variants will be 1) =, if identical @@ -390,40 +395,43 @@ def __setstate__ ( self, state ): Base quality is the raw Phred-scaled base quality. """ - cdef: - int i, m, n - int res, p, op, op_l - int pos - bool tip - bytearray refseq - bytes p_refseq, p_seq - bytearray seq_array - bytearray bq_array + i: cython.int + m: cython.int + n: cython.int + res: cython.int + p: cython.int + op: cython.int + op_l: cython.int + pos: cython.int + tip: bool + refseq: bytearray + p_refseq: bytes + seq_array: bytearray + bq_array: bytearray assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") res = ref_pos - self.lpos # residue p = 0 - refseq = self.get_REFSEQ() - p_refseq = refseq[ res ] + # p_refseq = refseq[res] # -- CIGAR CODE -- - #OP BAM Description - #M 0 alignment match (can be a sequence match or mismatch) insertion to the reference - #I 1 insertion to the reference - #D 2 deletion from the reference - #N 3 skipped region from the reference - #S 4 soft clipping (clipped sequences present in SEQ) - #H 5 hard clipping (clipped sequences NOT present in SEQ) - #P 6 padding (silent deletion from padded reference) - #= 7 sequence match - #X 8 sequence mismatch - - seq_array = bytearray( b'' ) - bq_array = bytearray( b'' ) - - for m in range( len(self.cigar) ): - i = self.cigar[ m ] + # OP BAM Description + # M 0 alignment match (can be a sequence match or mismatch) insertion to the reference + # I 1 insertion to the reference + # D 2 deletion from the reference + # N 3 skipped region from the reference + # S 4 soft clipping (clipped sequences present in SEQ) + # H 5 hard clipping (clipped sequences NOT present in SEQ) + # P 6 padding (silent deletion from padded reference) + # = 7 sequence match + # X 8 sequence mismatch + + seq_array = bytearray(b'') + bq_array = bytearray(b'') + + for m in range(len(self.cigar)): + i = self.cigar[m] op = i & 15 op_l = i >> 4 if op in [0, 7, 8]: # M = X alignment match (match or mismatch) @@ -432,98 +440,101 @@ def __setstate__ ( self, state ): p += res # find the position, now get the ref pos = p - seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - bq_array.append( self.binaryqual[ p ] ) + seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15]) + bq_array.append(self.binaryqual[p]) break elif res == op_l - 1: p += res pos = p - seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - bq_array.append( self.binaryqual[ p ] ) + seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15]) + bq_array.append(self.binaryqual[p]) # now add any insertion later on # get next cigar - if m + 1 == len( self.cigar ): + if m + 1 == len(self.cigar): break - i = self.cigar[ m + 1 ] + i = self.cigar[m + 1] op = i & 15 op_l = i >> 4 - if op == 1: #insertion - for n in range( op_l ): + if op == 1: # insertion + for n in range(op_l): p += 1 - seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - bq_array.append( self.binaryqual[ p ] ) - #print self.SEQ, seq_array + seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15]) + bq_array.append(self.binaryqual[p]) + # prself: cython.int.SEQ, seq_array break else: # go to the next cigar code p += op_l res -= op_l - elif op in [ 2, 3 ]: # D N + elif op in [2, 3]: # D N if res < op_l: # find the position, however ... - # position located in a region in reference that not exists in query + # position located in a region in reference that + # not exists in query pos = p - seq_array.append( b'*' ) - bq_array.append( 93 ) #assign 93 for deletion + seq_array.append(b'*') + bq_array.append(93) # assign 93 for deletion break else: # go to the next cigar code res -= op_l - elif op == 1 : # Insertion + elif op == 1: # Insertion p += op_l # if res == 0: # no residue left, so return a chunk of inserted sequence # print "shouldn't run this code" # # first, add the insertion point - # seq_array = bytearray( b'~' ) - # bq_array.append( self.binaryqual[ p ] ) + # seq_array = bytearray(b'~') + # bq_array.append(self.binaryqual[p]) # # then add the inserted seq - # for i in range( op_l ): + # for i in range(op_l): # p += 1 - # seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - # bq_array.append( self.binaryqual[ p ] ) + # seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p%2)*4)) & 15] ) + # bq_array.append(self.binaryqual[p]) # break # else: # p += op_l - elif op == 4 : # Softclip. If it's Softclip, we'd better not return the extra seq + elif op == 4: # Softclip. If it's Softclip, we'd better not return the extra seq p += op_l if pos == 0 or pos == self.l - 1: tip = True else: tip = False - - return ( seq_array, bq_array, self.strand, tip, pos ) + return (seq_array, bq_array, self.strand, tip, pos) # last position ? - #raise Exception("Not expected to see this") + # raise Exception("Not expected to see this") - cdef int relative_ref_pos_to_relative_query_pos ( self, long relative_ref_pos ): + @cython.cfunc + def relative_ref_pos_to_relative_query_pos(self, + relative_ref_pos: cython.long) -> cython.int: """Convert relative pos on ref to pos on query. """ - cdef: - int p, res, op, op_l + p: cython.int + res: cython.int + op: cython.int + op_l: cython.int + p = 0 res = relative_ref_pos - + for i in self.cigar: op = i & 15 op_l = i >> 4 - if op in [0, 7, 8]: # M = X alignment match (match or mismatch) + if op in [0, 7, 8]: + # M = X alignment match (match or mismatch) if res < op_l: p += res return p else: p += op_l res -= op_l - elif op in [ 2, 3 ]: # D N + elif op in [2, 3]: # D N if res < op_l: - # position located in a region in reference that not exists in query + # position located in a region in reference that + # not exists in query return -1 else: res -= op_l - elif op in [ 1, 4 ]: # I + elif op in [1, 4]: # I p += op_l return p - - -### End ### - diff --git a/MACS3/Signal/UnitigRACollection.py b/MACS3/Signal/UnitigRACollection.py new file mode 100644 index 00000000..0c3f31e8 --- /dev/null +++ b/MACS3/Signal/UnitigRACollection.py @@ -0,0 +1,324 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 17:14:11 Tao Liu> + +"""Module + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). +""" +# ------------------------------------ +# python modules +# ------------------------------------ +from operator import itemgetter + +from MACS3.Signal.ReadAlignment import ReadAlignment +from MACS3.Signal.PosReadsInfo import PosReadsInfo +from MACS3.IO.PeakIO import PeakIO + +import cython +from cython.cimports.cpython import bool + +# ------------------------------------ +# constants +# ------------------------------------ +__version__ = "Parser $Revision$" +__author__ = "Tao Liu " +__doc__ = "All Parser classes" + +__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. + +__CIGARCODE__ = "MIDNSHP=X" + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class UnitigRAs: + """ + """ + RAlists: list # [RAlists_T, RAlists_C] + seq: bytes + unitig_aln: bytes + reference_aln: bytes + chrom: bytes + lpos: cython.long + rpos: cython.long + unitig_length: cython.long + reference_length: cython.long + aln_length: cython.long + + def __init__(self, + chrom: bytes, + lpos: cython.long, + rpos: cython.long, + unitig_aln: bytes, + reference_aln: bytes, + RAlists: list): + assert len(unitig_aln) == len(reference_aln), Exception("aln on unitig and reference should be the same length!") + self.chrom = chrom + self.lpos = lpos + self.rpos = rpos + self.unitig_aln = unitig_aln + self.reference_aln = reference_aln + self.RAlists = RAlists + # fill in other information + self.seq = self.unitig_aln.replace(b'-', b'') + self.unitig_length = len(self.seq) + self.reference_length = rpos - lpos + self.aln_length = len(unitig_aln) + + def __getitem__(self, keyname): + if keyname == "chrom": + return self.chrom + elif keyname == "lpos": + return self.lpos + elif keyname == "rpos": + return self.rpos + elif keyname == "seq": + return self.seq + elif keyname == "unitig_aln": + return self.unitig_aln + elif keyname == "reference_aln": + return self.reference_aln + elif keyname == "unitig_length": + return self.unitig_length + elif keyname == "reference_length": + return self.reference_length + elif keyname == "aln_length": + return self.aln_length + elif keyname == "count": + return len(self.RAlists[0]) + len(self.RAlists[1]) + else: + raise KeyError("Unavailable key:", keyname) + + def __getstate__(self): + return (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, + self.chrom, self.lpos, self.rpos, self.unitig_length, + self.reference_length, self.aln_length) + + def __setstate__(self, state): + (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, + self.chrom, self.lpos, self.rpos, self.unitig_length, + self.reference_length, self.aln_length) = state + + @cython.ccall + def get_variant_bq_by_ref_pos(self, + ref_pos: cython.long) -> tuple: + """ + return (s, bq_list_t, bq_list_c, strand_list_t, strand_list_c) + """ + i: cython.long + index_aln: cython.long + index_unitig: cython.long + residue: cython.long + ra: ReadAlignment + s: bytes + bq_list_t: list = [] + bq_list_c: list = [] + strand_list_t: list = [] + strand_list_c: list = [] + tip_list_t: list = [] + pos_list_t: list = [] + pos_list_c: list = [] + ra_seq: bytes + ra_pos: cython.long + l_read: cython.int + + # b'TTATTAGAAAAAAT' find = 2 + # b'AAAAATCCCACAGG' + # b'TTTTATTAGAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' + # b'TTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' lpos=100 + # | | | + # genome 108 113 120 + # aln 8 13 21 + # unitig 8 13 21 + # ref 8 13 20 + # read1 6 11 + # read2 3 11 + # find the position + residue = ref_pos - self.lpos + 1 + index_aln = 0 + for i in range(self.aln_length): + if self.reference_aln[i] != 45: # 45 means b'-' + residue -= 1 + if residue == 0: + break + index_aln += 1 + + # index_aln should be the position on aln + s = self.unitig_aln[index_aln:index_aln+1] + # find the index on unitig + index_unitig = len(self.unitig_aln[:index_aln+1].replace(b'-', b'')) + + if s == b'-': # deletion + for ra in self.RAlists[0]: + ra_seq = ra["SEQ"] + l_read = ra["l"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + if ra_pos == 0 or ra_pos == l_read - 1: + tip_list_t.append(True) + else: + tip_list_t.append(False) + bq_list_t.append(93) + strand_list_t.append(ra["strand"]) + pos_list_t.append(ra_pos) + for ra in self.RAlists[1]: + ra_seq = ra["SEQ"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + bq_list_c.append(93) + strand_list_c.append(ra["strand"]) + pos_list_c.append(ra_pos) + return (bytearray(b'*'), bq_list_t, bq_list_c, strand_list_t, + strand_list_c, tip_list_t, pos_list_t, pos_list_c) + + if index_aln < self.aln_length - 1: + for i in range(index_aln + 1, self.aln_length): + if self.reference_aln[i] == 45: # insertion detected, 45 means b'-' + s += self.unitig_aln[i:i+1] # we extend the s string to contain the inserted seq + else: + break + + for ra in self.RAlists[0]: # treatment + ra_seq = ra["SEQ"] + l_read = ra["l"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + if ra_pos < l_read and ra_pos >= 0: + pos_list_t.append(ra_pos) + if ra_pos == 0 or ra_pos == l_read - 1: + tip_list_t.append(True) + else: + tip_list_t.append(False) + bq_list_t.append(ra["binaryqual"][ra_pos]) + strand_list_t.append(ra["strand"]) + + for ra in self.RAlists[1]: # control + ra_seq = ra["SEQ"] + l_read = ra["l"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + if ra_pos < l_read and ra_pos >= 0: + pos_list_c.append(ra_pos) + bq_list_c.append(ra["binaryqual"][ra_pos]) + strand_list_c.append(ra["strand"]) + + return (bytearray(s), bq_list_t, bq_list_c, strand_list_t, + strand_list_c, tip_list_t, pos_list_t, pos_list_c) + + +@cython.cclass +class UnitigCollection: + """A collection of ReadAlignment objects and the corresponding + PeakIO. + + """ + chrom: bytes + peak: PeakIO # A PeakIO object + URAs_list: list + left: cython.long # left position of peak + right: cython.long # right position of peak + length: cython.long # length of peak + URAs_left: cython.long # left position of all RAs in the collection + URAs_right: cython.long # right position of all RAs in the collection + is_sorted: bool # if sorted by lpos + + def __init__(self, + chrom: bytes, + peak: PeakIO, + URAs_list: list = []): + self.chrom = chrom + self.peak = peak + self.URAs_list = URAs_list + self.left = peak["start"] + self.right = peak["end"] + self.length = self.right - self.left + self.URAs_left = URAs_list[0]["lpos"] # initial assignment of RAs_left + self.URAs_right = URAs_list[-1]["rpos"] # initial assignment of RAs_right + self.sort() # it will set self.is_sorted = True + # check RAs_left and RAs_right + for ura in URAs_list: + if ura["lpos"] < self.URAs_left: + self.URAs_left = ura["lpos"] + if ura["rpos"] > self.URAs_right: + self.URAs_right = ura["rpos"] + + def __getitem__(self, keyname): + if keyname == "chrom": + return self.chrom + elif keyname == "left": + return self.left + elif keyname == "right": + return self.right + elif keyname == "URAs_left": + return self.URAs_left + elif keyname == "URAs_right": + return self.URAs_right + elif keyname == "length": + return self.length + elif keyname == "count": + return len(self.URAs_list) + elif keyname == "URAs_list": + return self.URAs_list + else: + raise KeyError("Unavailable key:", keyname) + + def __getstate__(self): + return (self.chrom, self.peak, self.URAs_list, self.left, self.right, + self.length, self.URAs_left, self.URAs_right, self.is_sorted) + + def __setstate__(self, state): + (self.chrom, self.peak, self.URAs_list, self.left, self.right, + self.length, self.URAs_left, self.URAs_right, self.is_sorted) = state + + @cython.ccall + def sort(self): + """Sort RAs according to lpos. Should be used after realignment. + + """ + self.URAs_list.sort(key=itemgetter("lpos")) + self.is_sorted = True + return + + @cython.ccall + def get_PosReadsInfo_ref_pos(self, + ref_pos: cython.long, + ref_nt: bytes, + Q: cython.int = 20): + """Generate a PosReadsInfo for: object a given reference genome + position. + + Return a PosReadsInfo object. + + """ + s: bytearray + bq_list_t: list + bq_list_c: list + strand_list_t: list + strand_list_c: list + tip_list_t: list + pos_list_t: list + pos_list_c: list + ura: object + i: cython.int + posreadsinfo_p: PosReadsInfo + + posreadsinfo_p = PosReadsInfo(ref_pos, ref_nt) + for i in range(len(self.URAs_list)): + ura = self.URAs_list[i] + if ura["lpos"] <= ref_pos and ura["rpos"] > ref_pos: + (s, bq_list_t, bq_list_c, strand_list_t, strand_list_c, + tip_list_t, pos_list_t, pos_list_c) = ura.get_variant_bq_by_ref_pos(ref_pos) + for i in range(len(bq_list_t)): + posreadsinfo_p.add_T(i, bytes(s), bq_list_t[i], + strand_list_t[i], tip_list_t[i], Q=Q) + for i in range(len(bq_list_c)): + posreadsinfo_p.add_C(i, bytes(s), bq_list_c[i], + strand_list_c[i], Q=Q) + + return posreadsinfo_p diff --git a/MACS3/Signal/UnitigRACollection.pyx b/MACS3/Signal/UnitigRACollection.pyx deleted file mode 100644 index 33051ffd..00000000 --- a/MACS3/Signal/UnitigRACollection.pyx +++ /dev/null @@ -1,309 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2022-02-18 11:44:57 Tao Liu> - -"""Module for SAPPER BAMParser class - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). -""" -# ------------------------------------ -# python modules -# ------------------------------------ -from collections import Counter -from operator import itemgetter -from copy import copy - -from MACS3.Signal.ReadAlignment import ReadAlignment -from MACS3.Signal.PosReadsInfo import PosReadsInfo -from MACS3.IO.PeakIO import PeakIO - -from cpython cimport bool - -import numpy as np -cimport numpy as np -from numpy cimport uint32_t, uint64_t, int32_t, int64_t - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - void *malloc(size_t size) - void *calloc(size_t n, size_t size) - void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "Parser $Revision$" -__author__ = "Tao Liu " -__doc__ = "All Parser classes" - -__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. - -__CIGARCODE__ = "MIDNSHP=X" - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class UnitigRAs: - """ - """ - cdef: - list RAlists # [RAlists_T, RAlists_C] - bytes seq - bytes unitig_aln - bytes reference_aln - bytes chrom - long lpos - long rpos - long unitig_length - long reference_length - long aln_length - - def __init__ ( self, bytes chrom, long lpos, long rpos, bytes unitig_aln, bytes reference_aln, list RAlists ): - assert len( unitig_aln )==len( reference_aln ), Exception("aln on unitig and reference should be the same length!") - self.chrom = chrom - self.lpos = lpos - self.rpos = rpos - self.unitig_aln = unitig_aln - self.reference_aln = reference_aln - self.RAlists = RAlists - # fill in other information - self.seq = self.unitig_aln.replace(b'-',b'') - self.unitig_length = len( self.seq ) - self.reference_length = rpos - lpos - self.aln_length = len( unitig_aln ) - - def __getitem__ ( self, keyname ): - if keyname == "chrom": - return self.chrom - elif keyname == "lpos": - return self.lpos - elif keyname == "rpos": - return self.rpos - elif keyname == "seq": - return self.seq - elif keyname == "unitig_aln": - return self.unitig_aln - elif keyname == "reference_aln": - return self.reference_aln - elif keyname == "unitig_length": - return self.unitig_length - elif keyname == "reference_length": - return self.reference_length - elif keyname == "aln_length": - return self.aln_length - elif keyname == "count": - return len( self.RAlists[0] ) + len( self.RAlists[1] ) - else: - raise KeyError("Unavailable key:", keyname) - - def __getstate__ ( self ): - return (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, self.chrom, self.lpos, self.rpos, self.unitig_length, self.reference_length, self.aln_length ) - - def __setstate__ ( self, state ): - (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, self.chrom, self.lpos, self.rpos, self.unitig_length, self.reference_length, self.aln_length ) = state - - - cpdef tuple get_variant_bq_by_ref_pos( self, long ref_pos ): - """ - - return ( s, bq_list_t, bq_list_c, strand_list_t, strand_list_c ) - """ - cdef: - long i - long index_aln - long index_unitig - long residue - object ra - bytes s - list bq_list_t = [] - list bq_list_c = [] - list strand_list_t = [] - list strand_list_c = [] - list tip_list_t = [] - list pos_list_t = [] - list pos_list_c = [] - bytes ra_seq - long ra_pos - int p_seq - int l_read - - # b'TTATTAGAAAAAAT' find = 2 - # b'AAAAATCCCACAGG' - #b'TTTTATTAGAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' - #b'TTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' lpos=100 - # | | | - #genome 108 113 120 - #aln 8 13 21 - #unitig 8 13 21 - #ref 8 13 20 - #read1 6 11 - #read2 3 11 - # find the position - residue = ref_pos - self.lpos + 1 - index_aln = 0 - for i in range( self.aln_length ): - if self.reference_aln[ i ] != 45: # 45 means b'-' - residue -= 1 - if residue == 0: - break - index_aln += 1 - - # index_aln should be the position on aln - s = self.unitig_aln[ index_aln:index_aln+1 ] - # find the index on unitig - index_unitig = len( self.unitig_aln[:index_aln+1].replace(b'-',b'') ) - - if s == b'-': #deletion - for ra in self.RAlists[ 0 ]: - ra_seq = ra["SEQ"] - l_read = ra["l"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - if ra_pos == 0 or ra_pos == l_read -1: - tip_list_t.append( True ) - else: - tip_list_t.append( False ) - bq_list_t.append( 93 ) - strand_list_t.append( ra["strand"] ) - pos_list_t.append( ra_pos ) - for ra in self.RAlists[ 1 ]: - ra_seq = ra["SEQ"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - bq_list_c.append( 93 ) - strand_list_c.append( ra["strand"] ) - pos_list_c.append( ra_pos ) - return ( bytearray(b'*'), bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c ) - - if index_aln < self.aln_length - 1: - for i in range( index_aln + 1, self.aln_length ): - if self.reference_aln[ i ] == 45: #insertion detected, 45 means b'-' - s += self.unitig_aln[ i:i+1 ] # we extend the s string to contain the inserted seq - else: - break - - for ra in self.RAlists[0]: #treatment - ra_seq = ra["SEQ"] - l_read = ra["l"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - if ra_pos < l_read and ra_pos >= 0: - pos_list_t.append( ra_pos ) - if ra_pos == 0 or ra_pos == l_read -1: - tip_list_t.append( True ) - else: - tip_list_t.append( False ) - bq_list_t.append( ra["binaryqual"][ra_pos] ) - strand_list_t.append( ra["strand"] ) - - for ra in self.RAlists[1]: #control - ra_seq = ra["SEQ"] - l_read = ra["l"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - if ra_pos < l_read and ra_pos >= 0: - pos_list_c.append( ra_pos ) - bq_list_c.append( ra["binaryqual"][ra_pos] ) - strand_list_c.append( ra["strand"] ) - - return (bytearray(s), bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c ) - -cdef class UnitigCollection: - """A collection of ReadAlignment objects and the corresponding - PeakIO. - - """ - cdef: - bytes chrom - object peak # A PeakIO object - list URAs_list - long left # left position of peak - long right # right position of peak - long length # length of peak - long URAs_left # left position of all RAs in the collection - long URAs_right # right position of all RAs in the collection - bool sorted # if sorted by lpos - - def __init__ ( self, chrom, peak, URAs_list=[] ): - self.chrom = chrom - self.peak = peak - self.URAs_list = URAs_list - self.left = peak["start"] - self.right = peak["end"] - self.length = self.right - self.left - self.URAs_left = URAs_list[ 0 ]["lpos"] # initial assignment of RAs_left - self.URAs_right = URAs_list[-1]["rpos"] # initial assignment of RAs_right - self.sort() # it will set self.sorted = True - # check RAs_left and RAs_right - for ura in URAs_list: - if ura[ "lpos" ] < self.URAs_left: - self.URAs_left = ura[ "lpos" ] - if ura[ "rpos" ] > self.URAs_right: - self.URAs_right = ura[ "rpos" ] - - def __getitem__ ( self, keyname ): - if keyname == "chrom": - return self.chrom - elif keyname == "left": - return self.left - elif keyname == "right": - return self.right - elif keyname == "URAs_left": - return self.URAs_left - elif keyname == "URAs_right": - return self.URAs_right - elif keyname == "length": - return self.length - elif keyname == "count": - return len( self.URAs_list ) - elif keyname == "URAs_list": - return self.URAs_list - else: - raise KeyError("Unavailable key:", keyname) - - def __getstate__ ( self ): - return (self.chrom, self.peak, self.URAs_list, self.left, self.right, self.length, self.URAs_left, self.URAs_right, self.sorted) - - def __setstate__ ( self, state ): - (self.chrom, self.peak, self.URAs_list, self.left, self.right, self.length, self.URAs_left, self.URAs_right, self.sorted) = state - - cpdef sort ( self ): - """Sort RAs according to lpos. Should be used after realignment. - - """ - self.URAs_list.sort(key=itemgetter("lpos")) - self.sorted = True - return - - cpdef object get_PosReadsInfo_ref_pos ( self, long ref_pos, bytes ref_nt, int Q=20 ): - """Generate a PosReadsInfo object for a given reference genome - position. - - Return a PosReadsInfo object. - - """ - cdef: - bytearray s, bq - list bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c - object ura - int i - - posreadsinfo_p = PosReadsInfo( ref_pos, ref_nt ) - for i in range( len( self.URAs_list ) ): - ura = self.URAs_list[ i ] - if ura[ "lpos" ] <= ref_pos and ura[ "rpos" ] > ref_pos: - ( s, bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c ) = ura.get_variant_bq_by_ref_pos( ref_pos ) - for i in range( len(bq_list_t) ): - posreadsinfo_p.add_T( i, bytes(s), bq_list_t[i], strand_list_t[i], tip_list_t[i], Q=Q ) - for i in range( len(bq_list_c) ): - posreadsinfo_p.add_C( i, bytes(s), bq_list_c[i], strand_list_c[i], Q=Q ) - - return posreadsinfo_p diff --git a/MACS3/Signal/VariantStat.py b/MACS3/Signal/VariantStat.py new file mode 100644 index 00000000..b154a03d --- /dev/null +++ b/MACS3/Signal/VariantStat.py @@ -0,0 +1,549 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 14:47:22 Tao Liu> + +"""Module for SAPPER BAMParser class + +Copyright (c) 2017 Tao Liu + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). + +@status: experimental +@version: $Revision$ +@author: Tao Liu +@contact: tliu4@buffalo.edu +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +import cython + +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool + +from math import log1p, exp, log + +LN10 = 2.3025850929940458 +LN10_tenth = 0.23025850929940458 + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.ccall +def CalModel_Homo(top1_bq_T: cnp.ndarray(cython.int, ndim=1), + top1_bq_C: cnp.ndarray(cython.int, ndim=1), + top2_bq_T: cnp.ndarray(cython.int, ndim=1), + top2_bq_C: cnp.ndarray(cython.int, ndim=1)) -> tuple: + """Return (lnL, BIC). + + """ + i: cython.int + lnL: cython.double + BIC: cython.double + + lnL = 0 + # Phred score is Phred = -10log_{10} E, where E is the error rate. + # to get the 1-E: 1-E = 1-exp(Phred/-10*M_LN10) = 1-exp(Phred * -LOG10_E_tenth) + for i in range(top1_bq_T.shape[0]): + lnL += log1p(-exp(-top1_bq_T[i]*LN10_tenth)) + for i in range(top1_bq_C.shape[0]): + lnL += log1p(-exp(-top1_bq_C[i]*LN10_tenth)) + + for i in range(top2_bq_T.shape[0]): + lnL += log(exp(-top2_bq_T[i]*LN10_tenth)) + for i in range(top2_bq_C.shape[0]): + lnL += log(exp(-top2_bq_C[i]*LN10_tenth)) + + BIC = -2*lnL # no free variable, no penalty + return (lnL, BIC) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.ccall +def CalModel_Heter_noAS(top1_bq_T: cnp.ndarray(cython.int, ndim=1), + top1_bq_C: cnp.ndarray(cython.int, ndim=1), + top2_bq_T: cnp.ndarray(cython.int, ndim=1), + top2_bq_C: cnp.ndarray(cython.int, ndim=1)) -> tuple: + """Return (lnL, BIC) + + k_T + k_C + """ + k_T: cython.int + k_C: cython.int + lnL: cython.double + BIC: cython.double + tn_T: cython.int + tn_C: cython.int + # tn: cython.int # total observed NTs + lnL_T: cython.double + lnL_C: cython.double # log likelihood for treatment and control + + lnL = 0 + BIC = 0 + # for k_T + # total oberseved treatment reads from top1 and top2 NTs + tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] + + if tn_T == 0: + raise Exception("Total number of treatment reads is 0!") + else: + (lnL_T, k_T) = GreedyMaxFunctionNoAS(top1_bq_T.shape[0], + top2_bq_T.shape[0], + tn_T, + top1_bq_T, + top2_bq_T) + lnL += lnL_T + BIC += -2*lnL_T + + # for k_C + tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] + + if tn_C == 0: + pass + else: + (lnL_C, k_C) = GreedyMaxFunctionNoAS(top1_bq_C.shape[0], + top2_bq_C.shape[0], + tn_C, + top1_bq_C, + top2_bq_C) + lnL += lnL_C + BIC += -2*lnL_C + + # tn = tn_C + tn_T + + # we penalize big model depending on the number of reads/samples + if tn_T == 0: + BIC += log(tn_C) + elif tn_C == 0: + BIC += log(tn_T) + else: + BIC += log(tn_T) + log(tn_C) + + return (lnL, BIC) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.ccall +def CalModel_Heter_AS(top1_bq_T: cnp.ndarray(cython.int, ndim=1), + top1_bq_C: cnp.ndarray(cython.int, ndim=1), + top2_bq_T: cnp.ndarray(cython.int, ndim=1), + top2_bq_C: cnp.ndarray(cython.int, ndim=1), + max_allowed_ar: cython.float = 0.99) -> tuple: + """Return (lnL, BIC) + + kc + ki + AS_alleleratio + """ + k_T: cython.int + k_C: cython.int + lnL: cython.double + BIC: cython.double + tn_T: cython.int + tn_C: cython.int + # tn: cython.int # total observed NTs + lnL_T: cython.double + lnL_C: cython.double # log likelihood for treatment and control + AS_alleleratio: cython.double # allele ratio + + lnL = 0 + BIC = 0 + + # Treatment + tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] + + if tn_T == 0: + raise Exception("Total number of treatment reads is 0!") + else: + (lnL_T, k_T, AS_alleleratio) = GreedyMaxFunctionAS(top1_bq_T.shape[0], + top2_bq_T.shape[0], + tn_T, + top1_bq_T, + top2_bq_T, + max_allowed_ar) + # print ">>>",lnL_T, k_T, AS_alleleratio + lnL += lnL_T + BIC += -2*lnL_T + + # control + tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] + + if tn_C == 0: + pass + else: + # We assume control will not have allele preference + (lnL_C, k_C) = GreedyMaxFunctionNoAS(top1_bq_C.shape[0], + top2_bq_C.shape[0], + tn_C, + top1_bq_C, + top2_bq_C) + lnL += lnL_C + BIC += -2*lnL_C + + # we penalize big model depending on the number of reads/samples + if tn_T == 0: + BIC += log(tn_C) + elif tn_C == 0: + BIC += 2 * log(tn_T) + else: + BIC += 2 * log(tn_T) + log(tn_C) + + return (lnL, BIC) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cfunc +def GreedyMaxFunctionAS(m: cython.int, + n: cython.int, + tn: cython.int, + me: cnp.ndarray(cython.int, ndim=1), + ne: cnp.ndarray(cython.int, ndim=1), + max_allowed_ar: cython.float = 0.99) -> tuple: + """Return lnL, k and alleleratio in tuple. + + Note: I only translate Liqing's C++ code into pyx here. Haven't + done any review. + + """ + dnew: cython.double + dold: cython.double + rold: cython.double + rnew: cython.double + kold: cython.int + knew: cython.int + btemp: bool + k0: cython.int + dl: cython.double + dr: cython.double + d0: cython.double + d1l: cython.double + d1r: cython.double + + assert m+n == tn + btemp = False + if tn == 1: # only 1 read; I don't expect this to be run... + dl = calculate_ln(m, n, tn, me, ne, 0, 0) + dr = calculate_ln(m, n, tn, me, ne, 1, 1) + + if dl > dr: + k = 0 + return (dl, 0, 0) + else: + k = 1 + return (dr, 1, 1) + elif m == 0: # no top1 nt + return (calculate_ln(m, n, tn, me, ne, 0, m, max_allowed_ar), + m, + 1-max_allowed_ar) + # k0 = m + 1 + elif m == tn: # all reads are top1 + return (calculate_ln(m, n, tn, me, ne, 1, m, max_allowed_ar), + m, + max_allowed_ar) + else: + k0 = m + + d0 = calculate_ln(m, n, tn, me, ne, float(k0)/tn, k0, max_allowed_ar) + d1l = calculate_ln(m, n, tn, me, ne, float(k0-1)/tn, k0-1, max_allowed_ar) + d1r = calculate_ln(m, n, tn, me, ne, float(k0+1)/tn, k0+1, max_allowed_ar) + + if d0 > d1l-1e-8 and d0 > d1r-1e-8: + k = k0 + ar = float(k0)/tn + return (d0, k, ar) + elif d1l > d0: + dold = d1l + kold = k0-1 + rold = float(k0-1)/tn + while kold > 1: # disable: when kold=1 still run, than knew=0 is the final run + knew = kold - 1 + rnew = float(knew)/tn + + dnew = calculate_ln(m, + n, + tn, + me, + ne, + rnew, + knew, + max_allowed_ar) + + if (dnew-1e-8 < dold): + btemp = True + break + kold = knew + dold = dnew + rold = rnew + + if btemp: # maximum L value is in [1,m-1]; + k = kold + ar = rold + return (dold, k, ar) + else: # L(k=0) is the max for [0,m-1] + k = kold + ar = rold + return (dold, k, ar) + + elif d1r > d0: + dold = d1r + kold = k0 + 1 + rold = float(k0 + 1)/tn + while kold < tn - 1: # //disable: when kold=tn-1 still run, than knew=tn is the final run + knew = kold + 1 + + rnew = float(knew)/tn + + dnew = calculate_ln(m, + n, + tn, + me, + ne, + rnew, + knew, + max_allowed_ar) + + if dnew - 1e-8 < dold: + btemp = True + break + kold = knew + dold = dnew + rold = rnew + + if btemp: # maximum L value is in [m+1,tn-1] + k = kold + ar = rold + return (dold, k, ar) + else: # L(k=tn) is the max for [m+1,tn] + k = kold + ar = rold + return (dold, k, ar) + else: + raise Exception("error in GreedyMaxFunctionAS") + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cfunc +def GreedyMaxFunctionNoAS(m: cython.int, + n: cython.int, + tn: cython.int, + me: cnp.ndarray(cython.int, ndim=1), + ne: cnp.ndarray(cython.int, ndim=1)) -> tuple: + """Return lnL, and k in tuple. + + Note: I only translate Liqing's C++ code into pyx here. Haven't + done any review. + + """ + dnew: cython.double + dold: cython.double + kold: cython.int + knew: cython.int + btemp: bool + k0: cython.int + bg_r: cython.double + dl: cython.double + dr: cython.double + d0: cython.double + d1l: cython.double + d1r: cython.double + + btemp = False + bg_r = 0.5 + + if tn == 1: + dl = calculate_ln(m, n, tn, me, ne, bg_r, 0) + dr = calculate_ln(m, n, tn, me, ne, bg_r, 1) + if dl > dr: + k = 0 + return (dl, 0) + else: + k = 1 + return (dr, 1) + elif m == 0: # no top1 nt + return (calculate_ln(m, n, tn, me, ne, bg_r, m), m) + # k0 = m + 1 + elif m == tn: # all reads are top1 + return (calculate_ln(m, n, tn, me, ne, bg_r, m), m) + # elif m == 0: + # k0 = m + 1 + # elif m == tn: + # k0 = m - 1 + else: + k0 = m + + d0 = calculate_ln(m, n, tn, me, ne, bg_r, k0) + d1l = calculate_ln(m, n, tn, me, ne, bg_r, k0 - 1) + d1r = calculate_ln(m, n, tn, me, ne, bg_r, k0 + 1) + + if d0 > d1l - 1e-8 and d0 > d1r - 1e-8: + k = k0 + return (d0, k) + elif d1l > d0: + dold = d1l + kold = k0 - 1 + while kold >= 1: # //when kold=1 still run, than knew=0 is the final run + knew = kold - 1 + dnew = calculate_ln(m, n, tn, me, ne, bg_r, knew) + if dnew - 1e-8 < dold: + btemp = True + break + kold = knew + dold = dnew + + if btemp: # //maximum L value is in [1,m-1]; + k = kold + return (dold, k) + else: # //L(k=0) is the max for [0,m-1] + k = kold + return (dold, k) + elif d1r > d0: + dold = d1r + kold = k0 + 1 + while kold <= tn - 1: # //when kold=tn-1 still run, than knew=tn is the final run + knew = kold + 1 + dnew = calculate_ln(m, n, tn, me, ne, bg_r, knew) + if dnew - 1e-8 < dold: + btemp = True + break + kold = knew + dold = dnew + + if btemp: # //maximum L value is in [m+1,tn-1] + k = kold + return (dold, k) + else: # //L(k=tn) is the max for [m+1,tn] + k = kold + return (dold, k) + else: + raise Exception("error in GreedyMaxFunctionNoAS") + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cfunc +def calculate_ln(m: cython.int, + n: cython.int, + tn: cython.int, + me: cnp.ndarray(cython.int, ndim=1), + ne: cnp.ndarray(cython.int, ndim=1), + r: cython.double, + k: cython.int, + max_allowed_r: cython.float = 0.99): + """Calculate log likelihood given quality of top1 and top2, the + ratio r and the observed k. + + """ + i: cython.int + lnL: cython.double + e: cython.double + + lnL = 0 + + # r is extremely high or + if r > max_allowed_r or r < 1 - max_allowed_r: + lnL += k*log(max_allowed_r) + (tn-k)*log(1 - max_allowed_r) + else: + lnL += k*log(r) + (tn-k)*log(1-r) + + # it's entirely biased toward 1 allele + if k == 0 or k == tn: + pass + elif k <= tn/2: + for i in range(k): + lnL += log(float(tn-i)/(k-i)) + else: + for i in range(tn-k): + lnL += log(float(tn-i)/(tn-k-i)) + + for i in range(m): + e = exp(- me[i] * LN10_tenth) + lnL += log((1-e)*(float(k)/tn) + e*(1-float(k)/tn)) + + for i in range(n): + e = exp(- ne[i] * LN10_tenth) + lnL += log((1-e)*(1-float(k)/tn) + e*(float(k)/tn)) + + return lnL + + +@cython.ccall +def calculate_GQ(lnL1: cython.double, + lnL2: cython.double, + lnL3: cython.double) -> cython.int: + """GQ1 = -10*log_{10}((L2+L3)/(L1+L2+L3)) + """ + L1: cython.double + L2: cython.double + L3: cython.double + s: cython.double + tmp: cython.double + GQ_score: cython.int + + # L1 = exp(lnL1-lnL1) + L1 = 1 + L2 = exp(lnL2-lnL1) + L3 = exp(lnL3-lnL1) + + # if L1 > 1: + # L1 = 1 + + if L2 > 1: + L2 = 1 + if L3 > 1: + L3 = 1 + # if(L1<1e-110) L1=1e-110; + if L2 < 1e-110: + L2 = 1e-110 + if L3 < 1e-110: + L3 = 1e-110 + + s = L1 + L2 + L3 + tmp = (L2 + L3)/s + if tmp > 1e-110: + GQ_score = (int)(-4.34294*log(tmp)) + else: + GQ_score = 255 + + return GQ_score + + +@cython.ccall +def calculate_GQ_heterASsig(lnL1: cython.double, + lnL2: cython.double) -> cython.int: + """ + """ + L1: cython.double + L2: cython.double + s: cython.double + tmp: cython.double + ASsig_score: cython.int + + # L1=exp(2.7182818,lnL1-lnL1) + L1 = 1 + L2 = exp(lnL2 - lnL1) + + # if L1 > 1: + # L1 = 1 + if L2 > 1: + L2 = 1 + # if L1 < 1e-110: + # L1 = 1e-110 + if L2 < 1e-110: + L2 = 1e-110 + + s = L1 + L2 + tmp = L2/s + if tmp > 1e-110: + ASsig_score = (int)(-4.34294*log(tmp)) + else: + ASsig_score = 255 + + return ASsig_score diff --git a/MACS3/Signal/VariantStat.pyx b/MACS3/Signal/VariantStat.pyx deleted file mode 100644 index be2699ac..00000000 --- a/MACS3/Signal/VariantStat.pyx +++ /dev/null @@ -1,461 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2020-12-04 18:41:28 Tao Liu> - -"""Module for SAPPER BAMParser class - -Copyright (c) 2017 Tao Liu - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). - -@status: experimental -@version: $Revision$ -@author: Tao Liu -@contact: tliu4@buffalo.edu -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from cpython cimport bool - -cimport cython - -import numpy as np -cimport numpy as np - -ctypedef np.float32_t float32_t -ctypedef np.int32_t int32_t - -#from libc.math cimport log10, log, exp, M_LN10 #,fabs,log1p -#from libc.math cimport M_LN10 -from math import log1p, exp, log - -LN10 = 2.3025850929940458 -LN10_tenth = 0.23025850929940458 - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cpdef tuple CalModel_Homo( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C): - """Return (lnL, BIC). - - """ - cdef: - int i - double lnL, BIC - - lnL=0 - # Phred score is Phred = -10log_{10} E, where E is the error rate. - # to get the 1-E: 1-E = 1-exp( Phred/-10*M_LN10 ) = 1-exp( Phred * -LOG10_E_tenth ) - for i in range( top1_bq_T.shape[0] ): - lnL += log1p( -exp(-top1_bq_T[ i ]*LN10_tenth) ) - for i in range( top1_bq_C.shape[0] ): - lnL += log1p( -exp(-top1_bq_C[ i ]*LN10_tenth) ) - - for i in range( top2_bq_T.shape[0] ): - lnL += log( exp(-top2_bq_T[ i ]*LN10_tenth) ) - for i in range( top2_bq_C.shape[0] ): - lnL += log( exp(-top2_bq_C[ i ]*LN10_tenth) ) - - BIC = -2*lnL # no free variable, no penalty - return (lnL, BIC) - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cpdef tuple CalModel_Heter_noAS( np.ndarray[int32_t, ndim=1] top1_bq_T,np.ndarray[int32_t, ndim=1] top1_bq_C,np.ndarray[int32_t, ndim=1] top2_bq_T,np.ndarray[int32_t, ndim=1] top2_bq_C ): - """Return (lnL, BIC) - - k_T - k_C - """ - cdef: - int k_T, k_C - double lnL, BIC - int i - int tn_T, tn_C, tn # total observed NTs - double lnL_T, lnL_C # log likelihood for treatment and control - - lnL = 0 - BIC = 0 - #for k_T - # total oberseved treatment reads from top1 and top2 NTs - tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] - - if tn_T == 0: - raise Exception("Total number of treatment reads is 0!") - else: - ( lnL_T, k_T ) = GreedyMaxFunctionNoAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T ) - lnL += lnL_T - BIC += -2*lnL_T - - #for k_C - tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] - - if tn_C == 0: - pass - else: - ( lnL_C, k_C ) = GreedyMaxFunctionNoAS( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C ) - lnL += lnL_C - BIC += -2*lnL_C - - tn = tn_C + tn_T - - # we penalize big model depending on the number of reads/samples - if tn_T == 0: - BIC += log( tn_C ) - elif tn_C == 0: - BIC += log( tn_T ) - else: - BIC += log( tn_T ) + log( tn_C ) - - return ( lnL, BIC ) - - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cpdef tuple CalModel_Heter_AS( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C, float max_allowed_ar = 0.99 ): - """Return (lnL, BIC) - - kc - ki - AS_alleleratio - """ - cdef: - int k_T, k_C - double lnL, BIC - int i - int tn_T, tn_C, tn # total observed NTs - double lnL_T, lnL_C # log likelihood for treatment and control - double AS_alleleratio # allele ratio - - lnL = 0 - BIC = 0 - - #assert top2_bq_T.shape[0] + top2_bq_C.shape[0] > 0, "Total number of top2 nt should not be zero while using this function: CalModel_Heter_AS!" - - # Treatment - tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] - - if tn_T == 0: - raise Exception("Total number of treatment reads is 0!") - else: - ( lnL_T, k_T, AS_alleleratio ) = GreedyMaxFunctionAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T, max_allowed_ar) - #print ">>>",lnL_T, k_T, AS_alleleratio - lnL += lnL_T - BIC += -2*lnL_T - - # control - tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] - - if tn_C == 0: - pass - else: - # We assume control will not have allele preference - ( lnL_C, k_C ) = GreedyMaxFunctionNoAS ( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C) - lnL += lnL_C - BIC += -2*lnL_C - - tn = tn_C + tn_T - - # we penalize big model depending on the number of reads/samples - if tn_T == 0: - BIC += log( tn_C ) - elif tn_C == 0: - BIC += 2 * log( tn_T ) - else: - BIC += 2 * log( tn_T ) + log( tn_C ) - - return (lnL, BIC) - - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cdef tuple GreedyMaxFunctionAS( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, float max_allowed_ar = 0.99 ): - """Return lnL, k and alleleratio in tuple. - - Note: I only translate Liqing's C++ code into pyx here. Haven't done any review. - """ - cdef: - double dnew, dold, rold, rnew - int kold, knew - bool btemp - int k0 - double dl, dr, d0, d1l, d1r - - assert m+n == tn - btemp = False - if tn == 1: # only 1 read; I don't expect this to be run... - dl=calculate_ln(m,n,tn,me,ne,0,0); - dr=calculate_ln(m,n,tn,me,ne,1,1); - - if dl>dr: - k = 0 - return ( dl, 0, 0 ) - else: - k = 1 - return ( dr, 1, 1 ) - elif m == 0: #no top1 nt - return ( calculate_ln( m, n, tn, me, ne, 0, m, max_allowed_ar ), m, 1-max_allowed_ar ) - #k0 = m + 1 - elif m == tn: #all reads are top1 - return ( calculate_ln( m, n, tn, me, ne, 1, m, max_allowed_ar ), m, max_allowed_ar ) - else: - k0 = m - - d0 = calculate_ln( m, n, tn, me, ne, float(k0)/tn, k0, max_allowed_ar ) - d1l = calculate_ln( m, n, tn, me, ne, float(k0-1)/tn, k0-1, max_allowed_ar ) - d1r = calculate_ln( m, n, tn, me, ne, float(k0+1)/tn, k0+1, max_allowed_ar ) - - if d0 > d1l-1e-8 and d0 > d1r-1e-8: - k = k0 - ar = float(k0)/tn - return ( d0, k, ar ) - elif d1l > d0: - dold = d1l - kold = k0-1 - rold = float(k0-1)/tn - while kold > 1: #disable: when kold=1 still run, than knew=0 is the final run - knew = kold - 1 - rnew = float(knew)/tn - - dnew = calculate_ln( m,n,tn,me,ne,rnew,knew, max_allowed_ar ) - - if(dnew-1e-8 < dold) : - btemp=True - break - kold=knew - dold=dnew - rold=rnew - - if btemp: #maximum L value is in [1,m-1]; - k = kold - ar= rold - return ( dold, k, ar ) - else: #L(k=0) is the max for [0,m-1] - k = kold - ar = rold - return ( dold, k, ar ) - - elif d1r > d0: - dold = d1r - kold = k0 + 1 - rold = float(k0 + 1)/tn - while kold < tn - 1: #//disable: when kold=tn-1 still run, than knew=tn is the final run - knew = kold + 1 - - rnew = float(knew)/tn - - dnew = calculate_ln( m,n,tn,me,ne,rnew,knew, max_allowed_ar ) - - if dnew - 1e-8 < dold: - btemp = True - break - kold = knew - dold = dnew - rold = rnew - - if btemp: #maximum L value is in [m+1,tn-1] - k = kold - ar= rold - return ( dold, k, ar ) - else: #L(k=tn) is the max for [m+1,tn] - k = kold - ar = rold - return ( dold, k, ar ) - else: - raise Exception("error in GreedyMaxFunctionAS") - - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cdef tuple GreedyMaxFunctionNoAS (int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne ): - """Return lnL, and k in tuple. - - Note: I only translate Liqing's C++ code into pyx here. Haven't done any review. - """ - cdef: - double dnew, dold - int kold, knew - bool btemp - int k0 - double bg_r, dl, dr, d0, d1l, d1r - - btemp = False - bg_r = 0.5 - - if tn == 1: - dl = calculate_ln( m, n, tn, me, ne, bg_r, 0) - dr= calculate_ln( m, n, tn, me, ne, bg_r, 1) - if dl > dr: - k = 0 - return ( dl, 0 ) - else: - k = 1 - return ( dr, 1 ) - elif m == 0: #no top1 nt - return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m ) - #k0 = m + 1 - elif m == tn: #all reads are top1 - return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m ) - #elif m == 0: - # k0 = m + 1 - #elif m == tn: - # k0 = m - 1 - else: - k0 = m - - d0 = calculate_ln( m, n, tn, me, ne, bg_r, k0) - d1l = calculate_ln( m, n, tn, me, ne, bg_r, k0 - 1) - d1r = calculate_ln( m, n, tn, me, ne, bg_r, k0 + 1) - - if d0 > d1l - 1e-8 and d0 > d1r - 1e-8: - k = k0 - return ( d0, k ) - elif d1l > d0: - dold = d1l - kold=k0 - 1 - while kold >= 1: #//when kold=1 still run, than knew=0 is the final run - knew = kold - 1 - dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew ) - if dnew - 1e-8 < dold: - btemp = True - break - kold=knew - dold=dnew - - if btemp: #//maximum L value is in [1,m-1]; - k = kold - return ( dold, k ) - else: #//L(k=0) is the max for [0,m-1] - k = kold - return ( dold, k ) - elif d1r > d0: - dold = d1r - kold = k0 + 1 - while kold <= tn - 1: #//when kold=tn-1 still run, than knew=tn is the final run - knew = kold + 1 - dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew ) - if dnew - 1e-8 < dold: - btemp = True - break - kold = knew - dold = dnew - - if btemp: #//maximum L value is in [m+1,tn-1] - k = kold - return ( dold, k ) - else: #//L(k=tn) is the max for [m+1,tn] - k = kold - return ( dold, k ) - else: - raise Exception("error in GreedyMaxFunctionNoAS") - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cdef calculate_ln( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, double r, int k, float max_allowed_r = 0.99): - """Calculate log likelihood given quality of top1 and top2, the ratio r and the observed k. - - """ - cdef: - int i - double lnL - double e - - lnL = 0 - - if r > max_allowed_r or r < 1 - max_allowed_r: # r is extremely high or - #print "here1" - lnL += k*log( max_allowed_r ) + (tn-k)*log( 1- max_allowed_r) #-10000 - else: - lnL += k*log( r ) + (tn-k)*log(1-r) - - if k == 0 or k == tn: # it's entirely biased toward 1 allele - #print "here2" - pass - #lnL += k*log( max_allowed_r ) #-10000 - #lnL += -10000 - elif k <= tn/2: - for i in range( k ): - lnL += log(float(tn-i)/(k-i)) - else: - for i in range( tn-k ): - lnL += log(float(tn-i)/(tn-k-i)) - - for i in range( m ): - e = exp( - me[ i ] * LN10_tenth ) - lnL += log((1-e)*(float(k)/tn) + e*(1-float(k)/tn)) - - for i in range( n ): - e = exp( - ne[ i ] * LN10_tenth ) - lnL += log((1-e)*(1-float(k)/tn) + e*(float(k)/tn)) - - #print r,k,lnL - return lnL - -cpdef int calculate_GQ ( double lnL1, double lnL2, double lnL3): - """GQ1 = -10*log_{10}((L2+L3)/(L1+L2+L3)) - - - """ - cdef: - double L1, L2, L3, sum, tmp - int GQ_score - - #L1 = exp(lnL1-lnL1) - L1 = 1 - L2 = exp(lnL2-lnL1) - L3 = exp(lnL3-lnL1) - - #if L1 > 1: - # L1 = 1 - - if L2 > 1: - L2 = 1 - if L3 > 1: - L3 = 1 - #if(L1<1e-110) L1=1e-110; - if L2 < 1e-110: - L2=1e-110 - if L3 < 1e-110: - L3 = 1e-110 - - sum = L1 + L2 + L3 - tmp = ( L2 + L3 )/sum - if tmp > 1e-110: - GQ_score = (int)(-4.34294*log(tmp)) - else: - GQ_score = 255 - - return GQ_score; - -cpdef int calculate_GQ_heterASsig( double lnL1, double lnL2): - """ - """ - cdef: - double L1, L2, sum, tmp - int ASsig_score - - #L1=exp(2.7182818,lnL1-lnL1) - L1 = 1 - L2 = exp( lnL2 - lnL1 ) - - #if L1 > 1: - # L1 = 1 - if L2 > 1: - L2 = 1 - #if L1 < 1e-110: - # L1 = 1e-110 - if L2 < 1e-110: - L2 = 1e-110 - - sum = L1 + L2 - tmp = L2/sum - if tmp > 1e-110: - ASsig_score = (int)(-4.34294*log(tmp)) - else: - ASsig_score = 255 - - return ASsig_score - diff --git a/setup.py b/setup.py index 7248fb2b..96a3f543 100644 --- a/setup.py +++ b/setup.py @@ -139,17 +139,14 @@ def main(): include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.VariantStat", - ["MACS3/Signal/VariantStat.pyx"], - libraries=["m"], + ["MACS3/Signal/VariantStat.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.ReadAlignment", - ["MACS3/Signal/ReadAlignment.pyx"], - libraries=["m"], - include_dirs=numpy_include_dir, + ["MACS3/Signal/ReadAlignment.py"], extra_compile_args=extra_c_args), Extension("MACS3.Signal.RACollection", - ["MACS3/Signal/RACollection.pyx", + ["MACS3/Signal/RACollection.py", "MACS3/fermi-lite/bfc.c", "MACS3/fermi-lite/bseq.c", "MACS3/fermi-lite/bubble.c", @@ -165,24 +162,19 @@ def main(): "MACS3/fermi-lite/unitig.c", "MACS3/Signal/swalign.c"], libraries=["m", "z"], - include_dirs=numpy_include_dir+["./", - "./MACS3/fermi-lite/", - "./MACS3/Signal/"], + include_dirs=["./", + "./MACS3/fermi-lite/", + "./MACS3/Signal/"], extra_compile_args=extra_c_args+extra_c_args_for_fermi), Extension("MACS3.Signal.UnitigRACollection", - ["MACS3/Signal/UnitigRACollection.pyx"], - libraries=["m"], - include_dirs=numpy_include_dir, + ["MACS3/Signal/UnitigRACollection.py"], extra_compile_args=extra_c_args), Extension("MACS3.Signal.PosReadsInfo", - ["MACS3/Signal/PosReadsInfo.pyx"], - libraries=["m"], + ["MACS3/Signal/PosReadsInfo.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.PeakVariants", - ["MACS3/Signal/PeakVariants.pyx"], - libraries=["m"], - include_dirs=numpy_include_dir, + ["MACS3/Signal/PeakVariants.py"], extra_compile_args=extra_c_args), Extension("MACS3.IO.Parser", ["MACS3/IO/Parser.py"],