diff --git a/ChangeLog b/ChangeLog index 31b2c22f..dc01e5e0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,5 @@ -2024-11-30 Tao Liu - MACS 3.0.3b1 +2025-02-12 Tao Liu + MACS 3.0.3 * Features added @@ -19,20 +19,35 @@ source codes can be more compatible to Python programming tools such as `flake8`. During rewritting, we cleaned the source codes even more, and removed unnecessary dependencies during - compilation. + compilation. We will continue to do more code cleaning in the + future. + + 3) We changed the behavior on the usage of 'blacklist' regions in + `hmmratac`. We will remove the aligned fragments located in the + 'blacklist' regions before the EM step to estimate fragment + length distributions and HMM step to learn and predict nucleosome + states. The reason is discussed in #680. To implement this + feature, we added the `exclude` functions to PETrackI and + PETrackII. * Bug fixed - 1) Fix issues in big-endian system in `Parser.py` codes. Enable - big-endian support in `BAM.py` codes for accessig certain - alignment records that overlap with givin genomic - coordinates using BAM/BAI files. + 1) `hmmratac` option `--keep-duplicate` had opposite effect + previously as indicated by the name and description. It has been + renamed as `--remove-dup` to reflect the actual + behavior. `hmmratac` will not remove duplicated fragments unless + this option is set. 2) `hmmratac`: wrong class name used while saving digested signals in BedGraph files. Multiple other issues related to output filenames. #682 - 3) `predictd` and `filterdup`: wrong variable name used while + 3) Fix issues in big-endian system in `Parser.py` codes. Enable + big-endian support in `BAM.py` codes for accessing certain + alignment records that overlap with given genomic + coordinates using BAM/BAI files. + + 4) `predictd` and `filterdup`: wrong variable name used while reading multiple pe/frag files. 
* Doc diff --git a/MACS3/Commands/hmmratac_cmd.py b/MACS3/Commands/hmmratac_cmd.py index 846edbde..2b9a2b75 100644 --- a/MACS3/Commands/hmmratac_cmd.py +++ b/MACS3/Commands/hmmratac_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2025-02-05 12:38:55 Tao Liu> +# Time-stamp: <2025-02-12 14:30:02 Tao Liu> """Description: Main HMMR command @@ -118,9 +118,17 @@ def run(args): # remember to finalize the petrack petrack.finalize() + options.info(f"# Read {petrack.total} fragments.") + # filter duplicates if needed - if options.misc_keep_duplicates: + if options.misc_remove_duplicates: + options.info("# Removing duplicated fragments...") + # by default, we keep all duplicates + before_total = petrack.total petrack.filter_dup(maxnum=1) + removed_n = before_total - petrack.total + options.info(f"# We removed {removed_n} duplicated fragments.") + options.info(f"# There are {petrack.total} fragments left.") # read in blacklisted if option entered if options.blacklist: @@ -139,6 +147,14 @@ def run(args): blacklist_regions = Regions() blacklist_regions.init_from_PeakIO(blacklist) + # remove fragments aligned in the blacklisted regions + + before_total = petrack.total + petrack.exclude(blacklist_regions) + removed_n = before_total - petrack.total + options.info(f"# We removed {removed_n} fragments overlapping with blacklisted regions.") + options.info(f"# There are {petrack.total} fragments left.") + ############################################# # 2. 
EM ############################################# @@ -287,10 +303,6 @@ def run(args): training_regions.expand(options.hmm_training_flanking) training_regions.merge_overlap() - # remove peaks overlapping with blacklisted regions - if options.blacklist: - training_regions.exclude(blacklist_regions) - options.info(f"# after removing those overlapping with provided blacklisted regions, we have {training_regions.total} left") if options.save_train: fhd = open(training_region_bedfile, "w") training_regions.write_to_bed(fhd) @@ -409,11 +421,6 @@ def run(args): candidate_regions.merge_overlap() options.info(f"# after expanding and merging, we have {candidate_regions.total} candidate regions") - # remove peaks overlapping with blacklisted regions - if options.blacklist: - candidate_regions.exclude(blacklist_regions) - options.info(f"# after removing those overlapping with provided blacklisted regions, we have {candidate_regions.total} left") - # extract signals options.info("# Extract signals in candidate regions and decode with HMM") # we will do the extraction and prediction in a step of 10000 regions by default diff --git a/MACS3/Signal/PairedEndTrack.py b/MACS3/Signal/PairedEndTrack.py index b438b1d3..ab81b827 100644 --- a/MACS3/Signal/PairedEndTrack.py +++ b/MACS3/Signal/PairedEndTrack.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2025-02-05 12:40:47 Tao Liu> +# Time-stamp: <2025-02-12 13:03:49 Tao Liu> """Module for filter duplicate tags from paired-end data @@ -27,6 +27,7 @@ bedGraphTrackII) from MACS3.Signal.PileupV2 import (pileup_from_LR_hmmratac, pileup_from_LRC) +from MACS3.Signal.Region import Regions # ------------------------------------ # Other modules # ------------------------------------ @@ -76,7 +77,7 @@ def __init__(self, anno: str = "", buffer_size: cython.long = 100000): # [('l','i4'),('r','i4')] as value self.locations = {} # dictionary with chrname as key, size of the above nparray as value - # size is to 
remember the size of the fragments added to this chromosome + # size is to remember the number of the fragments added to this chromosome self.size = {} # dictionary with chrname as key, size of the above nparray as value self.buf_size = {} @@ -267,6 +268,104 @@ def fraglengths(self) -> cnp.ndarray: sizes = np.concatenate((sizes, locs['r'] - locs['l'])) return sizes + @cython.boundscheck(False) # do not check that np indices are valid + @cython.ccall + def exclude(self, regions): + """Exclude reads overlapping with anything in the regions. + + Run it right after you add all data into this object. + """ + i: cython.ulong + k: bytes + locs: cnp.ndarray + locs_size: cython.ulong + chrnames: set + regions_c: list + selected_idx: cnp.ndarray + regions_chrs: list + r1: cnp.void # this is the location in numpy.void -- like a tuple + r2: tuple # this is the region + n_rl1: cython.long + n_rl2: cython.long + + if not self.is_sorted: + self.sort() + + assert isinstance(regions, Regions) + regions.sort() + regions_chrs = list(regions.regions.keys()) + + chrnames = self.get_chr_names() + + for k in chrnames: # for each chromosome + locs = self.locations[k] + locs_size = self.size[k] + # let's check if k is in regions_chr + if k not in regions_chrs: + # do nothing and continue + self.total += locs_size + continue + + # discard overlapping reads and make a new locations[k] + # initialize boolean array as all TRUE, or all being kept + selected_idx = np.ones(locs_size, dtype=bool) + + regions_c = regions.regions[k] + + i = 0 + n_rl1 = locs_size # the number of locations left + n_rl2 = len(regions_c) # the number of regions left + rl1_k = iter(locs).__next__ + rl2_k = iter(regions_c).__next__ + r1 = rl1_k() # take the first value + n_rl1 -= 1 # remaining rl1 + r2 = rl2_k() + n_rl2 -= 1 # remaining rl2 + while (True): + # we do this until there is no r1 or r2 left. 
+ if r2[0] < r1[1] and r1[0] < r2[1]: + # since we found an overlap, r1 will be skipped/excluded + # and move to the next r1 + # get rid of this one + n_rl1 -= 1 + self.length -= r1[1] - r1[0] + selected_idx[i] = False + + if n_rl1 > 0: + r1 = rl1_k() # take the next location + i += 1 + continue + else: + break + if r1[1] < r2[1]: + # in this case, we need to move to the next r1, + n_rl1 -= 1 + if n_rl1 > 0: + r1 = rl1_k() # take the next location + i += 1 + else: + # no more r1 left + break + else: + # in this case, we need to move the next r2 + if n_rl2: + r2 = rl2_k() # take the next region + n_rl2 -= 1 + else: + # no more r2 left + break + + self.locations[k] = locs[selected_idx] + self.size[k] = self.locations[k].shape[0] + # free memory? + # I know I should shrink it to 0 size directly, + # however, on Mac OSX, it seems directly assigning 0 + # doesn't do a thing. + selected_idx.resize(self.buffer_size, refcheck=False) + selected_idx.resize(0, refcheck=False) + self.finalize() # use length to set average_template_length and total + return + @cython.boundscheck(False) # do not check that np indices are valid @cython.ccall def filter_dup(self, maxnum: cython.int = -1): @@ -300,9 +399,10 @@ def filter_dup(self, maxnum: cython.int = -1): for k in chrnames: # for each chromosome locs = self.locations[k] - locs_size = locs.shape[0] + locs_size = self.size[k] if locs_size == 1: # do nothing and continue + self.total += locs_size continue # discard duplicate reads and make a new locations[k] # initialize boolean array as all TRUE, or all being kept @@ -340,9 +440,14 @@ def filter_dup(self, maxnum: cython.int = -1): return @cython.ccall - def sample_percent(self, percent: cython.float, seed: cython.int = -1): + def sample_percent(self, + percent: cython.float, + seed: cython.int = -1): """Sample the tags for a given percentage. + Note that the sampling is on each chromosome using the given + percentage. + Warning: the current object is changed! 
If a new PETrackI is wanted, use sample_percent_copy instead. @@ -381,8 +486,14 @@ def sample_percent(self, percent: cython.float, seed: cython.int = -1): return @cython.ccall - def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1): - """Sample the tags for a given percentage. Return a new PETrackI object + def sample_percent_copy(self, + percent: cython.float, + seed: cython.int = -1): + """Sample the tags for a given percentage. Return a new + PETrackI object + + Note that the sampling is on each chromosome using the given + percentage. """ # num: number of reads allowed on a certain chromosome @@ -392,7 +503,8 @@ def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1): ret_petrackI: PETrackI loc: cnp.ndarray - ret_petrackI = PETrackI(anno=self.annotation, buffer_size=self.buffer_size) + ret_petrackI = PETrackI(anno=self.annotation, + buffer_size=self.buffer_size) chrnames = self.get_chr_names() if seed >= 0: @@ -421,9 +533,14 @@ def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1): return ret_petrackI @cython.ccall - def sample_num(self, samplesize: cython.ulong, seed: cython.int = -1): + def sample_num(self, + samplesize: cython.ulong, + seed: cython.int = -1): """Sample the tags for a given number. + Note that the sampling is on each chromosome using the the + percentage calculated from the given number. + Warning: the current object is changed! """ percent: cython.float @@ -433,9 +550,14 @@ def sample_num(self, samplesize: cython.ulong, seed: cython.int = -1): return @cython.ccall - def sample_num_copy(self, samplesize: cython.ulong, seed: cython.int = -1): + def sample_num_copy(self, + samplesize: cython.ulong, + seed: cython.int = -1): """Sample the tags for a given number. + Note that the sampling is on each chromosome using the the + percentage calculated from the given number. + Warning: the current object is changed! 
""" percent: cython.float @@ -593,7 +715,7 @@ def pileup_bdg_hmmr(self, This is specifically designed for hmmratac HMM_SignalProcessing.py. Not a general function. - + The idea is that for each fragment length, we generate four bdg using four weights from four distributions. Then we add all sets of four bdgs together. @@ -928,9 +1050,9 @@ def subset(self, selected_barcodes: set): ret.is_sorted = self.is_sorted ret.rlengths = self.rlengths ret.buffer_size = self.buffer_size - ret.total = 0 + # ret.total = 0 ret.length = 0 - ret.average_template_length = 0 + # ret.average_template_length = 0 ret.is_destroyed = True for chromosome in sorted(chrs): @@ -940,11 +1062,12 @@ def subset(self, selected_barcodes: set): ret.locations[chromosome] = self.locations[chromosome][indices] ret.size[chromosome] = len(ret.locations[chromosome]) ret.buf_size[chromosome] = ret.size[chromosome] - ret.total += np.sum(ret.locations[chromosome]['c']) + # ret.total += np.sum(ret.locations[chromosome]['c']) ret.length += np.sum((ret.locations[chromosome]['r'] - ret.locations[chromosome]['l']) * ret.locations[chromosome]['c']) - ret.average_template_length = ret.length / ret.total + ret.finalize() + # ret.average_template_length = ret.length / ret.total return ret @cython.ccall @@ -982,3 +1105,101 @@ def pileup_bdg2(self): # bedGraphTrackII needs to be 'finalized'. bdg.finalize() return bdg + + @cython.boundscheck(False) # do not check that np indices are valid + @cython.ccall + def exclude(self, regions): + """Exclude reads overlapping with anything in the regions. + + Run it right after you add all data into this object. 
+ """ + i: cython.ulong + k: bytes + locs: cnp.ndarray + locs_size: cython.ulong + chrnames: set + regions_c: list + selected_idx: cnp.ndarray + regions_chrs: list + r1: cnp.void + r2: tuple + n_rl1: cython.long + n_rl2: cython.long + + if not self.is_sorted: + self.sort() + + assert isinstance(regions, Regions) + regions.sort() + regions_chrs = list(regions.regions.keys()) + + chrnames = self.get_chr_names() + + for k in chrnames: # for each chromosome + locs = self.locations[k] + locs_size = self.size[k] + # let's check if k is in regions_chr + if k not in regions_chrs: + # do nothing and continue + continue + + # discard overlapping reads and make a new locations[k] + # initialize boolean array as all TRUE, or all being kept + selected_idx = np.ones(locs_size, dtype=bool) + + regions_c = regions.regions[k] + + i = 0 + n_rl1 = len(locs) + n_rl2 = len(regions_c) + rl1_k = iter(locs).__next__ + rl2_k = iter(regions_c).__next__ + r1 = rl1_k() + n_rl1 -= 1 # remaining rl1 + r2 = rl2_k() + n_rl2 -= 1 # remaining rl2 + while (True): + # we do this until there is no r1 or r2 left. + if r2[0] < r1[1] and r1[0] < r2[1]: + # since we found an overlap, r1 will be skipped/excluded + # and move to the next r1 + # get rid of this one + n_rl1 -= 1 + self.length -= (r1[1] - r1[0])*r1[2] + selected_idx[i] = False + + if n_rl1 >= 0: + r1 = rl1_k() + i += 1 + continue + else: + break + if r1[1] < r2[1]: + # in this case, we need to move to the next r1, + n_rl1 -= 1 + if n_rl1 >= 0: + r1 = rl1_k() + i += 1 + else: + # no more r1 left + break + else: + # in this case, we need to move the next r2 + if n_rl2: + r2 = rl2_k() + n_rl2 -= 1 + else: + # no more r2 left + break + + self.locations[k] = locs[selected_idx] + self.barcodes[k] = self.barcodes[k][selected_idx] + self.size[k] = self.locations[k].shape[0] + # free memory? + # I know I should shrink it to 0 size directly, + # however, on Mac OSX, it seems directly assigning 0 + # doesn't do a thing. 
+ selected_idx.resize(self.buffer_size, refcheck=False) + selected_idx.resize(0, refcheck=False) + self.finalize() + return diff --git a/MACS3/Signal/Region.py b/MACS3/Signal/Region.py index 85d97ae4..fecb330d 100644 --- a/MACS3/Signal/Region.py +++ b/MACS3/Signal/Region.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-04 16:24:07 Tao Liu> +# Time-stamp: <2025-02-05 18:11:50 Tao Liu> """Module for Region classe. diff --git a/MACS3/Utilities/OptValidator.py b/MACS3/Utilities/OptValidator.py index 1ae98597..66bdfbe5 100644 --- a/MACS3/Utilities/OptValidator.py +++ b/MACS3/Utilities/OptValidator.py @@ -1,4 +1,4 @@ -# Time-stamp: <2025-02-05 12:41:15 Tao Liu> +# Time-stamp: <2025-02-12 14:12:21 Tao Liu> """Module Description This code is free software; you can redistribute it and/or modify it @@ -792,36 +792,8 @@ def opt_validate_hmmratac(options): logger.error(" In order to use -c or --prescan-cutoff, the cutoff must be larger than 1.") sys.exit(1) - if options.openregion_minlen < 0: # and options.store_peaks == True: + if options.openregion_minlen < 0: # and options.store_peaks == True: logger.error(" In order to use --minlen, the length should not be negative.") sys.exit(1) - #if options.call_score.lower() not in [ 'max', 'ave', 'med', 'fc', 'zscore', 'all']: - # logger.error(" Invalid method: %s" % options.call_score) - # sys.exit(1) - - # call_threshold non-negative - #if options.call_threshold <0: - # logger.error(" --threshold should not be negative! ") - # sys.exit(1) - - # Misc - # misc_blacklist - #if options.misc_keep_duplicates: - # options.argtxt += "# Duplicate reads from analysis will be stored. \n" - - # misc_trim non-negative - #if options.misc_trim <0: - # logger.error(" --trim should not be negative! ") - # sys.exit(1) - - # np # should this be mp? non-negative - #if options.np <0: - # logger.error(" -m, --multiple-processing should not be negative! 
") - # sys.exit(1) - - # min_map_quality non-negative - #if options.min_map_quality <0: - # logger.error(" -q, --minmapq should not be negative! ") - # sys.exit(1) return options diff --git a/bin/macs3 b/bin/macs3 index 824be5f7..116094db 100644 --- a/bin/macs3 +++ b/bin/macs3 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2025-02-05 12:43:00 Tao Liu> +# Time-stamp: <2025-02-12 15:38:21 Tao Liu> """Description: MACS v3 main executable. @@ -987,12 +987,12 @@ plus an extra option for the HMM model file like `macs3 hmmratac group_misc.add_argument("--randomSeed", dest="hmm_randomSeed", type=int, help="Seed to set for random sampling of training regions. Default: 10151", default=10151) - group_misc.add_argument("--decoding-steps", dest="decoding_steps", type=int, default=1000, + group_misc.add_argument("--decoding-steps", dest="decoding_steps", type=int, default=1000, help="Number of candidate regions to be decoded at a time. The HMM model will be applied with Viterbi to find the optimal state path in each region. bigger the number, 'possibly' faster the decoding process, 'definitely' larger the memory usage. Default: 1000.") - group_misc.add_argument("-e", "--blacklist", dest="blacklist", type=str, required=False, - help="Filename of blacklisted regions to exclude (previously was BED_file). Examples are those from ENCODE. Default: NA") - group_misc.add_argument("--keep-duplicates", dest="misc_keep_duplicates", action="store_true", - help="Keep duplicate reads from analysis. By default, duplicate reads will be removed. Default: False", + group_misc.add_argument("-e", "--blacklist", dest="blacklist", type=str, required=False, + help="Filename of blacklisted regions to exclude. Fragments aligned to such regions will be excluded from analysis. Examples are those from ENCODE. Default: None") + group_misc.add_argument("--remove-dup", dest="misc_remove_duplicates", action="store_true", + help="Remove duplicated fragments from analysis. 
A fragment is duplicated if both ends of the fragment are the same as another fragment. By default, duplicated fragments won't be excluded. Default: False", default=False) # group_misc.add_argument("--trim", dest="misc_trim", type=int, # help="How many signals from the end to trim off (ie starting with tri signal then di etc). This may be useful if your data doesn't contain many large fragments. Default: 0", diff --git a/docs/source/docs/hmmratac.md b/docs/source/docs/hmmratac.md index 9e2b9c42..7592f621 100644 --- a/docs/source/docs/hmmratac.md +++ b/docs/source/docs/hmmratac.md @@ -99,10 +99,13 @@ output file names. DEFAULT: "NA" ### `-e BLACKLIST`/`--blacklist BLACKLIST` -Filename of blacklisted regions to exclude from training and peak -detection. An example of such file can be found from [ENCODE -project](https://github.com/Boyle-Lab/Blacklist/). By default, there -is no blacklist file. +Filename of the file containing the blacklisted regions to exclude +from the process. Any fragments overlapping with blacklisted regions +are excluded. An example of such file can be found from the ENCODE +project at: https://github.com/Boyle-Lab/Blacklist/. Alternatively, if +you wish to exclude centromeres and telomeres, you can find their +genomic coordinates and write them to a BED format file. By default, +there is no blacklist file in use. 
### `--modelonly` diff --git a/test/test_BedGraph.py b/test/test_BedGraph.py index b40f80f3..ee9971d6 100644 --- a/test/test_BedGraph.py +++ b/test/test_BedGraph.py @@ -1,80 +1,63 @@ -#!/usr/bin/env python -# Time-stamp: <2024-05-15 19:31:36 Tao Liu> +# Time-stamp: <2025-02-10 08:46:48 Tao Liu> import pytest -from MACS3.Signal.BedGraph import * +from MACS3.Signal.BedGraph import bedGraphTrackI from MACS3.IO.PeakIO import PeakIO -test_regions1 = [(b"chrY",0,10593155,0.0), - (b"chrY",10593155,10597655,0.0066254580149)] -overlie_max = [(b"chrY", 0, 75, 20.0), # max - (b"chrY", 75, 85, 35.0), - (b"chrY", 85, 90, 75.0), - (b"chrY", 90,150, 10.0), - (b"chrY",150,155, 0.0)] -overlie_mean = [(b"chrY", 0, 70, 6.66667), # mean - (b"chrY", 70, 75, 9), - (b"chrY", 75, 80, 14), - (b"chrY", 80, 85, 11.66667), - (b"chrY", 85, 90, 36.66667), - (b"chrY", 90,150, 3.33333), - (b"chrY",150,155, 0.0)] -overlie_fisher = [(b"chrY", 0, 70, (0,0,20), 92.10340371976183, 1.1074313239555578e-17, 16.9557 ), # fisher - (b"chrY", 70, 75, (7,0,20), 124.33959502167846, 1.9957116587802055e-24, 23.6999 ), - (b"chrY", 75, 80, (7,0,35), 193.41714781149986, 4.773982707347631e-39, 38.3211 ), - (b"chrY", 80, 85, (0,0,35), 161.1809565095832, 3.329003070922764e-32, 31.4777), - (b"chrY", 85, 90, (0,75,35), 506.56872045869005, 3.233076792862357e-106, 105.4904), - (b"chrY", 90,150, (0,0,10), 46.051701859880914, 2.8912075645386016e-08, 7.5389), - (b"chrY",150,155, (0,0,0), 0, 1.0, 0.0)] - +test_regions1 = [(b"chrY", 0, 10593155, 0.0), + (b"chrY", 10593155, 10597655, 0.0066254580149)] +overlie_max = [(b"chrY", 0, 75, 20.0), + (b"chrY", 75, 85, 35.0), + (b"chrY", 85, 90, 75.0), + (b"chrY", 90, 150, 10.0), + (b"chrY", 150, 155, 0.0)] +overlie_mean = [(b"chrY", 0, 70, 6.66667), + (b"chrY", 70, 75, 9.0), + (b"chrY", 75, 80, 14.0), + (b"chrY", 80, 85, 11.66667), + (b"chrY", 85, 90, 36.66667), + (b"chrY", 90, 150, 3.33333), + (b"chrY", 150, 155, 0.0)] +overlie_fisher = [(b"chrY", 0, 70, (0, 0, 20), 92.10340371976183, 
1.1074313239555578e-17, 16.9557), + (b"chrY", 70, 75, (7, 0, 20), 124.33959502167846, 1.9957116587802055e-24, 23.6999), + (b"chrY", 75, 80, (7, 0, 35), 193.41714781149986, 4.773982707347631e-39, 38.3211), + (b"chrY", 80, 85, (0, 0, 35), 161.1809565095832, 3.329003070922764e-32, 31.4777), + (b"chrY", 85, 90, (0, 75, 35), 506.56872045869005, 3.233076792862357e-106, 105.4904), + (b"chrY", 90, 150, (0, 0, 10), 46.051701859880914, 2.8912075645386016e-08, 7.5389), + (b"chrY", 150, 155, (0, 0, 0), 0.0, 1.0, 0.0)] some_peak = [(b"chrY", 50, 80), (b"chrY", 95, 152)] + @pytest.fixture def define_regions(): - test_regions1 = [(b"chrY",0,70,0.00), - (b"chrY",70,80,7.00), - (b"chrY",80,150,0.00)] - test_regions2 = [(b"chrY",0,85,0.00), - (b"chrY",85,90,75.00), - (b"chrY",90,155,0.00)] - test_regions3 = [(b"chrY",0,75,20.0), - (b"chrY",75,90,35.0), - (b"chrY",90,150,10.0)] + test_regions1 = [(b"chrY", 0, 70, 0.0), + (b"chrY", 70, 80, 7.0), + (b"chrY", 80, 150, 0.0)] + test_regions2 = [(b"chrY", 0, 85, 0.0), + (b"chrY", 85, 90, 75.0), + (b"chrY", 90, 155, 0.0)] + test_regions3 = [(b"chrY", 0, 75, 20.0), + (b"chrY", 75, 90, 35.0), + (b"chrY", 90, 150, 10.0)] bdg1 = bedGraphTrackI() bdg2 = bedGraphTrackI() bdg3 = bedGraphTrackI() for a in test_regions1: - bdg1.add_loc(a[0],a[1],a[2],a[3]) - + bdg1.add_loc(a[0], a[1], a[2], a[3]) for a in test_regions2: - bdg2.add_loc(a[0],a[1],a[2],a[3]) - + bdg2.add_loc(a[0], a[1], a[2], a[3]) for a in test_regions3: - bdg3.add_loc(a[0],a[1],a[2],a[3]) - + bdg3.add_loc(a[0], a[1], a[2], a[3]) return (bdg1, bdg2, bdg3) -def test_add_loc1( ): + +def test_add_loc1(): bdg = bedGraphTrackI() for a in test_regions1: - bdg.add_loc(a[0],a[1],a[2],a[3]) - -def test_overlie_max( define_regions ): - (bdg1,bdg2,bdg3) = define_regions - bdgb = bdg1.overlie([bdg2,bdg3], func="max") + bdg.add_loc(a[0], a[1], a[2], a[3]) - chrom = b"chrY" - (p,v) = bdgb.get_data_by_chr(chrom) - pre = 0 - for i in range(len(p)): - pos = p[i] - value = v[i] - assert pre == 
pytest.approx( overlie_max[i][1] ) - assert pos == pytest.approx( overlie_max[i][2] ) - assert value == pytest.approx( overlie_max[i][3] ) - pre = pos def test_refine_peak(): bdg = bedGraphTrackI() @@ -82,37 +65,8 @@ def test_refine_peak(): bdg.add_loc(a[0], a[1], a[2], a[3]) peak = PeakIO() for a in some_peak: - peak.add( a[0], a[1], a[2]) - new_peak = bdg.refine_peaks( peak ) + peak.add(a[0], a[1], a[2]) + new_peak = bdg.refine_peaks(peak) out = str(new_peak) std = "chrom:chrY\tstart:50\tend:80\tname:peak_1\tscore:14\tsummit:77\nchrom:chrY\tstart:95\tend:152\tname:peak_2\tscore:3.33333\tsummit:122\n" assert out == std - - # def test_overlie_mean(self): - # bdgb = self.bdg1.overlie([self.bdg2,self.bdg3], func="mean") - - # chrom = b"chrY" - # (p,v) = bdgb.get_data_by_chr(chrom) - # pre = 0 - # for i in range(len(p)): - # pos = p[i] - # value = v[i] - # self.assertEqual_float( self.test_overlie_mean[i][1], pre ) - # self.assertEqual_float( self.test_overlie_mean[i][2], pos ) - # self.assertEqual_float( self.test_overlie_mean[i][3], value ) - # pre = pos - - # def test_overlie_fisher(self): - # bdgb = self.bdg1.overlie([self.bdg2,self.bdg3], func="fisher") - - # chrom = b"chrY" - # (p,v) = bdgb.get_data_by_chr(chrom) - # pre = 0 - # for i in range(len(p)): - # pos = p[i] - # value = v[i] - # self.assertEqual_float( self.test_overlie_fisher[i][1], pre ) - # self.assertEqual_float( self.test_overlie_fisher[i][2], pos ) - # self.assertEqual_float( self.test_overlie_fisher[i][6], value ) - # pre = pos - diff --git a/test/test_PairedEndTrack.py b/test/test_PairedEndTrack.py index 9779feb3..a4ec5f3f 100644 --- a/test/test_PairedEndTrack.py +++ b/test/test_PairedEndTrack.py @@ -1,183 +1,183 @@ #!/usr/bin/env python -# Time-stamp: <2025-02-05 09:28:39 Tao Liu> +# Time-stamp: <2025-02-12 13:44:51 Tao Liu> import unittest from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII +from MACS3.Signal.Region import Regions import numpy as np class 
Test_PETrackI(unittest.TestCase): def setUp(self): - self.input_regions = [(b"chrY", 0, 100), - (b"chrY", 70, 270), - (b"chrY", 70, 100), - (b"chrY", 80, 160), - (b"chrY", 80, 160), - (b"chrY", 80, 180), - (b"chrY", 80, 180), - (b"chrY", 85, 185), - (b"chrY", 85, 285), - (b"chrY", 85, 285), - (b"chrY", 85, 285), - (b"chrY", 85, 385), - (b"chrY", 90, 190), - (b"chrY", 90, 190), - (b"chrY", 90, 191), - (b"chrY", 150, 190), - (b"chrY", 150, 250), + self.input_regions = [(b"chrY", 0, 100), # 17 tags in chrY + (b"chrY", 70, 270), # will exclude + (b"chrY", 70, 100), # will exclude + (b"chrY", 80, 160), # will exclude + (b"chrY", 80, 160), # will exclude + (b"chrY", 80, 180), # will exclude + (b"chrY", 80, 180), # will exclude + (b"chrY", 85, 185), # will exclude + (b"chrY", 85, 285), # will exclude + (b"chrY", 85, 285), # will exclude + (b"chrY", 85, 285), # will exclude + (b"chrY", 85, 385), # will exclude + (b"chrY", 90, 190), # will exclude + (b"chrY", 90, 190), # will exclude + (b"chrY", 90, 191), # will exclude + (b"chrY", 150, 190), # will exclude + (b"chrY", 150, 250), # will exclude + (b"chrX", 0, 100), # 9 tags in chrX + (b"chrX", 70, 270), # will exclude + (b"chrX", 70, 100), # will exclude + (b"chrX", 80, 160), + (b"chrX", 80, 180), + (b"chrX", 85, 185), + (b"chrX", 85, 285), + (b"chrX", 90, 190), + (b"chrX", 90, 191) ] self.t = sum([x[2]-x[1] for x in self.input_regions]) - def test_add_loc(self): - pe = PETrackI() - for (c, l, r) in self.input_regions: - pe.add_loc(c, l, r) - pe.finalize() - # roughly check the numbers... - self.assertEqual(pe.total, 17) - self.assertEqual(pe.length, self.t) + self.exclude_regions = [(b"chrY", 85, 200), (b"chrX", 50, 75)] + self.x_regions = Regions() + for r in self.exclude_regions: + self.x_regions.add_loc(r[0], r[1], r[2]) - def test_filter_dup(self): - pe = PETrackI() + self.pe = PETrackI() for (c, l, r) in self.input_regions: - pe.add_loc(c, l, r) - pe.finalize() - # roughly check the numbers... 
- self.assertEqual(pe.total, 17) - self.assertEqual(pe.length, self.t) + self.pe.add_loc(c, l, r) + self.pe.finalize() - # filter out more than 3 tags - pe.filter_dup(3) - self.assertEqual(pe.total, 17) + def test_add_loc(self): + self.assertEqual(self.pe.total, 26) + self.assertEqual(self.pe.length, self.t) + + def test_filter_dup(self): + self.pe.filter_dup(3) + self.assertEqual(self.pe.total, 26) - # filter out more than 2 tags - pe.filter_dup(2) - self.assertEqual(pe.total, 16) + self.pe.filter_dup(2) + self.assertEqual(self.pe.total, 25) - # filter out more than 1 tag - pe.filter_dup(1) - self.assertEqual(pe.total, 12) + self.pe.filter_dup(1) + self.assertEqual(self.pe.total, 21) def test_sample_num(self): - pe = PETrackI() - for (c, l, r) in self.input_regions: - pe.add_loc(c, l, r) - pe.finalize() - pe.sample_num(10) - self.assertEqual(pe.total, 10) + self.pe.sample_num(10) # percentage is 38.5% + self.assertEqual(self.pe.total, 9) def test_sample_percent(self): - pe = PETrackI() - for (c, l, r) in self.input_regions: - pe.add_loc(c, l, r) - pe.finalize() - pe.sample_percent(0.5) - self.assertEqual(pe.total, 8) + self.pe.sample_percent(0.5) # 12=int(0.5*17)+int(0.5*9) + self.assertEqual(self.pe.total, 12) def test_pileupbdg(self): - pe = PETrackI() - for (c, l, r) in self.input_regions: - pe.add_loc(c, l, r) - pe.finalize() - pe.pileup_bdg() + self.pe.pileup_bdg() + + def test_exclude(self): + self.pe.exclude(self.x_regions) + self.assertEqual(self.pe.total, 7) + self.assertEqual(self.pe.length, 781) + self.assertAlmostEqual(self.pe.average_template_length, 112, 0) + class Test_PETrackII(unittest.TestCase): def setUp(self): - self.input_regions = [(b"chrY", 0, 100, b"0w#AAACGAAAGACTCGGA", 2), - (b"chrY", 70, 170, b"0w#AAACGAAAGACTCGGA", 1), + self.input_regions = [(b"chrY", 0, 100, b"0w#AAACGAAAGACTCGGA", 2), # will be excluded + (b"chrY", 70, 170, b"0w#AAACGAAAGACTCGGA", 1), # will be excluded (b"chrY", 80, 190, b"0w#AAACGAAAGACTCGGA", 1), (b"chrY", 85, 
180, b"0w#AAACGAAAGACTCGGA", 3), (b"chrY", 100, 190, b"0w#AAACGAAAGACTCGGA", 1), - (b"chrY", 0, 100, b"0w#AAACGAACAAGTAACA", 1), - (b"chrY", 70, 170, b"0w#AAACGAACAAGTAACA", 2), + (b"chrY", 0, 100, b"0w#AAACGAACAAGTAACA", 1), # will be excluded + (b"chrY", 70, 170, b"0w#AAACGAACAAGTAACA", 2), # will be excluded (b"chrY", 80, 190, b"0w#AAACGAACAAGTAACA", 1), (b"chrY", 85, 180, b"0w#AAACGAACAAGTAACA", 1), (b"chrY", 100, 190, b"0w#AAACGAACAAGTAACA", 3), - (b"chrY", 10, 110, b"0w#AAACGAACAAGTAAGA", 1), - (b"chrY", 50, 160, b"0w#AAACGAACAAGTAAGA", 2), + (b"chrY", 10, 110, b"0w#AAACGAACAAGTAAGA", 1), # will be excluded + (b"chrY", 50, 160, b"0w#AAACGAACAAGTAAGA", 2), # will be excluded (b"chrY", 100, 170, b"0w#AAACGAACAAGTAAGA", 3) ] - self.pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") - self.pileup_v = np.array([3.0, 4.0, 6.0, 9.0, 11.0, 15.0, 19.0, 18.0, 16.0, 10.0, 6.0], dtype="f4") + + self.exclude_regions = [(b"chrY", 10, 75)] + self.x_regions = Regions() + for r in self.exclude_regions: + self.x_regions.add_loc(r[0], r[1], r[2]) + + self.pileup_p = np.array([10, 50, 70, 80, 85, + 100, 110, 160, 170, 180, + 190], dtype="i4") + self.pileup_v = np.array([3.0, 4.0, 6.0, 9.0, 11.0, + 15.0, 19.0, 18.0, 16.0, 10.0, + 6.0], dtype="f4") self.peak_str = "chrom:chrY start:80 end:180 name:peak_1 score:19 summit:105\n" self.subset_barcodes = {b'0w#AAACGAACAAGTAACA', b"0w#AAACGAACAAGTAAGA"} - self.subset_pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") - self.subset_pileup_v = np.array([1.0, 2.0, 4.0, 6.0, 7.0, 8.0, 13.0, 12.0, 10.0, 5.0, 4.0], dtype="f4") + self.subset_pileup_p = np.array([10, 50, 70, 80, 85, + 100, 110, 160, 170, 180, + 190], dtype="i4") + self.subset_pileup_v = np.array([1.0, 2.0, 4.0, 6.0, 7.0, + 8.0, 13.0, 12.0, 10.0, 5.0, + 4.0], dtype="f4") self.subset_peak_str = "chrom:chrY start:100 end:170 name:peak_1 score:13 summit:105\n" - self.t = sum([(x[2]-x[1]) * x[4] for x in 
self.input_regions]) - def test_add_frag(self): - pe = PETrackII() + self.pe = PETrackII() for (c, l, r, b, C) in self.input_regions: - pe.add_loc(c, l, r, b, C) - pe.finalize() - # roughly check the numbers... - self.assertEqual(pe.total, 22) - self.assertEqual(pe.length, self.t) - - # subset - pe_subset = pe.subset(self.subset_barcodes) - # roughly check the numbers... + self.pe.add_loc(c, l, r, b, C) + self.pe.finalize() + + def test_add_frag(self): + self.assertEqual(self.pe.total, 22) + self.assertEqual(self.pe.length, self.t) + + pe_subset = self.pe.subset(self.subset_barcodes) self.assertEqual(pe_subset.total, 14) self.assertEqual(pe_subset.length, 1305) def test_pileup(self): - pe = PETrackII() - for (c, l, r, b, C) in self.input_regions: - pe.add_loc(c, l, r, b, C) - pe.finalize() - bdg = pe.pileup_bdg() - d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + bdg = self.pe.pileup_bdg() + d = bdg.get_data_by_chr(b'chrY') np.testing.assert_array_equal(d[0], self.pileup_p) np.testing.assert_array_equal(d[1], self.pileup_v) - pe_subset = pe.subset(self.subset_barcodes) + pe_subset = self.pe.subset(self.subset_barcodes) bdg = pe_subset.pileup_bdg() - d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + d = bdg.get_data_by_chr(b'chrY') np.testing.assert_array_equal(d[0], self.subset_pileup_p) np.testing.assert_array_equal(d[1], self.subset_pileup_v) def test_pileup2(self): - pe = PETrackII() - for (c, l, r, b, C) in self.input_regions: - pe.add_loc(c, l, r, b, C) - pe.finalize() - bdg = pe.pileup_bdg2() - d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + bdg = self.pe.pileup_bdg2() + d = bdg.get_data_by_chr(b'chrY') np.testing.assert_array_equal(d['p'], self.pileup_p) np.testing.assert_array_equal(d['v'], self.pileup_v) - pe_subset = pe.subset(self.subset_barcodes) + pe_subset = self.pe.subset(self.subset_barcodes) bdg = pe_subset.pileup_bdg2() - d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + d = bdg.get_data_by_chr(b'chrY') 
np.testing.assert_array_equal(d['p'], self.subset_pileup_p) np.testing.assert_array_equal(d['v'], self.subset_pileup_v) def test_callpeak(self): - pe = PETrackII() - for (c, l, r, b, C) in self.input_regions: - pe.add_loc(c, l, r, b, C) - pe.finalize() - bdg = pe.pileup_bdg() # bedGraphTrackI object + bdg = self.pe.pileup_bdg() peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) self.assertEqual(str(peaks), self.peak_str) - pe_subset = pe.subset(self.subset_barcodes) + pe_subset = self.pe.subset(self.subset_barcodes) bdg = pe_subset.pileup_bdg() peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) self.assertEqual(str(peaks), self.subset_peak_str) def test_callpeak2(self): - pe = PETrackII() - for (c, l, r, b, C) in self.input_regions: - pe.add_loc(c, l, r, b, C) - pe.finalize() - bdg = pe.pileup_bdg2() # bedGraphTrackII object + bdg = self.pe.pileup_bdg2() peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) self.assertEqual(str(peaks), self.peak_str) - pe_subset = pe.subset(self.subset_barcodes) + pe_subset = self.pe.subset(self.subset_barcodes) bdg = pe_subset.pileup_bdg2() peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) self.assertEqual(str(peaks), self.subset_peak_str) + + def test_exclude(self): + self.pe.exclude(self.x_regions) + self.assertEqual(self.pe.total, 13) + self.assertEqual(self.pe.length, 1170) + self.assertAlmostEqual(self.pe.average_template_length, 90, 0) diff --git a/test/test_Parser.py b/test/test_Parser.py index 09c82c6d..e1ddf2e4 100644 --- a/test/test_Parser.py +++ b/test/test_Parser.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2024-10-16 00:13:01 Tao Liu> +# Time-stamp: <2025-02-12 11:38:00 Tao Liu> import unittest @@ -50,5 +50,5 @@ def test_fragment_file(self): bdg2 = petrack.pileup_bdg2() peaks = bdg.call_peaks(cutoff=10, min_length=200, max_gap=100) peaks2 = bdg2.call_peaks(cutoff=10, min_length=200, max_gap=100) - print(peaks) - print(peaks2) + # print(peaks) + # print(peaks2)