subsample_bam

nanoporetech · Dec 13, 2023 · d33dc2b · d33dc2b
1 parent 19d96f0
commit d33dc2b
Show file tree

Hide file tree

Showing 8 changed files with 327 additions and 234 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.3.14] - 2023-10-25
+### Changed
+- `subsample_bam` and `coverage_from_bam` now have unified read filtering options and logic.
+### Added
+- `filter_bam` to filter a bam with the same logic used in `subsample_bam`. 
+### Fixed
+- `subsample_bam` was previously subsampling proportionally before filtering resulting in lower than expected depth. 
+
 ## [v0.3.13] - 2023-06-23
 ### Changed
 - `subsample_bam`: `--force_low_coverage` saves contigs with coverage below the target 

diff --git a/pomoxis/__init__.py b/pomoxis/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.3.13'
+__version__ = '0.3.14'
 
 import argparse
 import os

diff --git a/pomoxis/coverage_from_bam.py b/pomoxis/coverage_from_bam.py
@@ -1,13 +1,16 @@
 import argparse
+import functools
 import logging
-import numpy as np
 import os
+
+import numpy as np
 import pandas as pd
 import pysam
-from pomoxis.util import parse_regions, Region
 
+from pomoxis.util import filter_args, filter_read, parse_regions, Region
 
-def coverage_of_region(region, bam_fp, stride, primary_only=False):
+
+def coverage_of_region(region, bam_fp, stride, read_filter_func):
     """Get coverage data for a region"""
 
     bins = np.arange(region.start, region.end, stride)
@@ -16,7 +19,7 @@ def coverage_of_region(region, bam_fp, stride, primary_only=False):
     coverage_by_is_rev = {True: np.zeros(len(bins)), False: np.zeros(len(bins))}
     for r_obj in pysam.AlignmentFile(bam_fp).fetch(contig=region.ref_name, start=region.start, end=region.end):
         # Ignore secondary/supplementary maps when computing the median depth, if requested
-        if primary_only and (r_obj.is_secondary or r_obj.is_supplementary):
+        if read_filter_func(r_obj):
             continue
         start_i = max((r_obj.reference_start - bins[0]) // stride, 0)
         end_i = min((r_obj.reference_end - bins[0]) // stride, len(bins))
@@ -39,24 +42,25 @@ def coverage_summary_of_region(*args, **kwargs):
 
 def main():
     logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)
+
     parser = argparse.ArgumentParser(
         prog='coverage_from_bam',
         description='Calculate read coverage depth from a bam.',
         epilog='By default a file is written per reference sequence, this can be changed with the `--one_file` '
         'option. If overlapping regions are specified, `--one_file` should not be used.',
+        parents=[filter_args()],
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('bam', help='.fasta/fastq file.')
     parser.add_argument('-r', '--regions', nargs='+', help='Only process given regions.')
     grp = parser.add_mutually_exclusive_group()
     grp.add_argument('-p', '--prefix', help='Prefix for output, defaults to basename of bam.')
     grp.add_argument('-o', '--one_file', help='Single output file with "region" column.')
     parser.add_argument('-s', '--stride', type=int, default=1000, help='Stride in genomic coordinate.')
-    parser.add_argument('--primary_only', action='store_true', 
-                        help='Use only primary reads when computing the coverage.')
     parser.add_argument('--summary_only', action='store_true',
                         help='Output only the depth_summary.txt file')
 
     args = parser.parse_args()
+    read_filter_func = functools.partial(filter_read, args=args)
 
     bam = pysam.AlignmentFile(args.bam)
     ref_lengths = dict(zip(bam.references, bam.lengths))
@@ -76,7 +80,7 @@ def main():
             prefix = os.path.splitext(os.path.basename(args.bam))[0]
 
         region_str = '{}_{}_{}'.format(region.ref_name, region.start, region.end)
-        df = coverage_of_region(region, args.bam, args.stride, primary_only=args.primary_only)
+        df = coverage_of_region(region, args.bam, args.stride, read_filter_func)
         summary[region_str] = df['depth'].describe()
 
         if not args.summary_only:

diff --git a/pomoxis/filter_bam.py b/pomoxis/filter_bam.py
@@ -0,0 +1,40 @@
+import argparse
+import logging
+
+import pysam
+
+from pomoxis.util import filter_args, filter_read, parse_regions, Region
+
+
+def main():
+    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)
+    parser = argparse.ArgumentParser(
+        prog='filter_bam',
+        description='Filter a bam',
+        parents=[filter_args()],
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('bam',
+        help='input bam file.')
+    parser.add_argument('-o', '--output_bam', default='filtered.bam',
+        help='Output bam file.')
+    parser.add_argument('-r', '--region',
+        help='Only process given region.')
+
+    args = parser.parse_args()
+    logger = logging.getLogger('filter')
+
+    with pysam.AlignmentFile(args.bam) as bam:
+
+        if args.region is not None:
+            ref_lengths = dict(zip(bam.references, bam.lengths))
+            region = parse_regions([args.region], ref_lengths=ref_lengths)[0]
+            reads = bam.fetch(region.ref_name, region.start, region.end)
+        else:
+            reads = bam
+        out_bam = pysam.AlignmentFile(args.output_bam, "wb", header=bam.header)
+        for read in reads:
+            if filter_read(read, args, logger):
+                continue
+            out_bam.write(read)
+    out_bam.close()
+    pysam.index(args.output_bam)
diff --git a/pomoxis/stats_from_bam.py b/pomoxis/stats_from_bam.py
@@ -12,7 +12,7 @@
 
 import pysam
 
-from pomoxis.util import intervaltrees_from_bed
+from pomoxis.util import intervaltrees_from_bed, masked_stats_from_aligned_read, stats_from_aligned_read
 
 parser = argparse.ArgumentParser(
         prog='stats_from_bam',
@@ -32,151 +32,6 @@
                     help='Number of threads for parallel processing.')
 
 
-def stats_from_aligned_read(read, references, lengths):
-    """Create summary information for an aligned read
-
-    :param read: :class:`pysam.AlignedSegment` object
-    """
-    try:
-        NM = read.get_tag('NM')
-    except:
-        raise IOError("Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.")
-
-    name = read.qname
-    start_offset = 0
-    if read.is_secondary or read.is_supplementary:
-        first_cig = read.cigartuples[0]
-        if first_cig[0] == 5:  # should always be true for minimap2
-            start_offset = first_cig[1]
-    counts, _ = read.get_cigar_stats()
-    match = counts[0] + counts[7] + counts[8]  # total of M, =, and X
-    ins = counts[1]
-    delt = counts[2]
-
-    lra_flag = False
-    if read.has_tag('NX'):
-        # likely from lra
-        # NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32
-        sub = counts[8]
-        lra_flag = True
-    else:
-        # likely from minimap2
-        # NM is edit distance: NM = INS + DEL + SUB
-        sub = NM - ins - delt
-
-    length = match + ins + delt
-    iden = 100 * float(match - sub) / match
-    acc = 100 * float(match - sub) / length
-
-    read_length = read.infer_read_length()
-    coverage = 100 * float(read.query_alignment_length) / read_length
-    direction = '-' if read.is_reverse else '+'
-
-    results = {
-        "name": name,
-        "coverage": coverage,
-        "direction": direction,
-        "length": length,
-        "read_length": read_length,
-        "match": match,
-        "ins": ins,
-        "del": delt,
-        "sub": sub,
-        "iden": iden,
-        "acc": acc,
-        "qstart": read.query_alignment_start + start_offset,
-        "qend": read.query_alignment_end + start_offset,
-        "rstart": read.reference_start,
-        "rend": read.reference_end,
-        "ref": references[read.reference_id],
-        "aligned_ref_len": read.reference_length,
-        "ref_coverage": 100*float(read.reference_length) / lengths[read.reference_id],
-        "mapq": read.mapping_quality,
-    }
-
-    return results, lra_flag
-
-
-def masked_stats_from_aligned_read(read, references, lengths, tree):
-    """Create summary information for an aligned read over regions in bed file.
-
-    :param read: :class:`pysam.AlignedSegment` object
-    """
-    try:
-        MD = read.get_tag('MD')
-    except:
-        raise IOError("Read is missing required 'MD' tag. Try running 'samtools callmd - ref.fa'.")
-
-    correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0
-    pairs = read.get_aligned_pairs(with_seq=True)
-    qseq = read.query_sequence
-    pos_is_none = lambda x: (x[1] is None or x[0] is None)
-    pos = None
-    insertions = []
-    # TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)?
-    for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs):
-        if rp == read.reference_end or (qp == read.query_alignment_end):
-            break
-        pos = rp if rp is not None else pos
-        if not tree.has_overlap(pos, pos + 1) or (rp is None and not tree.has_overlap(pos + 1, pos + 2)):
-            # if rp is None, we are in an insertion, check if pos + 1 overlaps
-            # (ref position of ins is arbitrary)
-            # print('Skipping ref {}:{}'.format(read.reference_name, pos))
-            masked += 1
-            continue
-        else:
-            if rp is not None:  # we don't have an insertion
-                aligned_ref_len += 1
-                if qp is None:  # deletion
-                    delt += 1
-                elif qseq[qp] == rb:  # correct
-                    correct += 1
-                elif qseq[qp] != rb:  # sub
-                    sub += 1
-            else:  # we have an insertion
-                ins += 1
-
-    name = read.qname
-    match = correct + sub
-    length = match + ins + delt
-
-    if match == 0:
-        # no matches within bed regions - all bed ref positions were deleted.
-        # skip this alignment.
-        return None
-
-    iden = 100 * float(match - sub) / match
-    acc = 100 - 100 * float(sub + ins + delt) / length
-
-    read_length = read.infer_read_length()
-    masked_query_alignment_length = correct + sub + ins
-    coverage = 100*float(masked_query_alignment_length) / read_length
-    direction = '-' if read.is_reverse else '+'
-
-    results = {
-        "name": name,
-        "coverage": coverage,
-        "direction": direction,
-        "length": length,
-        "read_length": read_length,
-        "match": match,
-        "ins": ins,
-        "del": delt,
-        "sub": sub,
-        "iden": iden,
-        "acc": acc,
-        "qstart": read.query_alignment_start,
-        "qend": read.query_alignment_end,
-        "rstart": read.reference_start,
-        "rend": read.reference_end,
-        "ref": references[read.reference_id],
-        "aligned_ref_len": aligned_ref_len,
-        "ref_coverage": 100 * float(aligned_ref_len) / lengths[read.reference_id],
-        "mapq": read.mapping_quality,
-        "masked": masked,
-    }
-    return results
-
 
 def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, bed_file=None):
     start, stop = start_stop
@@ -205,12 +60,12 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
                     counts['masked'] += 1
                     continue
                 else:
-                    result = masked_stats_from_aligned_read(read, bam.references, bam.lengths, trees[read.reference_name])
+                    result = masked_stats_from_aligned_read(read, trees[read.reference_name])
                     if result is None:  # no matches within bed regions
                         counts['all_matches_masked'] +=  1
                         continue
             else:
-                result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths)
+                result, lra_flag = stats_from_aligned_read(read)
 
             if min_length is not None and result['length'] < min_length:
                 counts['short'] += 1