From d33dc2b550b2aa044298cfa2293a8c644ce3e223 Mon Sep 17 00:00:00 2001 From: Michael Wykes Date: Wed, 13 Dec 2023 09:57:23 +0000 Subject: [PATCH] subsample_bam --- CHANGELOG.md | 8 ++ pomoxis/__init__.py | 2 +- pomoxis/coverage_from_bam.py | 18 ++- pomoxis/filter_bam.py | 40 ++++++ pomoxis/stats_from_bam.py | 151 +-------------------- pomoxis/subsample_bam.py | 86 ++---------- pomoxis/util.py | 255 ++++++++++++++++++++++++++++++++++- setup.py | 1 + 8 files changed, 327 insertions(+), 234 deletions(-) create mode 100755 pomoxis/filter_bam.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fe298e..e898689 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.3.14] - 2023-10-25 ### Changed - `subsample_bam` and `coverage_from_bam` now have unified read filtering options and logic. ### Added - `filter_bam` to filter a bam with the same logic used in `subsample_bam`. ### Fixed - `subsample_bam` was previously subsampling proportionally before filtering, resulting in lower than expected depth. 
+ ## [v0.3.13] - 2023-06-23 ### Changed - `subsample_bam`: `--force_low_coverage` saves contigs with coverage below the target diff --git a/pomoxis/__init__.py b/pomoxis/__init__.py index 736f014..020beb5 100644 --- a/pomoxis/__init__.py +++ b/pomoxis/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.3.13' +__version__ = '0.3.14' import argparse import os diff --git a/pomoxis/coverage_from_bam.py b/pomoxis/coverage_from_bam.py index dd1616c..50d4266 100644 --- a/pomoxis/coverage_from_bam.py +++ b/pomoxis/coverage_from_bam.py @@ -1,13 +1,16 @@ import argparse +import functools import logging -import numpy as np import os + +import numpy as np import pandas as pd import pysam -from pomoxis.util import parse_regions, Region +from pomoxis.util import filter_args, filter_read, parse_regions, Region -def coverage_of_region(region, bam_fp, stride, primary_only=False): + +def coverage_of_region(region, bam_fp, stride, read_filter_func): """Get coverage data for a region""" bins = np.arange(region.start, region.end, stride) @@ -16,7 +19,7 @@ def coverage_of_region(region, bam_fp, stride, primary_only=False): coverage_by_is_rev = {True: np.zeros(len(bins)), False: np.zeros(len(bins))} for r_obj in pysam.AlignmentFile(bam_fp).fetch(contig=region.ref_name, start=region.start, end=region.end): # Ignore secondary/supplementary maps when computing the median depth, if requested - if primary_only and (r_obj.is_secondary or r_obj.is_supplementary): + if read_filter_func(r_obj): continue start_i = max((r_obj.reference_start - bins[0]) // stride, 0) end_i = min((r_obj.reference_end - bins[0]) // stride, len(bins)) @@ -39,11 +42,13 @@ def coverage_summary_of_region(*args, **kwargs): def main(): logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO) + parser = argparse.ArgumentParser( prog='coverage_from_bam', description='Calculate read coverage depth from a bam.', epilog='By default a file is written per reference sequence, this can be 
changed with the `--one_file` ' 'option. If overlapping regions are specified, `--one_file` should not be used.', + parents=[filter_args()], formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('bam', help='.fasta/fastq file.') parser.add_argument('-r', '--regions', nargs='+', help='Only process given regions.') @@ -51,12 +56,11 @@ def main(): grp.add_argument('-p', '--prefix', help='Prefix for output, defaults to basename of bam.') grp.add_argument('-o', '--one_file', help='Single output file with "region" column.') parser.add_argument('-s', '--stride', type=int, default=1000, help='Stride in genomic coordinate.') - parser.add_argument('--primary_only', action='store_true', - help='Use only primary reads when computing the coverage.') parser.add_argument('--summary_only', action='store_true', help='Output only the depth_summary.txt file') args = parser.parse_args() + read_filter_func = functools.partial(filter_read, args=args) bam = pysam.AlignmentFile(args.bam) ref_lengths = dict(zip(bam.references, bam.lengths)) @@ -76,7 +80,7 @@ def main(): prefix = os.path.splitext(os.path.basename(args.bam))[0] region_str = '{}_{}_{}'.format(region.ref_name, region.start, region.end) - df = coverage_of_region(region, args.bam, args.stride, primary_only=args.primary_only) + df = coverage_of_region(region, args.bam, args.stride, read_filter_func) summary[region_str] = df['depth'].describe() if not args.summary_only: diff --git a/pomoxis/filter_bam.py b/pomoxis/filter_bam.py new file mode 100755 index 0000000..191d369 --- /dev/null +++ b/pomoxis/filter_bam.py @@ -0,0 +1,40 @@ +import argparse +import logging + +import pysam + +from pomoxis.util import filter_args, filter_read, parse_regions, Region + + +def main(): + logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO) + parser = argparse.ArgumentParser( + prog='filter_bam', + description='Filter a bam', + parents=[filter_args()], + 
formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('bam', + help='input bam file.') + parser.add_argument('-o', '--output_bam', default='filtered.bam', + help='Output bam file.') + parser.add_argument('-r', '--region', + help='Only process given region.') + + args = parser.parse_args() + logger = logging.getLogger('filter') + + with pysam.AlignmentFile(args.bam) as bam: + + if args.region is not None: + ref_lengths = dict(zip(bam.references, bam.lengths)) + region = parse_regions([args.region], ref_lengths=ref_lengths)[0] + reads = bam.fetch(region.ref_name, region.start, region.end) + else: + reads = bam + out_bam = pysam.AlignmentFile(args.output_bam, "wb", header=bam.header) + for read in reads: + if filter_read(read, args, logger): + continue + out_bam.write(read) + out_bam.close() + pysam.index(args.output_bam) diff --git a/pomoxis/stats_from_bam.py b/pomoxis/stats_from_bam.py index 851d618..36bb6e8 100755 --- a/pomoxis/stats_from_bam.py +++ b/pomoxis/stats_from_bam.py @@ -12,7 +12,7 @@ import pysam -from pomoxis.util import intervaltrees_from_bed +from pomoxis.util import intervaltrees_from_bed, masked_stats_from_aligned_read, stats_from_aligned_read parser = argparse.ArgumentParser( prog='stats_from_bam', @@ -32,151 +32,6 @@ help='Number of threads for parallel processing.') -def stats_from_aligned_read(read, references, lengths): - """Create summary information for an aligned read - - :param read: :class:`pysam.AlignedSegment` object - """ - try: - NM = read.get_tag('NM') - except: - raise IOError("Read is missing required 'NM' tag. 
Try running 'samtools fillmd -S - ref.fa'.") - - name = read.qname - start_offset = 0 - if read.is_secondary or read.is_supplementary: - first_cig = read.cigartuples[0] - if first_cig[0] == 5: # should always be true for minimap2 - start_offset = first_cig[1] - counts, _ = read.get_cigar_stats() - match = counts[0] + counts[7] + counts[8] # total of M, =, and X - ins = counts[1] - delt = counts[2] - - lra_flag = False - if read.has_tag('NX'): - # likely from lra - # NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32 - sub = counts[8] - lra_flag = True - else: - # likely from minimap2 - # NM is edit distance: NM = INS + DEL + SUB - sub = NM - ins - delt - - length = match + ins + delt - iden = 100 * float(match - sub) / match - acc = 100 * float(match - sub) / length - - read_length = read.infer_read_length() - coverage = 100 * float(read.query_alignment_length) / read_length - direction = '-' if read.is_reverse else '+' - - results = { - "name": name, - "coverage": coverage, - "direction": direction, - "length": length, - "read_length": read_length, - "match": match, - "ins": ins, - "del": delt, - "sub": sub, - "iden": iden, - "acc": acc, - "qstart": read.query_alignment_start + start_offset, - "qend": read.query_alignment_end + start_offset, - "rstart": read.reference_start, - "rend": read.reference_end, - "ref": references[read.reference_id], - "aligned_ref_len": read.reference_length, - "ref_coverage": 100*float(read.reference_length) / lengths[read.reference_id], - "mapq": read.mapping_quality, - } - - return results, lra_flag - - -def masked_stats_from_aligned_read(read, references, lengths, tree): - """Create summary information for an aligned read over regions in bed file. - - :param read: :class:`pysam.AlignedSegment` object - """ - try: - MD = read.get_tag('MD') - except: - raise IOError("Read is missing required 'MD' tag. 
Try running 'samtools callmd - ref.fa'.") - - correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0 - pairs = read.get_aligned_pairs(with_seq=True) - qseq = read.query_sequence - pos_is_none = lambda x: (x[1] is None or x[0] is None) - pos = None - insertions = [] - # TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)? - for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs): - if rp == read.reference_end or (qp == read.query_alignment_end): - break - pos = rp if rp is not None else pos - if not tree.has_overlap(pos, pos + 1) or (rp is None and not tree.has_overlap(pos + 1, pos + 2)): - # if rp is None, we are in an insertion, check if pos + 1 overlaps - # (ref position of ins is arbitrary) - # print('Skipping ref {}:{}'.format(read.reference_name, pos)) - masked += 1 - continue - else: - if rp is not None: # we don't have an insertion - aligned_ref_len += 1 - if qp is None: # deletion - delt += 1 - elif qseq[qp] == rb: # correct - correct += 1 - elif qseq[qp] != rb: # sub - sub += 1 - else: # we have an insertion - ins += 1 - - name = read.qname - match = correct + sub - length = match + ins + delt - - if match == 0: - # no matches within bed regions - all bed ref positions were deleted. - # skip this alignment. 
- return None - - iden = 100 * float(match - sub) / match - acc = 100 - 100 * float(sub + ins + delt) / length - - read_length = read.infer_read_length() - masked_query_alignment_length = correct + sub + ins - coverage = 100*float(masked_query_alignment_length) / read_length - direction = '-' if read.is_reverse else '+' - - results = { - "name": name, - "coverage": coverage, - "direction": direction, - "length": length, - "read_length": read_length, - "match": match, - "ins": ins, - "del": delt, - "sub": sub, - "iden": iden, - "acc": acc, - "qstart": read.query_alignment_start, - "qend": read.query_alignment_end, - "rstart": read.reference_start, - "rend": read.reference_end, - "ref": references[read.reference_id], - "aligned_ref_len": aligned_ref_len, - "ref_coverage": 100 * float(aligned_ref_len) / lengths[read.reference_id], - "mapq": read.mapping_quality, - "masked": masked, - } - return results - def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, bed_file=None): start, stop = start_stop @@ -205,12 +60,12 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be counts['masked'] += 1 continue else: - result = masked_stats_from_aligned_read(read, bam.references, bam.lengths, trees[read.reference_name]) + result = masked_stats_from_aligned_read(read, trees[read.reference_name]) if result is None: # no matches within bed regions counts['all_matches_masked'] += 1 continue else: - result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths) + result, lra_flag = stats_from_aligned_read(read) if min_length is not None and result['length'] < min_length: counts['short'] += 1 diff --git a/pomoxis/subsample_bam.py b/pomoxis/subsample_bam.py index b92cb7b..b36e424 100755 --- a/pomoxis/subsample_bam.py +++ b/pomoxis/subsample_bam.py @@ -10,9 +10,8 @@ import pysam -from pomoxis.util import parse_regions, Region +from pomoxis.util import parse_regions, Region, filter_args, filter_read, write_bam from 
pomoxis.coverage_from_bam import coverage_summary_of_region -from pomoxis.stats_from_bam import stats_from_aligned_read def main(): @@ -20,6 +19,7 @@ def main(): parser = argparse.ArgumentParser( prog='subsample_bam', description='Subsample a bam to uniform or proportional depth', + parents=[filter_args()], formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('bam', help='input bam file.') @@ -31,22 +31,10 @@ def main(): help='Only process given regions.') parser.add_argument('-p', '--profile', type=int, default=1000, help='Stride in genomic coordinates for depth profile.') - parser.add_argument('-O', '--orientation', choices=['fwd', 'rev'], - help='Sample only forward or reverse reads.') parser.add_argument('-t', '--threads', type=int, default=-1, help='Number of threads to use.') - parser.add_argument('-q', '--quality', type=float, - help='Filter reads by mean qscore.') - parser.add_argument('-a', '--accuracy', type=float, - help='Filter reads by accuracy.') - parser.add_argument('-c', '--coverage', type=float, - help='Filter reads by coverage (what fraction of the read aligns).') - parser.add_argument('-l', '--length', type=int, default=None, - help='Filter reads by read length.') parser.add_argument('--force_low_depth', action='store_true', help='Force saving reads mapped to a sequence with coverage below the expected value.') - parser.add_argument('--force_non_primary', action='store_true', - help='Force saving non-primary alignments.') eparser = parser.add_mutually_exclusive_group() eparser.add_argument('--any_fail', action='store_true', @@ -96,14 +84,15 @@ def main(): def subsample_region_proportionally(region, args): logger = logging.getLogger(region.ref_name) - coverage_summary = coverage_summary_of_region(region, args.bam, args.stride, primary_only=True) + coverage_summary = coverage_summary_of_region( + region, args.bam, args.stride, functools.partial(filter_read, args=args, logger=logger)) col = 'depth_{}'.format(args.orientation) 
if args.orientation is not None else 'depth' median_coverage = coverage_summary[col].T['50%'] logger.info(f'Median coverage {median_coverage}') with pysam.AlignmentFile(args.bam) as bam: def _read_iter(): for r in bam.fetch(region.ref_name, region.start, region.end): - if not filter_read(r, bam, args, logger): + if not filter_read(r, args, logger): yield r # Prepare iterator reads = _read_iter() @@ -147,7 +136,7 @@ def _read_iter(): else: prefix = '{}_{}X'.format(args.output_prefix, int(median_coverage)) if n_reads > 0: - _write_bam(args.bam, prefix, region, target_reads['name'], secondary=args.force_non_primary) + write_bam(args.bam, prefix, region, target_reads['name'], keep_supplementary=args.keep_supplementary) coverage.fill(0.0) # reset coverage for each target depth for read in target_reads: coverage[read['start'] - region.start:read['end'] - region.start] += 1 @@ -156,43 +145,6 @@ def _read_iter(): return found_enough_depth -QSCORES_TO_PROBS = 10 ** (-0.1 * np.array(np.arange(100))) - -def filter_read(r, bam, args, logger): - """Decide whether a read should be filtered out, returning a bool""" - - # primary alignments - if r.is_secondary or r.is_supplementary: - return True - - # filter orientation - if (r.is_reverse and args.orientation == 'fwd') or \ - (not r.is_reverse and args.orientation == 'rev'): - return True - - # filter quality - if args.quality is not None: - mean_q = -10 * np.log10(np.mean(QSCORES_TO_PROBS[r.query_qualities])) - if mean_q < args.quality: - logger.debug(f"Filtering {r.query_name} with quality {mean_q:.2f}") - return True - - # filter accuracy or alignment coverage - if args.accuracy is not None or args.coverage is not None or args.length is not None: - stats, _ = stats_from_aligned_read(r, bam.references, bam.lengths) - if args.accuracy is not None and stats['acc'] < args.accuracy: - logger.info("Filtering {} by accuracy ({:.2f}).".format(r.query_name, stats['acc'])) - return True - if args.coverage is not None and 
stats['coverage'] < args.coverage: - logger.info("Filtering {} by coverage ({:.2f}).".format(r.query_name, stats['coverage'])) - return True - if args.length is not None and stats['read_length'] < args.length: - logger.info("Filtering {} by length ({:.2f}).".format(r.query_name, stats['length'])) - return True - # don't filter - return False - - def subsample_region_uniformly(region, args): logger = logging.getLogger(region.ref_name) logger.info("Building interval tree.") @@ -200,7 +152,7 @@ def subsample_region_uniformly(region, args): with pysam.AlignmentFile(args.bam) as bam: ref_lengths = dict(zip(bam.references, bam.lengths)) for r in bam.fetch(region.ref_name, region.start, region.end): - if filter_read(r, bam, args, logger): + if filter_read(r, args, logger): continue # trim reads to region tree.add(Interval( @@ -241,7 +193,7 @@ def subsample_region_uniformly(region, args): msg = "Hit target depth {}." logger.info(msg.format(target)) prefix = '{}_{}X'.format(args.output_prefix, target) - _write_bam(args.bam, prefix, region, reads, secondary=args.force_non_primary) + write_bam(args.bam, prefix, region, reads, keep_supplementary=args.keep_supplementary) _write_coverage(prefix, region, coverage, args.profile) try: target = next(targets) @@ -252,7 +204,7 @@ def subsample_region_uniformly(region, args): logger.warn(msg.format(target, int(median_depth))) prefix = '{}_{}X'.format(args.output_prefix, int(median_depth)) if median_depth > 0: - _write_bam(args.bam, prefix, region, reads, secondary=args.force_non_primary) + write_bam(args.bam, prefix, region, reads, keep_supplementary=args.keep_supplementary) _write_coverage(prefix, region, coverage, args.profile) break # exit if nothing happened this iteration @@ -295,26 +247,6 @@ def _nearest_overlapping_point(src, point): return items[0] -def _write_bam(bam, prefix, region, sequences, secondary=False): - # filtered bam - sequences = set(sequences) - taken = set() - output = '{}_{}.{}'.format(prefix, region.ref_name, 
os.path.basename(bam)) - src_bam = pysam.AlignmentFile(bam, "rb") - out_bam = pysam.AlignmentFile(output, "wb", template=src_bam) - for read in src_bam.fetch(region.ref_name, region.start, region.end): - if read.query_name in sequences and not (read.is_secondary or read.is_supplementary): - if read.query_name not in taken: - out_bam.write(read) - taken.add(read.query_name) - if read.query_name in sequences and secondary and \ - (read.is_secondary or read.is_supplementary): - out_bam.write(read) - src_bam.close() - out_bam.close() - pysam.index(output) - - def _write_coverage(prefix, region, coverage, profile): # depth profile output = '{}_{}.depth'.format(prefix, region.ref_name) diff --git a/pomoxis/util.py b/pomoxis/util.py index 96549a7..436b7be 100644 --- a/pomoxis/util.py +++ b/pomoxis/util.py @@ -1,6 +1,8 @@ import argparse from collections import namedtuple, defaultdict import itertools +import logging +import os import shutil import sys @@ -14,7 +16,7 @@ class FastxWrite: - def __init__(self, fname, mode='w', width=80, force_q=False, mock_q=10): + def __init__(self, fname, mode='w', width=80, force_q=False, mock_q=10): self.fname = fname self.mode = mode self.width = width @@ -436,3 +438,254 @@ def reverse_bed(): d['rc_start'] = d['chrom_length'] - d['stop'] d['chrom_rc'] = d['chrom'] + '_rc' d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t') + + +class PrimSupAction(argparse.Action): + """Parse primary / supplementary option.""" + + def __call__(self, parser, namespace, values, option_string=None): + if self.dest == 'primary_only': + setattr(namespace, self.dest, True) + setattr(namespace, 'keep_supplementary', False) + elif self.dest == 'keep_supplementary': + setattr(namespace, self.dest, True) + setattr(namespace, 'primary_only', False) + + +def stats_from_aligned_read(read): + """Create summary information for an aligned read + + :param read: :class:`pysam.AlignedSegment` object + """ + try: + NM = 
read.get_tag('NM') + except: + raise IOError("Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.") + + name = read.query_name + start_offset = 0 + if read.is_secondary or read.is_supplementary: + first_cig = read.cigartuples[0] + if first_cig[0] == 5: # should always be true for minimap2 + start_offset = first_cig[1] + counts, _ = read.get_cigar_stats() + match = counts[0] + counts[7] + counts[8] # total of M, =, and X + ins = counts[1] + delt = counts[2] + + lra_flag = False + if read.has_tag('NX'): + # likely from lra + # NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32 + sub = counts[8] + lra_flag = True + else: + # likely from minimap2 + # NM is edit distance: NM = INS + DEL + SUB + sub = NM - ins - delt + + length = match + ins + delt + iden = 100 * float(match - sub) / match + acc = 100 * float(match - sub) / length + + read_length = read.infer_read_length() + coverage = 100 * float(read.query_alignment_length) / read_length + direction = '-' if read.is_reverse else '+' + + results = { + "name": name, + "coverage": coverage, + "direction": direction, + "length": length, + "read_length": read_length, + "match": match, + "ins": ins, + "del": delt, + "sub": sub, + "iden": iden, + "acc": acc, + "qstart": read.query_alignment_start + start_offset, + "qend": read.query_alignment_end + start_offset, + "rstart": read.reference_start, + "rend": read.reference_end, + "ref": read.reference_name, + "aligned_ref_len": read.reference_length, + "ref_coverage": 100*float(read.reference_length) / read.header.lengths[read.reference_id], + "mapq": read.mapping_quality, + } + + return results, lra_flag + + +def masked_stats_from_aligned_read(read, tree): + """Create summary information for an aligned read over regions in bed file. + + :param read: :class:`pysam.AlignedSegment` object + """ + try: + MD = read.get_tag('MD') + except: + raise IOError("Read is missing required 'MD' tag. 
Try running 'samtools callmd - ref.fa'.") + + correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0 + pairs = read.get_aligned_pairs(with_seq=True) + qseq = read.query_sequence + pos_is_none = lambda x: (x[1] is None or x[0] is None) + pos = None + insertions = [] + # TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)? + for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs): + if rp == read.reference_end or (qp == read.query_alignment_end): + break + pos = rp if rp is not None else pos + if not tree.has_overlap(pos, pos + 1) or (rp is None and not tree.has_overlap(pos + 1, pos + 2)): + # if rp is None, we are in an insertion, check if pos + 1 overlaps + # (ref position of ins is arbitrary) + # print('Skipping ref {}:{}'.format(read.reference_name, pos)) + masked += 1 + continue + else: + if rp is not None: # we don't have an insertion + aligned_ref_len += 1 + if qp is None: # deletion + delt += 1 + elif qseq[qp] == rb: # correct + correct += 1 + elif qseq[qp] != rb: # sub + sub += 1 + else: # we have an insertion + ins += 1 + + name = read.query_name + match = correct + sub + length = match + ins + delt + + if match == 0: + # no matches within bed regions - all bed ref positions were deleted. + # skip this alignment. 
+ return None + + iden = 100 * float(match - sub) / match + acc = 100 - 100 * float(sub + ins + delt) / length + + read_length = read.infer_read_length() + masked_query_alignment_length = correct + sub + ins + coverage = 100*float(masked_query_alignment_length) / read_length + direction = '-' if read.is_reverse else '+' + + results = { + "name": name, + "coverage": coverage, + "direction": direction, + "length": length, + "read_length": read_length, + "match": match, + "ins": ins, + "del": delt, + "sub": sub, + "iden": iden, + "acc": acc, + "qstart": read.query_alignment_start, + "qend": read.query_alignment_end, + "rstart": read.reference_start, + "rend": read.reference_end, + "ref": read.reference_name, + "aligned_ref_len": aligned_ref_len, + "ref_coverage": 100 * float(aligned_ref_len) / read.header.lengths[read.reference_id], + "mapq": read.mapping_quality, + "masked": masked, + } + return results + + +def filter_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) + group = parser.add_argument_group('Read filtering options') + group.add_argument('-O', '--orientation', choices=['fwd', 'rev'], + help='Sample only forward or reverse reads.') + group.add_argument('-q', '--quality', type=float, + help='Filter reads by mean qscore.') + group.add_argument('-a', '--accuracy', type=float, + help='Filter reads by accuracy.') + group.add_argument('-c', '--coverage', type=float, + help='Filter reads by coverage (what fraction of the read aligns).') + group.add_argument('-l', '--length', type=int, default=None, + help='Filter reads by read length.') + + pgroup = group.add_mutually_exclusive_group() + pgroup.add_argument('--primary_only', action=PrimSupAction, default=True, nargs=0, + help='Use only primary reads.') + pgroup.add_argument('--keep_supplementary', action=PrimSupAction, default=False, nargs=0, + help='Include supplementary alignments.') + return parser + + +QSCORES_TO_PROBS = 10 ** (-0.1 * 
np.array(np.arange(100))) + + +def filter_read(r, args, logger=None): + """Decide whether a read should be filtered out, returning a bool""" + + if logger is None: + logger = logging.getLogger('Filter') + # filter secondary and unmapped reads + if r.is_secondary or r.is_unmapped: + return True + if r.is_supplementary and not args.keep_supplementary: + return True + + # filter orientation + if (r.is_reverse and args.orientation == 'fwd') or \ + (not r.is_reverse and args.orientation == 'rev'): + return True + + # filter quality + if args.quality is not None: + mean_q = -10 * np.log10(np.mean(QSCORES_TO_PROBS[r.query_qualities])) + if mean_q < args.quality: + logger.debug(f"Filtering {r.query_name} with quality {mean_q:.2f}") + return True + + # filter accuracy or alignment coverage + if args.accuracy is not None or args.coverage is not None or args.length is not None: + stats, _ = stats_from_aligned_read(r) + if args.accuracy is not None and stats['acc'] < args.accuracy: + logger.info("Filtering {} by accuracy ({:.2f}).".format(r.query_name, stats['acc'])) + return True + if args.coverage is not None and stats['coverage'] < args.coverage: + logger.info("Filtering {} by coverage ({:.2f}).".format(r.query_name, stats['coverage'])) + return True + if args.length is not None and stats['read_length'] < args.length: + logger.info("Filtering {} by length ({:.2f}).".format(r.query_name, stats['length'])) + return True + # don't filter + return False + + +def write_bam(bam, prefix, region, sequences, keep_supplementary=False): + """Write subset bam. + + :param bam: str, path to input bam. + :param prefix: str, prefix for output bam. + :param region: `Region` obj of region to subset to. + :param sequences: set of query names to retain. + :param keep_supplementary: bool, whether to retain supplementary alignments. 
+ """ + # filtered bam + sequences = set(sequences) + taken = set() + output = '{}_{}.{}'.format(prefix, region.ref_name, os.path.basename(bam)) + src_bam = pysam.AlignmentFile(bam, "rb") + out_bam = pysam.AlignmentFile(output, "wb", template=src_bam) + for read in src_bam.fetch(region.ref_name, region.start, region.end): + if read.is_secondary: + continue + if read.is_supplementary and not keep_supplementary: + continue + if read.query_name in sequences and (read.query_name, read.is_supplementary) not in taken: + out_bam.write(read) + taken.add((read.query_name, read.is_supplementary)) + src_bam.close() + out_bam.close() + pysam.index(output) diff --git a/setup.py b/setup.py index 856a02f..4c6ae2a 100644 --- a/setup.py +++ b/setup.py @@ -99,6 +99,7 @@ 'split_fastx = {}.util:split_fastx_cmdline'.format(__pkg_name__), 'stats_from_bam = {}.stats_from_bam:main'.format(__pkg_name__), 'subsample_bam = {}.subsample_bam:main'.format(__pkg_name__), + 'filter_bam = {}.filter_bam:main'.format(__pkg_name__), 'summary_from_stats = {}.summary_from_stats:main'.format(__pkg_name__), 'tag_bam = {}.util:tag_bam'.format(__pkg_name__), 'trim_alignments = {}.trim_alignments:main'.format(__pkg_name__),