Skip to content

Commit

Permalink
subsample_bam
Browse files Browse the repository at this point in the history
  • Loading branch information
mwykes committed Dec 13, 2023
1 parent 19d96f0 commit d33dc2b
Show file tree
Hide file tree
Showing 8 changed files with 327 additions and 234 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.3.14] - 2023-10-25
### Changed
- `subsample_bam` and `coverage_from_bam` now have unified read filtering options and logic.
### Added
- `filter_bam` to filter a bam with the same logic used in `subsample_bam`.
### Fixed
- `subsample_bam` was previously subsampling proportionally before filtering resulting in lower than expected depth.

## [v0.3.13] - 2023-06-23
### Changed
- `subsample_bam`: `--force_low_coverage` saves contigs with coverage below the target
Expand Down
2 changes: 1 addition & 1 deletion pomoxis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.3.13'
__version__ = '0.3.14'

import argparse
import os
Expand Down
18 changes: 11 additions & 7 deletions pomoxis/coverage_from_bam.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import argparse
import functools
import logging
import numpy as np
import os

import numpy as np
import pandas as pd
import pysam
from pomoxis.util import parse_regions, Region

from pomoxis.util import filter_args, filter_read, parse_regions, Region

def coverage_of_region(region, bam_fp, stride, primary_only=False):

def coverage_of_region(region, bam_fp, stride, read_filter_func):
"""Get coverage data for a region"""

bins = np.arange(region.start, region.end, stride)
Expand All @@ -16,7 +19,7 @@ def coverage_of_region(region, bam_fp, stride, primary_only=False):
coverage_by_is_rev = {True: np.zeros(len(bins)), False: np.zeros(len(bins))}
for r_obj in pysam.AlignmentFile(bam_fp).fetch(contig=region.ref_name, start=region.start, end=region.end):
# Ignore secondary/supplementary maps when computing the median depth, if requested
if primary_only and (r_obj.is_secondary or r_obj.is_supplementary):
if read_filter_func(r_obj):
continue
start_i = max((r_obj.reference_start - bins[0]) // stride, 0)
end_i = min((r_obj.reference_end - bins[0]) // stride, len(bins))
Expand All @@ -39,24 +42,25 @@ def coverage_summary_of_region(*args, **kwargs):

def main():
logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)

parser = argparse.ArgumentParser(
prog='coverage_from_bam',
description='Calculate read coverage depth from a bam.',
epilog='By default a file is written per reference sequence, this can be changed with the `--one_file` '
'option. If overlapping regions are specified, `--one_file` should not be used.',
parents=[filter_args()],
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('bam', help='.fasta/fastq file.')
parser.add_argument('-r', '--regions', nargs='+', help='Only process given regions.')
grp = parser.add_mutually_exclusive_group()
grp.add_argument('-p', '--prefix', help='Prefix for output, defaults to basename of bam.')
grp.add_argument('-o', '--one_file', help='Single output file with "region" column.')
parser.add_argument('-s', '--stride', type=int, default=1000, help='Stride in genomic coordinate.')
parser.add_argument('--primary_only', action='store_true',
help='Use only primary reads when computing the coverage.')
parser.add_argument('--summary_only', action='store_true',
help='Output only the depth_summary.txt file')

args = parser.parse_args()
read_filter_func = functools.partial(filter_read, args=args)

bam = pysam.AlignmentFile(args.bam)
ref_lengths = dict(zip(bam.references, bam.lengths))
Expand All @@ -76,7 +80,7 @@ def main():
prefix = os.path.splitext(os.path.basename(args.bam))[0]

region_str = '{}_{}_{}'.format(region.ref_name, region.start, region.end)
df = coverage_of_region(region, args.bam, args.stride, primary_only=args.primary_only)
df = coverage_of_region(region, args.bam, args.stride, read_filter_func)
summary[region_str] = df['depth'].describe()

if not args.summary_only:
Expand Down
40 changes: 40 additions & 0 deletions pomoxis/filter_bam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import argparse
import logging

import pysam

from pomoxis.util import filter_args, filter_read, parse_regions, Region


def main():
logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)
parser = argparse.ArgumentParser(
prog='filter_bam',
description='Filter a bam',
parents=[filter_args()],
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('bam',
help='input bam file.')
parser.add_argument('-o', '--output_bam', default='filtered.bam',
help='Output bam file.')
parser.add_argument('-r', '--region',
help='Only process given region.')

args = parser.parse_args()
logger = logging.getLogger('filter')

with pysam.AlignmentFile(args.bam) as bam:

if args.region is not None:
ref_lengths = dict(zip(bam.references, bam.lengths))
region = parse_regions([args.region], ref_lengths=ref_lengths)[0]
reads = bam.fetch(region.ref_name, region.start, region.end)
else:
reads = bam
out_bam = pysam.AlignmentFile(args.output_bam, "wb", header=bam.header)
for read in reads:
if filter_read(read, args, logger):
continue
out_bam.write(read)
out_bam.close()
pysam.index(args.output_bam)
151 changes: 3 additions & 148 deletions pomoxis/stats_from_bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import pysam

from pomoxis.util import intervaltrees_from_bed
from pomoxis.util import intervaltrees_from_bed, masked_stats_from_aligned_read, stats_from_aligned_read

parser = argparse.ArgumentParser(
prog='stats_from_bam',
Expand All @@ -32,151 +32,6 @@
help='Number of threads for parallel processing.')


def stats_from_aligned_read(read, references, lengths):
"""Create summary information for an aligned read
:param read: :class:`pysam.AlignedSegment` object
"""
try:
NM = read.get_tag('NM')
except:
raise IOError("Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.")

name = read.qname
start_offset = 0
if read.is_secondary or read.is_supplementary:
first_cig = read.cigartuples[0]
if first_cig[0] == 5: # should always be true for minimap2
start_offset = first_cig[1]
counts, _ = read.get_cigar_stats()
match = counts[0] + counts[7] + counts[8] # total of M, =, and X
ins = counts[1]
delt = counts[2]

lra_flag = False
if read.has_tag('NX'):
# likely from lra
# NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32
sub = counts[8]
lra_flag = True
else:
# likely from minimap2
# NM is edit distance: NM = INS + DEL + SUB
sub = NM - ins - delt

length = match + ins + delt
iden = 100 * float(match - sub) / match
acc = 100 * float(match - sub) / length

read_length = read.infer_read_length()
coverage = 100 * float(read.query_alignment_length) / read_length
direction = '-' if read.is_reverse else '+'

results = {
"name": name,
"coverage": coverage,
"direction": direction,
"length": length,
"read_length": read_length,
"match": match,
"ins": ins,
"del": delt,
"sub": sub,
"iden": iden,
"acc": acc,
"qstart": read.query_alignment_start + start_offset,
"qend": read.query_alignment_end + start_offset,
"rstart": read.reference_start,
"rend": read.reference_end,
"ref": references[read.reference_id],
"aligned_ref_len": read.reference_length,
"ref_coverage": 100*float(read.reference_length) / lengths[read.reference_id],
"mapq": read.mapping_quality,
}

return results, lra_flag


def masked_stats_from_aligned_read(read, references, lengths, tree):
"""Create summary information for an aligned read over regions in bed file.
:param read: :class:`pysam.AlignedSegment` object
"""
try:
MD = read.get_tag('MD')
except:
raise IOError("Read is missing required 'MD' tag. Try running 'samtools callmd - ref.fa'.")

correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0
pairs = read.get_aligned_pairs(with_seq=True)
qseq = read.query_sequence
pos_is_none = lambda x: (x[1] is None or x[0] is None)
pos = None
insertions = []
# TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)?
for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs):
if rp == read.reference_end or (qp == read.query_alignment_end):
break
pos = rp if rp is not None else pos
if not tree.has_overlap(pos, pos + 1) or (rp is None and not tree.has_overlap(pos + 1, pos + 2)):
# if rp is None, we are in an insertion, check if pos + 1 overlaps
# (ref position of ins is arbitrary)
# print('Skipping ref {}:{}'.format(read.reference_name, pos))
masked += 1
continue
else:
if rp is not None: # we don't have an insertion
aligned_ref_len += 1
if qp is None: # deletion
delt += 1
elif qseq[qp] == rb: # correct
correct += 1
elif qseq[qp] != rb: # sub
sub += 1
else: # we have an insertion
ins += 1

name = read.qname
match = correct + sub
length = match + ins + delt

if match == 0:
# no matches within bed regions - all bed ref positions were deleted.
# skip this alignment.
return None

iden = 100 * float(match - sub) / match
acc = 100 - 100 * float(sub + ins + delt) / length

read_length = read.infer_read_length()
masked_query_alignment_length = correct + sub + ins
coverage = 100*float(masked_query_alignment_length) / read_length
direction = '-' if read.is_reverse else '+'

results = {
"name": name,
"coverage": coverage,
"direction": direction,
"length": length,
"read_length": read_length,
"match": match,
"ins": ins,
"del": delt,
"sub": sub,
"iden": iden,
"acc": acc,
"qstart": read.query_alignment_start,
"qend": read.query_alignment_end,
"rstart": read.reference_start,
"rend": read.reference_end,
"ref": references[read.reference_id],
"aligned_ref_len": aligned_ref_len,
"ref_coverage": 100 * float(aligned_ref_len) / lengths[read.reference_id],
"mapq": read.mapping_quality,
"masked": masked,
}
return results


def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, bed_file=None):
start, stop = start_stop
Expand Down Expand Up @@ -205,12 +60,12 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
counts['masked'] += 1
continue
else:
result = masked_stats_from_aligned_read(read, bam.references, bam.lengths, trees[read.reference_name])
result = masked_stats_from_aligned_read(read, trees[read.reference_name])
if result is None: # no matches within bed regions
counts['all_matches_masked'] += 1
continue
else:
result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths)
result, lra_flag = stats_from_aligned_read(read)

if min_length is not None and result['length'] < min_length:
counts['short'] += 1
Expand Down
Loading

0 comments on commit d33dc2b

Please sign in to comment.