Skip to content

Commit

Permalink
don't do truncation alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
colinvwood committed Dec 11, 2024
1 parent e174ffc commit 46ce373
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 90 deletions.
37 changes: 0 additions & 37 deletions q2_quality_filter/_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,39 +283,6 @@ def _is_retained(
return True


def _align_records(
    forward_record: FastqRecord, reverse_record: FastqRecord
) -> tuple[FastqRecord, FastqRecord]:
    '''
    Bring a forward/reverse record pair to a common sequence length.

    Whichever record has the longer sequence is passed to `_truncate`
    together with the length of the shorter record's sequence. Records whose
    sequences already have equal lengths are returned untouched. A record
    that fell below the minimum length fraction because of truncation is
    expected to have been dealt with upstream, so that case is not
    re-checked here.

    Parameters
    ----------
    forward_record : FastqRecord
        The record from the forward fastq file.
    reverse_record : FastqRecord
        The record from the reverse fastq file.

    Returns
    -------
    tuple[FastqRecord, FastqRecord]
        The forward and reverse records, in that order, after length
        alignment.
    '''
    forward_length = len(forward_record.sequence)
    reverse_length = len(reverse_record.sequence)

    if forward_length > reverse_length:
        # forward read is the longer one: cut it to the reverse read's length
        forward_record = _truncate(forward_record, reverse_length)
    elif reverse_length > forward_length:
        # reverse read is the longer one: cut it to the forward read's length
        reverse_record = _truncate(reverse_record, forward_length)

    return forward_record, reverse_record


def _write_record(fastq_record: FastqRecord, fh: gzip.GzipFile) -> None:
'''
Writes a fastq record to an open fastq file.
Expand Down Expand Up @@ -443,10 +410,6 @@ def q_score(
# if retained write to output file(s)
if retained:
if paired:
# align truncations if paired
forward_record, reverse_record = _align_records(
forward_record, reverse_record
)
_write_record(forward_record, forward_fh)
_write_record(reverse_record, reverse_fh)
else:
Expand Down
57 changes: 4 additions & 53 deletions q2_quality_filter/tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
RecordStatus,
_process_record,
_is_retained,
_align_records,
_write_record,
)
from q2_quality_filter._format import QualityFilterStatsFmt
Expand Down Expand Up @@ -336,51 +335,6 @@ def test_is_retained(self):
)
filtering_stats_df.iloc[:, :] = 0

def test_align_records(self):
    '''
    Check `_align_records` on equal-length and unequal-length record pairs.
    '''
    # case 1: both sequences are 8 bases long, so neither record changes
    forward_in = FastqRecord(b'@header', b'ATTCTGTA', b'+', b'MMLMLL++')
    reverse_in = FastqRecord(b'@header', b'TTAGCATC', b'+', b'+MM+MLM+')
    forward_out, reverse_out = _align_records(forward_in, reverse_in)
    self.assertEqual(forward_out, forward_in)
    self.assertEqual(reverse_out, reverse_in)

    # case 2: forward (8) is longer than reverse (6), so only the forward
    # record is shortened
    forward_in = FastqRecord(b'@header', b'ATTCTGTA', b'+', b'MMLMLL++')
    reverse_in = FastqRecord(b'@header', b'TTAGCA', b'+', b'+MM+ML')
    forward_out, reverse_out = _align_records(forward_in, reverse_in)
    forward_expected = FastqRecord(b'@header', b'ATTCTG', b'+', b'MMLMLL')
    self.assertEqual(forward_out, forward_expected)
    self.assertEqual(reverse_out, reverse_in)

    # case 3: reverse (8) is longer than forward (4), so only the reverse
    # record is shortened
    forward_in = FastqRecord(b'@header', b'ATTC', b'+', b'MMLM')
    reverse_in = FastqRecord(b'@header', b'TTAGCATC', b'+', b'+MM+MLM+')
    forward_out, reverse_out = _align_records(forward_in, reverse_in)
    reverse_expected = FastqRecord(b'@header', b'TTAG', b'+', b'+MM+')
    self.assertEqual(forward_out, forward_in)
    self.assertEqual(reverse_out, reverse_expected)

def test_write_record(self):
fastq_record = FastqRecord(
b'@header', b'ATTCTGTA', b'+', b'MMLMLL++'
Expand Down Expand Up @@ -692,9 +646,6 @@ def _assert_records_match(self, manifest_df: pd.DataFrame):
self.assertEqual(
self._get_header_diff(forward_record, reverse_record), 1
)
self.assertEqual(
len(forward_record.sequence), len(reverse_record.sequence)
)

def test_paired_end_sequences(self):
demux_artifact = Artifact.import_data(
Expand All @@ -714,10 +665,10 @@ def test_paired_end_sequences(self):
)
demux_manifest_df = output_demux_format.manifest.view(pd.DataFrame)

# corresponding records should be same length and have matching headers
# corresponding records should have matching headers
self._assert_records_match(demux_manifest_df)

# "Human-Kneecap2_S2" is dropped because the R2 reads have low q scores
# "Human-Kneecap2_S2" is dropped because the R1 reads have low q scores
exp_sample_ids = ['Human-Kneecap', 'Human-Kneecap3']
self.assertEqual(
set(demux_manifest_df.index), set(exp_sample_ids)
Expand Down Expand Up @@ -747,9 +698,9 @@ def test_paired_end_sequences(self):
sample1_reverse_exp = [
# first record dropped because of R2 scores
b'@M00899:113:000000000-A5K20:1:1101:25454:3578 2:N:0:2',
b'GACTACCGGGGTATCTAATCCTGTTCGATACCCGCACCTTCGAGCTTCAGCGTCAGTTGCG',
b'GACTACCGGGGTATCTAATCCTGTTCGATACCCGCACCTTCGAGCTTCAGCGTCAGTTGCGCTCCCGTCAGCTGC', # noqa
b'+',
b'CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGG',
b'CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG', # noqa
b'@M00899:113:000000000-A5K20:1:1101:25177:3605 2:N:0:2',
b'GACTACTGGGGTATCTAATCCTGTTTGATACCCGCACCTTCGAGCTTAAGCGTCAGTTGCGCTCCCGTCAGCTGC', # noqa
b'+',
Expand Down

0 comments on commit 46ce373

Please sign in to comment.