Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Anchor barcodes during demultiplexing #58

Merged
merged 17 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 58 additions & 23 deletions q2_cutadapt/_demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@ def run_command(cmd, verbose=True):

def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
untrimmed_dir_fmt, error_rate, minimum_length,
forward_cut=0, reverse_cut=0, cores=1):
forward_cut=0, reverse_cut=0,
anchor_forward=False, anchor_reverse=False,
cores=1):
cmd = ['cutadapt',
'--front', 'file:%s' % barcode_fhs['fwd'].name,
'-g',
f'{"^" if anchor_forward else ""}file:{barcode_fhs["fwd"].name}',
'--error-rate', str(error_rate),
'--minimum-length', str(minimum_length),
# {name} is a cutadapt convention for interpolating the sample id
Expand All @@ -57,7 +60,8 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
# Dual indices
cmd += [
'--pair-adapters',
'-G', 'file:%s' % barcode_fhs['rev'].name,
'-G',
f'{"^" if anchor_reverse else ""}file:{barcode_fhs["rev"].name}', # noqa: E501
]
cmd += [
'-p', os.path.join(str(per_sample_dir_fmt), '{name}.2.fastq.gz'),
Expand All @@ -66,7 +70,7 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
str(seqs_dir_fmt.forward_sequences.view(FastqGzFormat)),
str(seqs_dir_fmt.reverse_sequences.view(FastqGzFormat)),
'-U', str(reverse_cut),
]
]
else:
# SINGLE-END
cmd += [str(seqs_dir_fmt.file.view(FastqGzFormat))]
Expand Down Expand Up @@ -196,7 +200,7 @@ def _check_barcodes_uniqueness(

def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
error_tolerance, mux_fmt, batch_size, minimum_length, forward_cut,
reverse_cut, cores):
reverse_cut, anchor_forward, anchor_reverse, cores):
fwd_barcode_name = forward_barcodes.name
forward_barcodes = forward_barcodes.drop_missing_values()
barcodes = forward_barcodes.to_series().to_frame()
Expand All @@ -222,11 +226,11 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
open_fhs['rev'] = tempfile.NamedTemporaryFile()
_write_barcode_fasta(barcode_batch[rev_barcode_name],
open_fhs['rev'])
cmd = _build_demux_command(previous_untrimmed, open_fhs,
per_sample_sequences,
current_untrimmed, error_tolerance,
minimum_length, forward_cut, reverse_cut,
cores)
cmd = _build_demux_command(
previous_untrimmed, open_fhs, per_sample_sequences,
current_untrimmed, error_tolerance, minimum_length, forward_cut,
reverse_cut, anchor_forward, anchor_reverse, cores
)
run_command(cmd)
open_fhs['fwd'].close()
if reverse_barcodes is not None:
Expand All @@ -243,10 +247,11 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,

def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
barcodes: qiime2.CategoricalMetadataColumn,
cut: int = 0,
anchor_barcode: bool = False,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1,
cut: int = 0,
cores: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedSingleEndBarcodeInSequenceDirFmt):
Expand All @@ -255,17 +260,53 @@ def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt

untrimmed = _demux(
seqs, per_sample_sequences, barcodes, None, error_rate, mux_fmt,
batch_size, minimum_length, cut, 0, cores)
seqs, per_sample_sequences, barcodes, None, error_rate,
mux_fmt, batch_size, minimum_length, cut, 0, anchor_barcode, False,
cores)

return per_sample_sequences, untrimmed


def _check_paired_requirements(loc):
mixed_orientation = loc.get("mixed_orientation", None)
forward_cut = loc.get("forward_cut", 0)
reverse_cut = loc.get("reverse_cut", 0)
reverse_barcodes = loc.get("reverse_barcodes", None)
anchor_forward_barcode = loc.get("anchor_forward_barcode", False)
anchor_reverse_barcode = loc.get("anchor_reverse_barcode", False)

if (
not mixed_orientation
and anchor_reverse_barcode and (reverse_barcodes is None)
):
raise ValueError("A reverse barcode needs to be provided in order to "
"anchor the reverse barcode.")

if (
mixed_orientation
and forward_cut != reverse_cut
):
raise ValueError("'forward_cut' and 'reverse_cut' need to be set to "
"the same number when using the 'mixed_orientation' "
"mode.")

if (
mixed_orientation
and anchor_forward_barcode != anchor_reverse_barcode
):
colinvwood marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError(
"'anchor_forward_barcode' and 'anchor_reverse_barcode' need to be "
"set to the same value when using the 'mixed_orientation' mode."
)


def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
forward_barcodes: qiime2.CategoricalMetadataColumn,
reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
forward_cut: int = 0,
reverse_cut: int = 0,
anchor_forward_barcode: bool = False,
anchor_reverse_barcode: bool = False,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1,
Expand All @@ -275,22 +316,15 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt):
_check_barcodes_uniqueness(
forward_barcodes, reverse_barcodes, mixed_orientation)

if (
mixed_orientation
and forward_cut != reverse_cut
):
raise ValueError("'forward_cut' and 'reverse_cut' need to be set to "
"the same number when using the 'mixed_orientation' "
"mode")
_check_paired_requirements(locals())

per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt

untrimmed = _demux(
seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
error_rate, mux_fmt, batch_size, minimum_length, forward_cut,
reverse_cut, cores)
reverse_cut, anchor_forward_barcode, anchor_reverse_barcode, cores)

if mixed_orientation:
fwd = untrimmed.forward_sequences.view(FastqGzFormat)
Expand All @@ -305,6 +339,7 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
untrimmed = _demux(
remaining_seqs, per_sample_sequences, forward_barcodes,
reverse_barcodes, error_rate, mux_fmt, batch_size,
minimum_length, 0, 0, cores)
minimum_length, 0, 0, anchor_reverse_barcode,
anchor_forward_barcode, cores)

return per_sample_sequences, untrimmed
25 changes: 21 additions & 4 deletions q2_cutadapt/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@
'barcodes': MetadataColumn[Categorical],
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
'anchor_barcode': Bool,
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
'cut': Int,
Expand All @@ -283,6 +284,10 @@
'allowable error rate. The default value specified by '
'cutadapt is 0.1 (=10%), which is greater than '
'`demux emp-*`, which is 0.0 (=0%).',
'anchor_barcode': 'Anchor the barcode. The barcode is then '
'expected to occur in full length at the beginning '
'(5\' end) of the sequence. Can speed up '
'demultiplexing if used.',
'batch_size': 'The number of samples cutadapt demultiplexes '
'concurrently. Demultiplexing in smaller batches will '
'yield the same result with marginal speed loss, and '
Expand All @@ -292,11 +297,11 @@
'the cutadapt default of 0 has been overridden, '
'because that value produces empty sequence '
'records.',
'cut': 'Remove the specified number of bases from the sequences. Bases'
'are removed before demultiplexing. If a positive value is'
'provided, bases are removed from the beginning of the '
'cut': 'Remove the specified number of bases from the sequences. '
'Bases are removed before demultiplexing. If a positive value '
'is provided, bases are removed from the beginning of the '
'sequences. If a negative value is provided, bases are removed '
'from the end of the sequences',
'from the end of the sequences.',
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
Expand Down Expand Up @@ -324,6 +329,8 @@
'reverse_cut': Int,
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
'anchor_forward_barcode': Bool,
'anchor_reverse_barcode': Bool,
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
'mixed_orientation': Bool,
Expand Down Expand Up @@ -358,6 +365,16 @@
'the sequences. If --p-mixed-orientation is set, then '
'both --p-forward-cut and --p-reverse-cut must be '
'set to the same value.',
'anchor_forward_barcode': 'Anchor the forward barcode. The '
'barcode is then expected to occur in full '
'length at the beginning (5\' end) of the '
'forward sequence. Can speed up '
'demultiplexing if used.',
'anchor_reverse_barcode': 'Anchor the reverse barcode. The '
'barcode is then expected to occur in full '
'length at the beginning (5\' end) of the '
'reverse sequence. Can speed up '
'demultiplexing if used.',
'error_rate': 'The level of error tolerance, specified as the maximum '
'allowable error rate.',
'batch_size': 'The number of samples cutadapt demultiplexes '
Expand Down
Loading