Skip to content

Commit

Permalink
IMP: Anchor barcodes during demultiplexing (#58)
Browse files Browse the repository at this point in the history
  • Loading branch information
vaamb authored Dec 5, 2023
1 parent ba4bff0 commit c78541c
Show file tree
Hide file tree
Showing 3 changed files with 306 additions and 30 deletions.
81 changes: 58 additions & 23 deletions q2_cutadapt/_demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@ def run_command(cmd, verbose=True):

def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
untrimmed_dir_fmt, error_rate, minimum_length,
forward_cut=0, reverse_cut=0, cores=1):
forward_cut=0, reverse_cut=0,
anchor_forward=False, anchor_reverse=False,
cores=1):
cmd = ['cutadapt',
'--front', 'file:%s' % barcode_fhs['fwd'].name,
'-g',
f'{"^" if anchor_forward else ""}file:{barcode_fhs["fwd"].name}',
'--error-rate', str(error_rate),
'--minimum-length', str(minimum_length),
# {name} is a cutadapt convention for interpolating the sample id
Expand All @@ -57,7 +60,8 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
# Dual indices
cmd += [
'--pair-adapters',
'-G', 'file:%s' % barcode_fhs['rev'].name,
'-G',
f'{"^" if anchor_reverse else ""}file:{barcode_fhs["rev"].name}', # noqa: E501
]
cmd += [
'-p', os.path.join(str(per_sample_dir_fmt), '{name}.2.fastq.gz'),
Expand All @@ -66,7 +70,7 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
str(seqs_dir_fmt.forward_sequences.view(FastqGzFormat)),
str(seqs_dir_fmt.reverse_sequences.view(FastqGzFormat)),
'-U', str(reverse_cut),
]
]
else:
# SINGLE-END
cmd += [str(seqs_dir_fmt.file.view(FastqGzFormat))]
Expand Down Expand Up @@ -196,7 +200,7 @@ def _check_barcodes_uniqueness(

def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
error_tolerance, mux_fmt, batch_size, minimum_length, forward_cut,
reverse_cut, cores):
reverse_cut, anchor_forward, anchor_reverse, cores):
fwd_barcode_name = forward_barcodes.name
forward_barcodes = forward_barcodes.drop_missing_values()
barcodes = forward_barcodes.to_series().to_frame()
Expand All @@ -222,11 +226,11 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
open_fhs['rev'] = tempfile.NamedTemporaryFile()
_write_barcode_fasta(barcode_batch[rev_barcode_name],
open_fhs['rev'])
cmd = _build_demux_command(previous_untrimmed, open_fhs,
per_sample_sequences,
current_untrimmed, error_tolerance,
minimum_length, forward_cut, reverse_cut,
cores)
cmd = _build_demux_command(
previous_untrimmed, open_fhs, per_sample_sequences,
current_untrimmed, error_tolerance, minimum_length, forward_cut,
reverse_cut, anchor_forward, anchor_reverse, cores
)
run_command(cmd)
open_fhs['fwd'].close()
if reverse_barcodes is not None:
Expand All @@ -243,10 +247,11 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,

def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
barcodes: qiime2.CategoricalMetadataColumn,
cut: int = 0,
anchor_barcode: bool = False,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1,
cut: int = 0,
cores: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedSingleEndBarcodeInSequenceDirFmt):
Expand All @@ -255,17 +260,53 @@ def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt

untrimmed = _demux(
seqs, per_sample_sequences, barcodes, None, error_rate, mux_fmt,
batch_size, minimum_length, cut, 0, cores)
seqs, per_sample_sequences, barcodes, None, error_rate,
mux_fmt, batch_size, minimum_length, cut, 0, anchor_barcode, False,
cores)

return per_sample_sequences, untrimmed


def _check_paired_requirements(loc):
mixed_orientation = loc.get("mixed_orientation", None)
forward_cut = loc.get("forward_cut", 0)
reverse_cut = loc.get("reverse_cut", 0)
reverse_barcodes = loc.get("reverse_barcodes", None)
anchor_forward_barcode = loc.get("anchor_forward_barcode", False)
anchor_reverse_barcode = loc.get("anchor_reverse_barcode", False)

if (
not mixed_orientation
and anchor_reverse_barcode and (reverse_barcodes is None)
):
raise ValueError("A reverse barcode needs to be provided in order to "
"anchor the reverse barcode.")

if (
mixed_orientation
and forward_cut != reverse_cut
):
raise ValueError("'forward_cut' and 'reverse_cut' need to be set to "
"the same number when using the 'mixed_orientation' "
"mode.")

if (
mixed_orientation
and anchor_forward_barcode != anchor_reverse_barcode
):
raise ValueError(
"'anchor_forward_barcode' and 'anchor_reverse_barcode' need to be "
"set to the same value when using the 'mixed_orientation' mode."
)


def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
forward_barcodes: qiime2.CategoricalMetadataColumn,
reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
forward_cut: int = 0,
reverse_cut: int = 0,
anchor_forward_barcode: bool = False,
anchor_reverse_barcode: bool = False,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1,
Expand All @@ -275,22 +316,15 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt):
_check_barcodes_uniqueness(
forward_barcodes, reverse_barcodes, mixed_orientation)

if (
mixed_orientation
and forward_cut != reverse_cut
):
raise ValueError("'forward_cut' and 'reverse_cut' need to be set to "
"the same number when using the 'mixed_orientation' "
"mode")
_check_paired_requirements(locals())

per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt

untrimmed = _demux(
seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
error_rate, mux_fmt, batch_size, minimum_length, forward_cut,
reverse_cut, cores)
reverse_cut, anchor_forward_barcode, anchor_reverse_barcode, cores)

if mixed_orientation:
fwd = untrimmed.forward_sequences.view(FastqGzFormat)
Expand All @@ -305,6 +339,7 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
untrimmed = _demux(
remaining_seqs, per_sample_sequences, forward_barcodes,
reverse_barcodes, error_rate, mux_fmt, batch_size,
minimum_length, 0, 0, cores)
minimum_length, 0, 0, anchor_reverse_barcode,
anchor_forward_barcode, cores)

return per_sample_sequences, untrimmed
25 changes: 21 additions & 4 deletions q2_cutadapt/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@
'barcodes': MetadataColumn[Categorical],
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
'anchor_barcode': Bool,
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
'cut': Int,
Expand All @@ -283,6 +284,10 @@
'allowable error rate. The default value specified by '
'cutadapt is 0.1 (=10%), which is greater than '
'`demux emp-*`, which is 0.0 (=0%).',
'anchor_barcode': 'Anchor the barcode. The barcode is then '
'expected to occur in full length at the beginning '
'(5\' end) of the sequence. Can speed up '
'demultiplexing if used.',
'batch_size': 'The number of samples cutadapt demultiplexes '
'concurrently. Demultiplexing in smaller batches will '
'yield the same result with marginal speed loss, and '
Expand All @@ -292,11 +297,11 @@
'the cutadapt default of 0 has been overridden, '
'because that value produces empty sequence '
'records.',
'cut': 'Remove the specified number of bases from the sequences. Bases'
'are removed before demultiplexing. If a positive value is'
'provided, bases are removed from the beginning of the '
'cut': 'Remove the specified number of bases from the sequences. '
'Bases are removed before demultiplexing. If a positive value '
'is provided, bases are removed from the beginning of the '
'sequences. If a negative value is provided, bases are removed '
'from the end of the sequences',
'from the end of the sequences.',
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
Expand Down Expand Up @@ -324,6 +329,8 @@
'reverse_cut': Int,
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
'anchor_forward_barcode': Bool,
'anchor_reverse_barcode': Bool,
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
'mixed_orientation': Bool,
Expand Down Expand Up @@ -358,6 +365,16 @@
'the sequences. If --p-mixed-orientation is set, then '
'both --p-forward-cut and --p-reverse-cut must be '
'set to the same value.',
'anchor_forward_barcode': 'Anchor the forward barcode. The '
'barcode is then expected to occur in full '
'length at the beginning (5\' end) of the '
'forward sequence. Can speed up '
'demultiplexing if used.',
'anchor_reverse_barcode': 'Anchor the reverse barcode. The '
'barcode is then expected to occur in full '
'length at the beginning (5\' end) of the '
'reverse sequence. Can speed up '
'demultiplexing if used.',
'error_rate': 'The level of error tolerance, specified as the maximum '
'allowable error rate.',
'batch_size': 'The number of samples cutadapt demultiplexes '
Expand Down
Loading

0 comments on commit c78541c

Please sign in to comment.