From b728ca5151a1f886b9173d0c39a983b18167abb0 Mon Sep 17 00:00:00 2001 From: Anthony Underwood Date: Sat, 8 May 2021 00:22:43 +0000 Subject: [PATCH] Correct vcf2psuedogenome and reference_to_single_sequence for multiple --- ...gle_sequence.py => reference2single_sequence.py} | 5 ++++- bin/vcf2pseudogenome.py | 13 +++++++------ modules/local/alignpseudogenomes.nf | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) rename bin/{reference_to_single_sequence.py => reference2single_sequence.py} (90%) diff --git a/bin/reference_to_single_sequence.py b/bin/reference2single_sequence.py similarity index 90% rename from bin/reference_to_single_sequence.py rename to bin/reference2single_sequence.py index 5d3282e..467b972 100755 --- a/bin/reference_to_single_sequence.py +++ b/bin/reference2single_sequence.py @@ -2,6 +2,7 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +import textwrap import argparse, sys, os class ParserWithErrors(argparse.ArgumentParser): @@ -33,7 +34,9 @@ def argparser(): def combine_sequences(reference_sequence): records = list(SeqIO.parse(reference_sequence, "fasta")) new_sequence = ''.join([str(record.seq) for record in records]) - new_id = '-'.join([record.id for record in records]) + new_id = '|'.join([record.id for record in records]) + if len(new_id) > 100: + new_id = textwrap.shorten(new_id, width=97, placeholder="...") new_record = SeqRecord(Seq(new_sequence), id = new_id, description = '') return(new_record) diff --git a/bin/vcf2pseudogenome.py b/bin/vcf2pseudogenome.py index 253af75..0d426c9 100755 --- a/bin/vcf2pseudogenome.py +++ b/bin/vcf2pseudogenome.py @@ -54,8 +54,6 @@ def filtered_bcf_to_fasta(filtered_bcf_file, reference_lengths): previous_positions[chrom] = 0 with VariantFile(filtered_bcf_file) as vcf_reader: for record in vcf_reader.fetch(): - if record.pos % 10000 == 0: - print(record.pos) record_chrom = record.chrom if record.pos == previous_positions[record_chrom]: # Insertion - remove previous character and add 'N' sequences[record_chrom].pop() # remove last position @@ -77,8 +75,11 @@ def filtered_bcf_to_fasta(filtered_bcf_file, reference_lengths): else: # if not PASS it's a low qual SNP so add N sequences[record_chrom].append('N') previous_positions[record_chrom] = record.pos - if previous_positions[record_chrom] != reference_lengths[record_chrom]: # if gap at the end - sequences[record_chrom].extend(calculate_gaps_to_add(previous_positions[record_chrom], reference_lengths[record_chrom])) + + # check for gaps at end + for chrom in sequences: + if len(sequences[chrom]) != reference_lengths[record_chrom]: # if gap at the end + sequences[chrom].extend(calculate_gaps_to_add(len(sequences[chrom]), reference_lengths[chrom])) return sequences def write_sequence(filepath, id, sequence): @@ -90,8 +91,8 @@ def write_sequence(filepath, id, sequence): parser = argparser() args = parser.parse_args() - reference_length = calculate_reference_lengths(args.reference_file) - pseudogenome_sequence_lists = filtered_bcf_to_fasta(args.filtered_bcf_file, reference_length) + reference_lengths = calculate_reference_lengths(args.reference_file) + pseudogenome_sequence_lists = filtered_bcf_to_fasta(args.filtered_bcf_file, reference_lengths) pseudogenome_sequences = [ ''.join(sequence_list) for sequence_list in pseudogenome_sequence_lists.values()] combined_pseudogenome_sequence = ''.join(sequence for sequence in pseudogenome_sequences) diff --git a/modules/local/alignpseudogenomes.nf b/modules/local/alignpseudogenomes.nf index 8548002..e421f3f 100644 --- a/modules/local/alignpseudogenomes.nf +++ b/modules/local/alignpseudogenomes.nf @@ -39,7 +39,7 @@ process ALIGNPSEUDOGENOMES { echo "\$pseudogenome\t\$fraction_non_GATC_bases" >> low_quality_pseudogenomes.tsv fi done - reference_to_single_sequence.py -r ${reference} -o final_reference.fas + reference2single_sequence.py -r ${reference} -o final_reference.fas cat final_reference.fas >> aligned_pseudogenomes.fas NUM_ALIGNMENT_GENOMES=\$(grep -c ">" aligned_pseudogenomes.fas)