Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

biWFA it #288

Merged
merged 32 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
95c8014
Here is the one-line commit message for the changes:
ekg Oct 25, 2024
3ffcab8
feat: Add biWFA alignment logic to wflign.cpp
ekg Oct 25, 2024
d576563
fix: Move wfa_convex_penalties declaration earlier
ekg Oct 25, 2024
b9c2727
fix: Move wfa_convex_penalties definition earlier
ekg Oct 25, 2024
ba3f458
fix: move wfa_convex_penalties definition before first use
ekg Oct 25, 2024
00b27a1
fix: Set penalties for wfa convex in wflign.cpp
ekg Oct 25, 2024
3307042
fix: Remove duplicate declaration of `wfa_convex_penalties`
ekg Oct 27, 2024
ec4b7b9
fix: move declaration of wfa_convex_penalties before first use
ekg Oct 27, 2024
1bafd4e
okify the declaration of convex penalties
ekg Oct 28, 2024
1ba06a3
fix: Add missing `do_biwfa_alignment()` function implementation
ekg Oct 28, 2024
4278901
fix: Replace `write_sam_alignment` with `write_merged_alignment`
ekg Oct 28, 2024
ed9b5ab
fix: feat: Replace `write_sam_alignment` with `write_merged_alignment…
ekg Oct 28, 2024
c11e239
fix: reverse logic to make biWFA the default aligner
ekg Oct 28, 2024
69a2631
feat: Add option to force WFLign alignment
ekg Oct 28, 2024
9602bfd
feat: Add force_wflign variable to WFlign class
ekg Oct 28, 2024
beedda8
feat: Add force_wflign parameter to align::Parameters struct
ekg Oct 28, 2024
78154a3
fix: fix incorrect logic for choosing between WFlign and biWFA
ekg Oct 28, 2024
da5aa07
fix: Correct the logic for force_wflign in wflign.cpp
ekg Oct 28, 2024
75d52f8
line
ekg Oct 28, 2024
bf6f63b
match asm20/defaults for mm2 in biwfa params
ekg Oct 28, 2024
92660ca
fix: Set default chain gap to 2k
ekg Oct 28, 2024
a14e4b4
feat: update help text for chain-gap default value
ekg Oct 28, 2024
f8d9fa9
fix: Properly manage alignment_t objects in wflign
ekg Oct 28, 2024
907a16e
fix: Dereference alignment_t pointer before passing to write_alignmen…
ekg Oct 28, 2024
263321c
fix: Use biWFA for smaller sequences or very high identity matches
ekg Oct 28, 2024
62e6e9f
fix: Remove redundant alignment cleanup
ekg Oct 29, 2024
e1884c0
fix: Prevent double-free of alignments in write_merged_alignment()
ekg Oct 29, 2024
9017734
refactor: Use biWFA alignment directly instead of WFline
ekg Oct 30, 2024
f4a106e
feat: declare do_biwfa_alignment function in wflign.hpp
ekg Oct 30, 2024
eb3b735
fix: Comment out pt:Z:true and iv:Z:false tags in SAM and PAF output
ekg Oct 30, 2024
e63c51f
fix: Add missing semicolon in write_alignment_sam function
ekg Oct 30, 2024
2fbdef1
remove dup of biwfa alignment function
ekg Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/align/include/align_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct Parameters {
uint16_t wflambda_segment_length; //segment length for wflambda

bool force_biwfa_alignment; //force biwfa alignment
bool force_wflign; //force alignment with WFlign instead of the default biWFA

int wfa_mismatch_score;
int wfa_gap_opening_score;
Expand Down
60 changes: 19 additions & 41 deletions src/align/include/computeAlignments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,49 +242,19 @@ std::string processAlignment(seq_record_t* rec) {
skch::CommonFunc::reverseComplement(query_seq.data(), queryRegionStrand.data(), query_seq.size());
}

wflign::wavefront::WFlign wflign(
param.wflambda_segment_length,
param.min_identity,
param.force_biwfa_alignment,
param.wfa_mismatch_score,
param.wfa_gap_opening_score,
param.wfa_gap_extension_score,
param.wfa_patching_mismatch_score,
param.wfa_patching_gap_opening_score1,
param.wfa_patching_gap_extension_score1,
param.wfa_patching_gap_opening_score2,
param.wfa_patching_gap_extension_score2,
rec->currentRecord.mashmap_estimated_identity,
param.wflign_mismatch_score,
param.wflign_gap_opening_score,
param.wflign_gap_extension_score,
param.wflign_max_mash_dist,
param.wflign_min_wavefront_length,
param.wflign_max_distance_threshold,
param.wflign_max_len_major,
param.wflign_max_len_minor,
param.wflign_erode_k,
param.chain_gap,
param.wflign_min_inv_patch_len,
param.wflign_max_patching_score);
// Set up penalties for biWFA
wflign_penalties_t wfa_penalties;
wfa_penalties.match = 0;
wfa_penalties.mismatch = param.wfa_patching_mismatch_score;
wfa_penalties.gap_opening1 = param.wfa_patching_gap_opening_score1;
wfa_penalties.gap_extension1 = param.wfa_patching_gap_extension_score1;
wfa_penalties.gap_opening2 = param.wfa_patching_gap_opening_score2;
wfa_penalties.gap_extension2 = param.wfa_patching_gap_extension_score2;

std::stringstream output;
wflign.set_output(
&output,
#ifdef WFA_PNG_TSV_TIMING
!param.tsvOutputPrefix.empty(),
nullptr,
param.prefix_wavefront_plot_in_png,
param.wfplot_max_size,
!param.path_patching_info_in_tsv.empty(),
nullptr,
#endif
true, // merge alignments
param.emit_md_tag,
!param.sam_format,
param.no_seq_in_sam);

wflign.wflign_affine_wavefront(
// Do direct biWFA alignment
wflign::wavefront::do_biwfa_alignment(
rec->currentRecord.qId,
queryRegionStrand.data(),
rec->queryTotalLength,
Expand All @@ -295,7 +265,15 @@ std::string processAlignment(seq_record_t* rec) {
ref_seq_ptr,
rec->refTotalLength,
rec->currentRecord.rStartPos,
rec->currentRecord.rEndPos - rec->currentRecord.rStartPos);
rec->currentRecord.rEndPos - rec->currentRecord.rStartPos,
output,
wfa_penalties,
param.emit_md_tag,
!param.sam_format,
param.no_seq_in_sam,
param.min_identity,
param.wflign_max_len_minor,
rec->currentRecord.mashmap_estimated_identity);

return output.str();
}
Expand Down
103 changes: 103 additions & 0 deletions src/common/wflign/src/alignment_printer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#ifndef ALIGNMENT_PRINTER_HPP
#define ALIGNMENT_PRINTER_HPP

#include <string>
#include <sstream>
#include "wflign.hpp"

namespace wflign {
namespace wavefront {

void write_tag_and_md_string(
std::ostream &out,
const char *cigar_ops,
const int cigar_start,
const int cigar_end,
const int target_start,
const char *target,
const int64_t target_offset,
const int64_t target_pointer_shift);

void write_alignment_sam(
std::ostream &out,
const alignment_t& patch_aln,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
const uint64_t& query_length,
const bool& query_is_rev,
const std::string& target_name,
const uint64_t& target_total_length,
const uint64_t& target_offset,
const uint64_t& target_length,
const float& min_identity,
const float& mashmap_estimated_identity,
const bool& no_seq_in_sam,
const bool& emit_md_tag,
const char* query,
const char* target,
const int64_t& target_pointer_shift);

bool write_alignment_paf(
std::ostream& out,
const alignment_t& aln,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
const uint64_t& query_length,
const bool& query_is_rev,
const std::string& target_name,
const uint64_t& target_total_length,
const uint64_t& target_offset,
const uint64_t& target_length,
const float& min_identity,
const float& mashmap_estimated_identity,
const bool& with_endline = true,
const bool& is_rev_patch = false);

void write_merged_alignment(
std::ostream &out,
const std::vector<alignment_t *> &trace,
wfa::WFAlignerGapAffine2Pieces& wf_aligner,
const wflign_penalties_t& convex_penalties,
const bool& emit_md_tag,
const bool& paf_format_else_sam,
const bool& no_seq_in_sam,
const char* query,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
const uint64_t& query_length,
const bool& query_is_rev,
const char* target,
const std::string& target_name,
const uint64_t& target_total_length,
const uint64_t& target_offset,
const uint64_t& target_length,
const float& min_identity,
#ifdef WFA_PNG_TSV_TIMING
const long& elapsed_time_wflambda_ms,
const uint64_t& num_alignments,
const uint64_t& num_alignments_performed,
#endif
const float& mashmap_estimated_identity,
const uint64_t& wflign_max_len_major,
const uint64_t& wflign_max_len_minor,
const int& erode_k,
const int64_t& chain_gap,
const int& max_patching_score,
const uint64_t& min_inversion_length,
const int& min_wf_length,
const int& max_dist_threshold,
#ifdef WFA_PNG_TSV_TIMING
const std::string* prefix_wavefront_plot_in_png,
const uint64_t& wfplot_max_size,
const bool& emit_patching_tsv,
std::ostream* out_patching_tsv,
#endif
const bool& with_endline = true);

} // namespace wavefront
} // namespace wflign

#endif
155 changes: 130 additions & 25 deletions src/common/wflign/src/wflign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,104 @@ namespace wavefront {
/*
* Configuration
*/
#define MAX_LEN_FOR_STANDARD_WFA 1000
#define MIN_WF_LENGTH 256

void do_biwfa_alignment(
const std::string& query_name,
char* const query,
const uint64_t query_total_length,
const uint64_t query_offset,
const uint64_t query_length,
const bool query_is_rev,
const std::string& target_name,
char* const target,
const uint64_t target_total_length,
const uint64_t target_offset,
const uint64_t target_length,
std::ostream& out,
const wflign_penalties_t& penalties,
const bool emit_md_tag,
const bool paf_format_else_sam,
const bool no_seq_in_sam,
const float min_identity,
const uint64_t wflign_max_len_minor,
const float mashmap_estimated_identity) {

// Create WFA aligner with the provided penalties
wfa::WFAlignerGapAffine2Pieces wf_aligner(
0, // match
penalties.mismatch,
penalties.gap_opening1,
penalties.gap_extension1,
penalties.gap_opening2,
penalties.gap_extension2,
wfa::WFAligner::Alignment,
wfa::WFAligner::MemoryUltralow);

wf_aligner.setHeuristicNone();

// Perform the alignment
const int status = wf_aligner.alignEnd2End(target, (int)target_length, query, (int)query_length);

if (status == 0) { // WF_STATUS_SUCCESSFUL
// Create alignment record on stack
alignment_t aln;
aln.ok = true;
aln.j = 0;
aln.i = 0;
aln.query_length = query_length;
aln.target_length = target_length;
aln.is_rev = false;

// Copy alignment CIGAR
wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar);

// Write alignment
if (paf_format_else_sam) {
write_alignment_paf(
out,
aln,
query_name,
query_total_length,
query_offset,
query_length,
query_is_rev,
target_name,
target_total_length,
target_offset,
target_length,
min_identity,
mashmap_estimated_identity);
} else {
// Write SAM output directly
write_alignment_sam(
out,
aln,
query_name,
query_total_length,
query_offset,
query_length,
query_is_rev,
target_name,
target_total_length,
target_offset,
target_length,
min_identity,
mashmap_estimated_identity,
no_seq_in_sam,
emit_md_tag,
query,
target,
0); // No target pointer shift for biwfa
}
}
}

/*
* Configuration
*/
#define MAX_LEN_FOR_STANDARD_WFA 1000

/*
* Utils
*/
Expand All @@ -42,7 +137,7 @@ void clean_up_sketches(std::vector<std::vector<rkmh::hash_t>*> &sketches) {
WFlign::WFlign(
const uint16_t segment_length,
const float min_identity,
const bool force_biwfa_alignment,
const bool force_wflign_,
const int wfa_mismatch_score,
const int wfa_gap_opening_score,
const int wfa_gap_extension_score,
Expand All @@ -68,7 +163,7 @@ WFlign::WFlign(
this->segment_length = segment_length;
this->min_identity = min_identity;

this->force_biwfa_alignment = force_biwfa_alignment;
this->force_wflign = force_wflign_;

this->wfa_mismatch_score = wfa_mismatch_score;
this->wfa_gap_opening_score = wfa_gap_opening_score;
Expand Down Expand Up @@ -331,6 +426,35 @@ void WFlign::wflign_affine_wavefront(
return;
}

// Set penalties for wfa convex
wflign_penalties_t wfa_convex_penalties;
if (wfa_patching_mismatch_score > 0 && wfa_patching_gap_opening_score1 > 0 && wfa_patching_gap_extension_score1 > 0 && wfa_patching_gap_opening_score2 > 0 && wfa_patching_gap_extension_score2 > 0){
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = wfa_patching_mismatch_score;
wfa_convex_penalties.gap_opening1 = wfa_patching_gap_opening_score1;
wfa_convex_penalties.gap_extension1 = wfa_patching_gap_extension_score1;
wfa_convex_penalties.gap_opening2 = wfa_patching_gap_opening_score2;
wfa_convex_penalties.gap_extension2 = wfa_patching_gap_extension_score2;
} else {
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = 6;
wfa_convex_penalties.gap_opening1 = 6;
wfa_convex_penalties.gap_extension1 = 2;
wfa_convex_penalties.gap_opening2 = 26;
wfa_convex_penalties.gap_extension2 = 1;
}

// Use biWFA for smaller sequences or very high identity matches

if (!force_wflign && (query_length <= MAX_LEN_FOR_STANDARD_WFA || target_length <= MAX_LEN_FOR_STANDARD_WFA)) {
do_biwfa_alignment(
query_name, query, query_total_length, query_offset, query_length, query_is_rev,
target_name, target, target_total_length, target_offset, target_length,
*out, wfa_convex_penalties, emit_md_tag, paf_format_else_sam, no_seq_in_sam,
min_identity, wflign_max_len_minor, mashmap_estimated_identity);
return;
}

// Check if mashmap_estimated_identity == 1 to avoid division by zero, leading to a minhash_kmer_size of 8.
// Such low value was leading to confusion in HORs alignments in the human centromeres (high runtime and memory usage, and wrong alignments)
const int minhash_kmer_size = mashmap_estimated_identity == 1 ? 17 : std::max(8, std::min(17, (int)std::floor(1.0 / (1.0 - mashmap_estimated_identity))));
Expand Down Expand Up @@ -366,23 +490,6 @@ void WFlign::wflign_affine_wavefront(
*/
}

// Set penalties for wfa convex
wflign_penalties_t wfa_convex_penalties;
if (wfa_patching_mismatch_score > 0 && wfa_patching_gap_opening_score1 > 0 && wfa_patching_gap_extension_score1 > 0 && wfa_patching_gap_opening_score2 > 0 && wfa_patching_gap_extension_score2 > 0){
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = wfa_patching_mismatch_score;
wfa_convex_penalties.gap_opening1 = wfa_patching_gap_opening_score1;
wfa_convex_penalties.gap_extension1 = wfa_patching_gap_extension_score1;
wfa_convex_penalties.gap_opening2 = wfa_patching_gap_opening_score2;
wfa_convex_penalties.gap_extension2 = wfa_patching_gap_extension_score2;
} else {
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = 5;
wfa_convex_penalties.gap_opening1 = 8;
wfa_convex_penalties.gap_extension1 = 2;
wfa_convex_penalties.gap_opening2 = 49;
wfa_convex_penalties.gap_extension2 = 1;
}

// heuristic bound on the max mash dist, adaptive based on estimated
// identity the goal here is to sparsify the set of alignments in the
Expand Down Expand Up @@ -426,11 +533,8 @@ void WFlign::wflign_affine_wavefront(
#ifdef WFA_PNG_TSV_TIMING
const auto start_time = std::chrono::steady_clock::now();
#endif
if (force_biwfa_alignment ||
(query_length <= segment_length * 8 || target_length <= segment_length * 8) ||
(mashmap_estimated_identity >= 0.99
&& query_length <= MAX_LEN_FOR_STANDARD_WFA && target_length <= MAX_LEN_FOR_STANDARD_WFA)
) {

if (!force_wflign) {
wfa::WFAlignerGapAffine2Pieces* wf_aligner =
new wfa::WFAlignerGapAffine2Pieces(
0,
Expand Down Expand Up @@ -493,6 +597,7 @@ void WFlign::wflign_affine_wavefront(
std::chrono::steady_clock::now() - start_time).count();
#endif


// Free old aligner
delete wf_aligner;

Expand Down
Loading
Loading