Skip to content

Commit

Permalink
Merge branch 'waveygang:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
quim0 authored Feb 4, 2025
2 parents 180dc14 + e800c3d commit 92380e6
Show file tree
Hide file tree
Showing 20 changed files with 3,180 additions and 2,061 deletions.
32 changes: 7 additions & 25 deletions .github/workflows/test_on_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,34 +32,16 @@ jobs:
samtools
libjemalloc-dev
bedtools
cargo
- name: Install wgatools
run: cargo install --git https://github.com/wjwei-handsome/wgatools.git
- name: Install pafcheck
run: cargo install --git https://github.com/ekg/pafcheck.git
- name: Init and update submodules
run: git submodule update --init --recursive
- name: Build wfmash
run: cmake -H. -Bbuild -D CMAKE_BUILD_TYPE=Debug -DWFA_PNG_AND_TSV=ON && cmake --build build -- -j 2
- name: Run cmake tests (exclude lengthy ones)
- name: Run cmake tests
run: |
cd build
ctest -E all2all
- name: Install Rust and Cargo
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- name: Install wgatools
run: cargo install --git https://github.com/wjwei-handsome/wgatools.git
- name: Install wgatools
run: cargo install --git https://github.com/ekg/pafcheck.git
- name: Run wfmash and generate PAF
run: build/bin/wfmash -t 8 -T SGDref -Q S288C -Y '#' data/scerevisiae8.fa.gz > test.paf
- name: check PAF coordinates and extended CIGAR validity
run: pafcheck --query-fasta data/scerevisiae8.fa.gz --paf test.paf
- name: Convert PAF to MAF using wgatools
run: wgatools paf2maf --target data/scerevisiae8.fa.gz --query data/scerevisiae8.fa.gz test.paf > test.maf
- name: Check if MAF file is not empty
run: |
if [ -s test.maf ]; then
echo "MAF file is not empty. Test passed."
else
echo "MAF file is empty. Test failed."
exit 1
fi
ctest --output-on-failure
24 changes: 5 additions & 19 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -258,31 +258,17 @@ add_test(

add_test(
NAME wfmash-mapping-coverage-with-8-yeast-genomes-to-PAF
COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y \\# > scerevisiae8.paf && ./scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92 && head -3000 scerevisiae8.paf > scerevisiae8.paf.output && ${CMAKE_COMMAND} -E compare_files ${REGRESSION_TEST_DIR}/scerevisiae8.paf.output scerevisiae8.paf.output"
COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y \\# > scerevisiae8.paf && ./scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

add_test(
NAME wfmash-short-reads-500bps-to-SAM
COMMAND bash -c "${INVOKE} data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head > wfmash-short-reads-500bps-to-SAM.output && ${CMAKE_COMMAND} -E compare_files ${REGRESSION_TEST_DIR}/wfmash-short-reads-500bps-to-SAM.output wfmash-short-reads-500bps-to-SAM.output"
NAME wfmash-pafcheck-yeast
COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -t 16 -T S288C -Q Y12 > x.paf && pafcheck --query-fasta data/scerevisiae8.fa.gz --target-fasta data/scerevisiae8.fa.gz --paf x.paf"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

add_test(
NAME wfmash-short-reads-255bps-to-PAF
COMMAND bash -c "${INVOKE} data/reads.255bps.fa.gz -w 16 -s 100 -L > reads.255bps.paf && head reads.255bps.paf && ${CMAKE_COMMAND} -E compare_files ${REGRESSION_TEST_DIR}/reads.255bps.paf reads.255bps.paf"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

add_test(
NAME wfmash-input-mapping
COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -T S288C -Q SK1 -m > mappings.paf && \
${INVOKE} data/scerevisiae8.fa.gz -i mappings.paf|sort > aligned.paf.output && \
${CMAKE_COMMAND} -E compare_files ${REGRESSION_TEST_DIR}/aligned.paf.output aligned.paf.output"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

add_test(
NAME wfmash-all2all
COMMAND bash -c "${INVOKE} -t 8 data/scerevisiae8.fa.gz > all2all.paf && \
sort all2all.paf | head -300 > all2all-300.paf.output && \
${CMAKE_COMMAND} -E compare_files ${REGRESSION_TEST_DIR}/all2all-300.paf.output all2all-300.paf.output"
NAME wfmash-maf-validity
COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -t 16 -T S288C -Q Y12 > x.paf && wgatools paf2maf --target data/scerevisiae8.fa.gz --query data/scerevisiae8.fa.gz x.paf > x.maf && test $(wgatools stat x.maf | cut -f 7 | awk '{ s+=$1 } END { print s }') -gt 11000000"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

install(TARGETS wfmash DESTINATION bin)
Expand Down
1 change: 1 addition & 0 deletions src/align/include/align_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct Parameters {
bool sam_format; //Emit the output in SAM format (PAF default)
bool no_seq_in_sam; //Do not fill the SEQ field in SAM format
bool multithread_fasta_input; //Multithreaded fasta input
uint64_t target_padding; //Additional padding around target sequence

#ifdef WFA_PNG_TSV_TIMING
// plotting
Expand Down
5 changes: 5 additions & 0 deletions src/align/include/align_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ namespace align
skch::offset_t rEndPos; //mapping boundary end offset on ref
skch::strand_t strand; //mapping strand
float mashmap_estimated_identity;

// Chain metadata
int32_t chain_id{-1}; // Unique ID for this chain (-1 if not part of chain)
int32_t chain_length{1}; // Total segments in chain (1 if not part of chain)
int32_t chain_pos{1}; // Position in chain, 1-based (1 if not part of chain)
};

typedef std::unordered_map <std::string, std::string> refSequenceMap_t;
Expand Down
111 changes: 101 additions & 10 deletions src/align/include/computeAlignments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ typedef atomic_queue::AtomicQueue<std::string*, 1024, nullptr, true, true, false
/**
* @brief compute alignments
*/

void compute()
{
this->computeAlignments();
Expand All @@ -149,7 +150,7 @@ typedef atomic_queue::AtomicQueue<std::string*, 1024, nullptr, true, true, false
* @param[in] mappingRecordLine
* @param[out] currentRecord
*/
inline static void parseMashmapRow(const std::string &mappingRecordLine, MappingBoundaryRow &currentRecord) {
inline static void parseMashmapRow(const std::string &mappingRecordLine, MappingBoundaryRow &currentRecord, const uint64_t target_padding) {
std::stringstream ss(mappingRecordLine); // Insert the string into a stream
std::string word; // Have a buffer string

Expand All @@ -169,15 +170,63 @@ typedef atomic_queue::AtomicQueue<std::string*, 1024, nullptr, true, true, false
// if the estimated identity is missing, avoid assuming too low values
const float mm_id = wfmash::is_a_number(mm_id_vec.back()) ? std::stof(mm_id_vec.back()) : skch::fixed::percentage_identity;

// Parse chain info if present (expecting format "chain:i:id.pos.len" in tokens[14])
int32_t chain_id = -1;
int32_t chain_length = 1;
int32_t chain_pos = 1;
if (tokens.size() > 14) {
const vector<string> chain_vec = skch::CommonFunc::split(tokens[14], ':');
if (chain_vec.size() == 3 && chain_vec[0] == "chain" && chain_vec[1] == "i") {
// Split the id.pos.len format
const vector<string> chain_parts = skch::CommonFunc::split(chain_vec[2], '.');
if (chain_parts.size() == 3) {
chain_id = std::stoi(chain_parts[0]);
chain_pos = std::stoi(chain_parts[1]);
chain_length = std::stoi(chain_parts[2]);
}
}
}

// Save words into currentRecord
{
currentRecord.qId = tokens[0];
currentRecord.qStartPos = std::stoi(tokens[2]);
currentRecord.qEndPos = std::stoi(tokens[3]);
currentRecord.strand = (tokens[4] == "+" ? skch::strnd::FWD : skch::strnd::REV);
currentRecord.refId = tokens[5];
currentRecord.rStartPos = std::stoi(tokens[7]);
currentRecord.rEndPos = std::stoi(tokens[8]);
const uint64_t ref_len = std::stoi(tokens[6]);
currentRecord.chain_id = chain_id;
currentRecord.chain_length = chain_length;
currentRecord.chain_pos = chain_pos;

// Apply target padding while ensuring we don't go below 0 or above reference length
uint64_t rStartPos = std::stoi(tokens[7]);
uint64_t rEndPos = std::stoi(tokens[8]);

// Always apply target padding
if (target_padding > 0) {
if (rStartPos >= target_padding) {
rStartPos -= target_padding;
} else {
rStartPos = 0;
}
if (rEndPos + target_padding <= ref_len) {
rEndPos += target_padding;
} else {
rEndPos = ref_len;
}
}

// Validate coordinates against reference length
if (rStartPos >= ref_len || rEndPos > ref_len) {
std::cerr << "[parse-debug] ERROR: Coordinates exceed reference length!" << std::endl;
throw std::runtime_error("[wfmash::align::parseMashmapRow] Error! Coordinates exceed reference length: "
+ std::to_string(rStartPos) + "-" + std::to_string(rEndPos)
+ " (ref_len=" + std::to_string(ref_len) + ")");
}

currentRecord.rStartPos = rStartPos;
currentRecord.rEndPos = rEndPos;
currentRecord.mashmap_estimated_identity = mm_id;
}
}
Expand All @@ -193,7 +242,7 @@ seq_record_t* createSeqRecord(const MappingBoundaryRow& currentRecord,
// Get the query sequence length
const int64_t query_size = faidx_seq_len(query_faidx, currentRecord.qId.c_str());

// Compute padding
// Compute padding for sequence extraction
const uint64_t head_padding = currentRecord.rStartPos >= param.wflign_max_len_minor
? param.wflign_max_len_minor : currentRecord.rStartPos;
const uint64_t tail_padding = ref_size - currentRecord.rEndPos >= param.wflign_max_len_minor
Expand Down Expand Up @@ -273,7 +322,10 @@ std::string processAlignment(seq_record_t* rec) {
param.no_seq_in_sam,
param.min_identity,
param.wflign_max_len_minor,
rec->currentRecord.mashmap_estimated_identity);
rec->currentRecord.mashmap_estimated_identity,
rec->currentRecord.chain_id,
rec->currentRecord.chain_length,
rec->currentRecord.chain_pos);

return output.str();
}
Expand Down Expand Up @@ -310,7 +362,7 @@ void processor_thread(std::atomic<size_t>& total_alignments_queued,
std::string* line_ptr = nullptr;
if (line_queue.try_pop(line_ptr)) {
MappingBoundaryRow currentRecord;
parseMashmapRow(*line_ptr, currentRecord);
parseMashmapRow(*line_ptr, currentRecord, param.target_padding);

// Process the record and create seq_record_t
seq_record_t* rec = createSeqRecord(currentRecord, *line_ptr, local_ref_faidx, local_query_faidx);
Expand Down Expand Up @@ -408,9 +460,48 @@ void worker_thread(uint64_t tid,
if (seq_queue.try_pop(rec)) {
is_working.store(true);
std::string alignment_output = processAlignment(rec);

// Push the alignment output to the paf_queue
paf_queue.push(new std::string(std::move(alignment_output)));

// Parse the alignment output to find CIGAR string and coordinates
std::stringstream ss(alignment_output);
std::string line;
while (std::getline(ss, line)) {
if (line.empty()) continue;

std::vector<std::string> fields;
std::stringstream field_ss(line);
std::string field;
while (field_ss >> field) {
fields.push_back(field);
}

// Find the CIGAR string field (should be after cg:Z:)
auto cigar_it = std::find_if(fields.begin(), fields.end(),
[](const std::string& s) { return s.substr(0, 5) == "cg:Z:"; });

if (cigar_it != fields.end()) {
std::string cigar = cigar_it->substr(5); // Remove cg:Z: prefix
uint64_t ref_start = std::stoull(fields[7]);
uint64_t ref_end = std::stoull(fields[8]);


// Just pass through the CIGAR string and coordinates unchanged
// The trimming is now handled in wflign namespace

// Reconstruct the line
std::string new_line;
for (const auto& f : fields) {
if (!new_line.empty()) new_line += '\t';
new_line += f;
}
new_line += '\n';

// Push the modified alignment output to the paf_queue
paf_queue.push(new std::string(std::move(new_line)));
} else {
// If no CIGAR string found, output the line unchanged
paf_queue.push(new std::string(line + '\n'));
}
}

// Update progress meter and processed alignment length
uint64_t alignment_length = rec->currentRecord.qEndPos - rec->currentRecord.qStartPos;
Expand Down Expand Up @@ -514,7 +605,7 @@ void computeAlignments() {

while(std::getline(mappingListStream, mappingRecordLine)) {
if (!mappingRecordLine.empty()) {
parseMashmapRow(mappingRecordLine, currentRecord);
parseMashmapRow(mappingRecordLine, currentRecord, param.target_padding);
total_alignment_length += currentRecord.qEndPos - currentRecord.qStartPos;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/common/wflign/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ set(wflign_SOURCE
src/wflign.cpp
src/wflign_alignment.cpp
src/wflign_patch.cpp
src/wflign_swizzle.cpp
src/rkmh.cpp
src/murmur3.cpp
)
Expand Down
2 changes: 2 additions & 0 deletions src/common/wflign/src/alignment_printer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ void write_tag_and_md_string(
void write_alignment_sam(
std::ostream &out,
const alignment_t& patch_aln,
const std::string& cigar_str,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
Expand All @@ -41,6 +42,7 @@ void write_alignment_sam(
bool write_alignment_paf(
std::ostream& out,
const alignment_t& aln,
const std::string& cigar_str,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
Expand Down
41 changes: 38 additions & 3 deletions src/common/wflign/src/wflign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "wflign.hpp"
#include "wflign_patch.hpp"
#include "wflign_swizzle.hpp"


// Namespaces
Expand Down Expand Up @@ -34,7 +35,10 @@ void do_biwfa_alignment(
const bool no_seq_in_sam,
const float min_identity,
const uint64_t wflign_max_len_minor,
const float mashmap_estimated_identity) {
const float mashmap_estimated_identity,
const int32_t chain_id,
const int32_t chain_length,
const int32_t chain_pos) {

// Create WFA aligner with the provided penalties
wfa::WFAlignerGapAffine2Pieces wf_aligner(
Expand All @@ -49,6 +53,7 @@ void do_biwfa_alignment(

wf_aligner.setHeuristicNone();


// Perform the alignment
const int status = wf_aligner.alignEnd2End(target, (int)target_length, query, (int)query_length);

Expand All @@ -64,12 +69,39 @@ void do_biwfa_alignment(

// Copy alignment CIGAR
wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar);


// Convert WFA CIGAR to string format for potential swizzling
std::string cigar_str = wfa_edit_cigar_to_string(aln.edit_cigar);
// Try swizzling the CIGAR at both ends with debug enabled

std::string swizzled = try_swap_start_pattern(cigar_str, query, target, 0, 0);
if (swizzled != cigar_str) {
cigar_str = swizzled;
} else {
}

swizzled = try_swap_end_pattern(cigar_str, query, target, 0, 0);
if (swizzled != cigar_str) {
cigar_str = swizzled;
} else {
}

// If the CIGAR changed, update coordinates and alignment
//if (cigar_str != wfa_edit_cigar_to_string(aln.edit_cigar)) {
// Update coordinates based on swizzled CIGAR
//auto new_coords = alignment_end_coords(cigar_str, query_offset, target_offset);

// Convert back to WFA format
//wfa_string_to_edit_cigar(cigar_str, &aln.edit_cigar);
//}

//wfa_string_to_edit_cigar(cigar_str, &aln.edit_cigar);
// Write alignment
if (paf_format_else_sam) {
write_alignment_paf(
out,
aln,
cigar_str,
query_name,
query_total_length,
query_offset,
Expand All @@ -86,6 +118,7 @@ void do_biwfa_alignment(
write_alignment_sam(
out,
aln,
cigar_str,
query_name,
query_total_length,
query_offset,
Expand Down Expand Up @@ -451,7 +484,8 @@ void WFlign::wflign_affine_wavefront(
query_name, query, query_total_length, query_offset, query_length, query_is_rev,
target_name, target, target_total_length, target_offset, target_length,
*out, wfa_convex_penalties, emit_md_tag, paf_format_else_sam, no_seq_in_sam,
min_identity, wflign_max_len_minor, mashmap_estimated_identity);
min_identity, wflign_max_len_minor, mashmap_estimated_identity,
-1, 1, 1); // Not part of a chain when using direct biWFA
return;
}

Expand Down Expand Up @@ -1134,6 +1168,7 @@ void WFlign::wflign_affine_wavefront(
write_alignment_paf(
*out,
**x,
"",
query_name,
query_total_length,
query_offset,
Expand Down
Loading

0 comments on commit 92380e6

Please sign in to comment.