Merge pull request #290 from waveygang/flagtastrophe

Flagtastrophe
waveygang · Nov 8, 2024 · c8985d3 · c8985d3
2 parents b160f53 + 0e123bd
commit c8985d3
Show file tree

Hide file tree

Showing 12 changed files with 421 additions and 414 deletions.
diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml
@@ -46,6 +46,20 @@ jobs:
         run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head
       - name: Test mapping+alignment with short reads (255bps) (PAF output)
         run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reads.255bps.fa.gz -w 16 -s 100 -L > reads.255bps.paf && head reads.255bps.paf
+      - name: Test input mapping functionality
+        run: |
+          # First generate mappings
+          ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -T S288C -Q SK1 -m >mappings.paf
+          # Then align using the mappings
+          ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -i mappings.paf > aligned.paf
+          # Count lines in alignment file
+          ALIGN_LINES=$(wc -l < aligned.paf)
+          if [ $ALIGN_LINES -eq 0 ]; then
+            echo "ERROR: Alignment file is empty"
+            exit 1
+          else
+            echo "Found $ALIGN_LINES alignments"
+          fi
       - name: Install Rust and Cargo
         uses: actions-rs/toolchain@v1
         with:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -189,4 +189,4 @@ install(TARGETS wfa2_static
         PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/include)
-execute_process(COMMAND bash ${CMAKE_SOURCE_DIR}/scripts/generate_git_version.sh ${CMAKE_SOURCE_DIR}/src)
+execute_process(COMMAND bash ${CMAKE_SOURCE_DIR}/scripts/generate_git_version.sh ${CMAKE_SOURCE_DIR}/src ${CMAKE_SOURCE_DIR}/src/common/wflign/src)
diff --git a/README.md b/README.md
@@ -6,29 +6,17 @@ _**a pangenome-scale aligner**_
 [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](https://anaconda.org/bioconda/wfmash)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6949373.svg)](https://doi.org/10.5281/zenodo.6949373)
 
-`wfmash` is an aligner for pangenomes based on sparse homology mapping and wavefront inception.
+`wfmash` is an aligner for pangenomes that combines efficient homology mapping with base-level alignment. It uses MashMap 3.5 to find approximate mappings between sequences, then applies WFA (Wave Front Alignment) to obtain base-level alignments.
 
-`wfmash` uses a variant of [MashMap](https://github.com/marbl/MashMap) to find large-scale sequence homologies.
-It then obtains base-level alignments using [WFA](https://github.com/smarco/WFA2-lib), via the [`wflign`](https://github.com/waveygang/wfmash/tree/master/src/common/wflign) hierarchical wavefront alignment algorithm.
+`wfmash` is designed to make whole genome alignment easy. On a modest compute node, whole genome alignments of gigabase-scale genomes should take minutes to hours, depending on sequence divergence. It can handle high sequence divergence, with average nucleotide identity between input sequences as low as 70%.
 
-`wfmash` is designed to make whole genome alignment easy. On a modest compute node, whole genome alignments of gigabase-scale genomes should take minutes to hours, depending on sequence divergence.
-It can handle high sequence divergence, with average nucleotide identity between input sequences as low as 70%.
+`wfmash` is the key algorithm in [`pggb`](https://github.com/pangenome/pggb) (the PanGenome Graph Builder), where it is applied to make an all-to-all alignment of input genomes that defines the base structure of the pangenome graph. It can scale to support the all-to-all alignment of hundreds of human genomes.
 
-`wfmash` is the key algorithm in [`pggb`](https://github.com/pangenome/pggb) (the PanGenome Graph Builder), where it is applied to make an all-to-all alignment of input genomes that defines the base structure of the pangenome graph.
-It can scale to support the all-to-all alignment of hundreds of human genomes.
+## Process
 
-## process
+By default, `wfmash` breaks query sequences into non-overlapping segments (default: 1kb) and maps them using MashMap. Consecutive mappings separated by less than the chain gap (default: 2kb) are merged. Mappings are limited to 50kb in length by default, which allows efficient base-level alignment using WFA. This length limit is important because WFA's computational complexity is quadratic in the number of differences between sequences, not their percent divergence - meaning longer sequences with the same divergence percentage require dramatically more compute time.
 
-Each query sequence is broken into non-overlapping pieces defined by `-s[N], --segment-length=[N]`.
-These segments are then mapped using MashMap's mapping algorithm.
-Unlike MashMap, `wfmash` merges aggressively across large gaps, finding the best neighboring segment up to `-c[N], --chain-gap=[N]` base-pairs away.
-
-Each mapping location is then used as a target for alignment using the wavefront inception algorithm in `wflign`.
-The resulting alignments always contain extended CIGARs in the `cg:Z:*` tag.
-Approximate mappings can be obtained with `-m, --approx-map`.
-
-Sketching, mapping, and alignment are all run in parallel using a configurable number of threads.
-The number of threads must be set manually, using `-t`, and defaults to 1.
+For longer sequences, use `-m/--approx-mapping` to get approximate mappings only, which allows working with much larger segment and mapping lengths.
 
 ## usage
 
@@ -85,10 +73,10 @@ Map a set of query sequences against a reference genome:
 wfmash reference.fa query.fa >aln.paf
 ```
 
-Setting a longer segment length forces the alignments to be more collinear:
+For mapping longer sequences without alignment, use -m with larger segment and max length values:
 
 ```sh
-wfmash -s 20k reference.fa query.fa >aln.paf
+wfmash -m -s 50k -P 500k reference.fa query.fa >mappings.paf
 ```
 
 Self-mapping of sequences:

diff --git a/scripts/generate_git_version.sh b/scripts/generate_git_version.sh
@@ -1,9 +1,14 @@
 INC_DIR=$1
+WFLIGN_DIR=$2
 
 # Go to the directory where the script is
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd "$SCRIPT_DIR"
 
 GIT_VERSION=$(git describe --always --tags --long)
 
+# Write main wfmash version header
 echo "#define WFMASH_GIT_VERSION" \"$GIT_VERSION\" > "$INC_DIR"/wfmash_git_version.hpp
+
+# Write wflign version header 
+echo "#define WFLIGN_GIT_VERSION" \"$GIT_VERSION\" > "$WFLIGN_DIR"/wflign_git_version.hpp
diff --git a/src/align/include/computeAlignments.hpp b/src/align/include/computeAlignments.hpp
@@ -453,7 +453,7 @@ void write_sam_header(std::ofstream& outstream) {
                 });
         }
     }
-    outstream << "@PG\tID:wfmash\tPN:wfmash\tVN:0.1\tCL:wfmash\n";
+    outstream << "@PG\tID:wfmash\tPN:wfmash\tVN:" << WFMASH_GIT_VERSION << "\tCL:wfmash\n";
 }
 
 void writer_thread(const std::string& output_file,

diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp
@@ -5,6 +5,7 @@
 #include <atomic_image.hpp>
 #include "rkmh.hpp"
 #include "wflign_patch.hpp"
+#include "wflign_git_version.hpp"
 
 namespace wflign {
 
@@ -1939,7 +1940,8 @@ query_start : query_end)
             out << "\t" << "cg:Z:" << cigarv << "\n";
 #endif
         } else {
-            out << query_name                          // Query template NAME
+            out << "@PG\tID:wfmash\tPN:wfmash\tVN:" << WFLIGN_GIT_VERSION << "\tCL:wfmash\n"
+                << query_name                          // Query template NAME
                 << "\t" << (query_is_rev ? "16" : "0") // bitwise FLAG
                 << "\t" << target_name // Reference sequence NAME
                 << "\t"

diff --git a/src/interface/main.cpp b/src/interface/main.cpp
@@ -52,19 +52,18 @@ int main(int argc, char** argv) {
         auto t0 = skch::Time::now();
 
         if (map_parameters.use_spaced_seeds) {
-          std::cerr << "[wfmash::map] Generating spaced seeds" << std::endl;
+          std::cerr << "[wfmash::mashmap] Generating spaced seeds..." << std::endl;
           uint32_t seed_weight = map_parameters.spaced_seed_params.weight;
           uint32_t seed_count = map_parameters.spaced_seed_params.seed_count;
           float similarity = map_parameters.spaced_seed_params.similarity;
           uint32_t region_length = map_parameters.spaced_seed_params.region_length;
 
           ales::spaced_seeds sps = ales::generate_spaced_seeds(seed_weight, seed_count, similarity, region_length);
           std::chrono::duration<double> time_spaced_seeds = skch::Time::now() - t0;
-          std::cerr << "[wfmash::map] Time spent generating spaced seeds " << time_spaced_seeds.count()  << " seconds" << std::endl;
           map_parameters.spaced_seed_sensitivity = sps.sensitivity;
           map_parameters.spaced_seeds =  sps.seeds;
           ales::printSpacedSeeds(map_parameters.spaced_seeds);
-          std::cerr << "[wfmash::map] Spaced seed sensitivity " << sps.sensitivity << std::endl;
+          std::cerr << "[wfmash::mashmap] Generated spaced seeds in " << time_spaced_seeds.count() << "s (sensitivity: " << sps.sensitivity << ")" << std::endl;
         }
 
         //Map the sequences in query file
@@ -73,8 +72,7 @@ int main(int argc, char** argv) {
         skch::Map mapper = skch::Map(map_parameters);
 
         std::chrono::duration<double> timeMapQuery = skch::Time::now() - t0;
-        std::cerr << "[wfmash::map] time spent mapping the query: " << timeMapQuery.count() << " sec" << std::endl;
-        std::cerr << "[wfmash::map] mapping results saved in: " << map_parameters.outFileName << std::endl;
+        std::cerr << "[wfmash::mashmap] Mapped query in " << timeMapQuery.count() << "s, results saved to: " << map_parameters.outFileName << std::endl;
 
         if (yeet_parameters.approx_mapping) {
             return 0;
-Original file line number
+Diff line change
@@ Expand Up / @@ -453,7 +453,7 @@ void write_sam_header(std::ofstream& outstream) { @@
                     });
             }
         }
-        outstream << "@PG\tID:wfmash\tPN:wfmash\tVN:0.1\tCL:wfmash\n";
+        outstream << "@PG\tID:wfmash\tPN:wfmash\tVN:" << WFMASH_GIT_VERSION << "\tCL:wfmash\n";
     }
     void writer_thread(const std::string& output_file,
@@ Expand Down @@