Skip to content

Commit

Permalink
add sequencing functions (#49)
Browse files Browse the repository at this point in the history
* Add sequencing

* add checks for minimap2

* Add external functions needed for sequencing example

* add megamash (#50)

* changed how generics work

* add some docs to megamash

* Added changelog
  • Loading branch information
Koeng101 authored Jan 19, 2024
1 parent 919d685 commit 3aab61c
Show file tree
Hide file tree
Showing 27 changed files with 923 additions and 154 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Added the megamash algorithm [#50](https://github.com/Koeng101/dnadesign/pull/50)
- Changed parsers to return values instead of pointers. Added some sequencing utils [#49](https://github.com/Koeng101/dnadesign/pull/49)
- Added minimap2 and samtools(pileup) integrations in external [#46](https://github.com/Koeng101/dnadesign/pull/46)
- Added sam parser [#5](https://github.com/Koeng101/dnadesign/pull/5)
- Added the LinearFold folding algorithms [#38](https://github.com/Koeng101/dnadesign/pull/38)
Expand Down
57 changes: 56 additions & 1 deletion external/minimap2/minimap2.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@ For more information on minimap2, please visit Heng Li's git: https://github.com
package minimap2

import (
"context"
"io"
"os"
"os/exec"

"github.com/koeng101/dnadesign/lib/bio"
"github.com/koeng101/dnadesign/lib/bio/fastq"
"github.com/koeng101/dnadesign/lib/bio/sam"
"golang.org/x/sync/errgroup"
)

// Minimap2 aligns sequences using minimap2 over the command line. Right
Expand Down Expand Up @@ -65,7 +71,7 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e
tmpFile.Close() // Close the file as it's no longer needed

// Start minimap2 pointing to the temporary file and stdin for sequencing data
cmd := exec.Command("minimap2", "-ax", "map-ont", tmpFile.Name(), "-")
cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
cmd.Stdout = w
cmd.Stdin = fastqInput
if err := cmd.Start(); err != nil {
Expand All @@ -74,3 +80,52 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e

return cmd.Wait()
}

// Minimap2Channeled uses channels rather than io.Reader and io.Writers.
func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan fastq.Read, samChan chan<- sam.Alignment) error {
ctx := context.Background()
g, ctx := errgroup.WithContext(ctx)

// Create a pipe for writing fastq reads and reading them as an io.Reader
fastqPr, fastqPw := io.Pipe()

// Goroutine to consume fastq reads and write them to the PipeWriter
g.Go(func() error {
defer fastqPw.Close()
for read := range fastqChan {
_, err := read.WriteTo(fastqPw)
if err != nil {
return err // return error to be handled by errgroup
}
}
return nil
})

// Create a pipe for SAM alignments.
samPr, samPw := io.Pipe()

// Use Minimap2 function to process the reads and write SAM alignments.
g.Go(func() error {
defer samPw.Close()
return Minimap2(fastaTemplates, fastqPr, samPw) // Minimap2 writes to samPw
})

// Create a SAM parser from samPr (the PipeReader connected to Minimap2 output).
samParser, err := bio.NewSamParser(samPr)
if err != nil {
return err
}

// Parsing SAM and sending to channel.
g.Go(func() error {
return samParser.ParseToChannel(ctx, samChan, false)
})

// Wait for all goroutines in the group to finish.
if err := g.Wait(); err != nil {
return err // This will return the first non-nil error from the group of goroutines
}

// At this point, all goroutines have finished successfully
return nil
}
170 changes: 170 additions & 0 deletions lib/align/megamash/megamash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
/*
Package megamash is an implementation of the megamash algorithm.
Megamash is an algorithm developed by Keoni Gandall to find templates from
sequencing reactions. For example, you may have a pool of amplicons, and need
to get a count of how many times each amplicon shows up in a given sequencing
reaction.
*/
package megamash

import (
"context"
"fmt"

"github.com/koeng101/dnadesign/lib/bio/fasta"
"github.com/koeng101/dnadesign/lib/bio/fastq"
"github.com/koeng101/dnadesign/lib/transform"
)

// StandardizedDNA returns the alphabetically lesser strand of a double
// stranded DNA molecule.
func StandardizedDNA(sequence string) string {
var deterministicSequence string
reverseComplement := transform.ReverseComplement(sequence)
if sequence > reverseComplement {
deterministicSequence = reverseComplement
} else {
deterministicSequence = sequence
}
return deterministicSequence
}

var (
DefaultKmerSize uint = 16
DefaultMinimalKmerCount uint = 10
DefaultScoreThreshold float64 = 0.2
)

type MegamashMap struct {
Identifiers []string
Kmers []map[string]bool
KmerSize uint
KmerMinimalCount uint
Threshold float64
}

// NewMegamashMap creates a megamash map that can be searched against.
func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) {
var megamashMap MegamashMap
megamashMap.KmerSize = kmerSize
megamashMap.KmerMinimalCount = kmerMinimalCount
megamashMap.Threshold = threshold

for _, fastaRecord := range sequences {
megamashMap.Identifiers = append(megamashMap.Identifiers, fastaRecord.Identifier)
sequence := fastaRecord.Sequence

// First get all kmers with a given sequence
kmerMap := make(map[string]bool)
for i := 0; i <= len(sequence)-int(kmerSize); i++ {
kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)])
kmerMap[kmerString] = true
}

// Then, get unique kmers for this sequence and only this sequence
uniqueKmerMap := make(map[string]bool)
for kmerBase64 := range kmerMap {
unique := true
for _, otherMegaMashMap := range megamashMap.Kmers {
_, ok := otherMegaMashMap[kmerBase64]
// If this kmer is found, set both to false
if ok {
otherMegaMashMap[kmerBase64] = false
unique = false
break
}
}
if unique {
uniqueKmerMap[kmerBase64] = true
}
}
// Check if we have the minimal kmerCount
var kmerCount uint = 0
for _, unique := range uniqueKmerMap {
if unique {
kmerCount++
}
}
if kmerCount < kmerMinimalCount {
return megamashMap, fmt.Errorf("Got only %d unique kmers of required %d for sequence %s", kmerCount, kmerMinimalCount, fastaRecord.Identifier)
}

// Now we have a unique Kmer map for the given sequence.
// Add it to megamashMap
megamashMap.Kmers = append(megamashMap.Kmers, uniqueKmerMap)
}
return megamashMap, nil
}

// Match contains the identifier and score of a potential match to the searched
// sequence.
type Match struct {
Identifier string
Score float64
}

// Match matches a sequence to all the sequences in a megamash map.
func (m *MegamashMap) Match(sequence string) []Match {
var scores []float64
// The algorithm is as follows:
// - Go through each map.
// - Get the number of matching kmers
// - Divide that by the total kmers available for matching

// First, get the kmer total
var kmerSize int
out:
for _, maps := range m.Kmers {
for kmer := range maps {
kmerSize = len(kmer)
break out
}
}

// Now, iterate through each map
for _, sequenceMap := range m.Kmers {
var score float64
var totalKmers = len(sequenceMap)
var matchedKmers int
for i := 0; i <= len(sequence)-kmerSize; i++ {
kmerString := StandardizedDNA(sequence[i : i+kmerSize])
unique, ok := sequenceMap[kmerString]
if ok && unique {
matchedKmers++
}
}
if totalKmers == 0 {
score = 0
} else {
score = float64(matchedKmers) / float64(totalKmers)
}
scores = append(scores, score)
}

var matches []Match
for i, score := range scores {
if score > m.Threshold {
matches = append(matches, Match{Identifier: m.Identifiers[i], Score: score})
}
}
return matches
}

// FastqMatchChannel processes a channel of fastq.Read and pushes to a channel of matches.
func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error {
for {
select {
case <-ctx.Done():
// Clean up resources, handle cancellation.
return ctx.Err()
case sequence, ok := <-sequences:
if !ok {
close(matches)
return nil
}
sequenceMatches := m.Match(sequence.Sequence)
matches <- sequenceMatches
}
}
}
39 changes: 39 additions & 0 deletions lib/align/megamash/megamash_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package megamash

import (
"testing"

"github.com/koeng101/dnadesign/lib/bio/fasta"
)

func TestMegamash(t *testing.T) {
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"

samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
m, err := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
if err != nil {
t.Errorf("Failed to make NewMegamashMap: %s", err)
}
for _, sample := range samples {
scores := m.Match(sample)
if scores[0].Identifier != "oligo2" {
t.Errorf("Should have gotten oligo2. Got: %s", scores[0].Identifier)
}
}
}

func BenchmarkMegamash(b *testing.B) {
for i := 0; i < b.N; i++ {
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"

samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
m, _ := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
for _, sample := range samples {
_ = m.Match(sample)
}
}
}
Loading

0 comments on commit 3aab61c

Please sign in to comment.