From 88b9bb0fff33109b12bf1a6a5da5925e141cdd4f Mon Sep 17 00:00:00 2001 From: Christina-hshi Date: Thu, 14 Nov 2019 08:28:39 -0500 Subject: [PATCH] add cqf-denoise for assembly --- base/Hash.h | 66 +- base/Params.h | 53 +- base/Utility.h | 57 ++ base/global.h | 16 +- cqf/CQF_mt.h | 10 +- cqf/cqf_deNoise.h | 2 +- cqf/gqf.c | 18 +- src/CMakeLists.txt | 8 +- src/CQF-deNoise.cpp | 209 ++++++ src/compare_contigs.cpp | 67 ++ src/contig_assembly.cpp | 1405 +++++++++++++++++++++++++++++++++++---- 11 files changed, 1736 insertions(+), 175 deletions(-) create mode 100644 src/CQF-deNoise.cpp create mode 100644 src/compare_contigs.cpp diff --git a/base/Hash.h b/base/Hash.h index a5b2a7a..90e1883 100644 --- a/base/Hash.h +++ b/base/Hash.h @@ -6,9 +6,71 @@ #include "global.h" //#include "Utility.h" -uint32_t MurmurHash2 ( const void * key, int len, u_int seed=0); +/* MurmurHash2, by Austin Appleby +// Note - This code makes a few assumptions about how your machine behaves - +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. */ -uint32_t MurmurHash2 ( string seq, u_int seed=0); +uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed ) +{ + /* 'm' and 'r' are mixing constants generated offline. + They're not really 'magic', they just happen to work well. */ + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + /* Initialize the hash to a 'random' value */ + + uint32_t h = seed ^ len; + + /* Mix 4 bytes at a time into the hash */ + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t*)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + /* Handle the last few bytes of the input array */ + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + /* Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. */ + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +uint32_t MurmurHash2 ( string seq, u_int seed=0){ + return MurmurHash2(seq.c_str(), seq.length(), seed); +} /* uint32_t MurmurHash2_kmer( const string seq, int k, u_int seed); diff --git a/base/Params.h b/base/Params.h index ee037e4..a9e9043 100644 --- a/base/Params.h +++ b/base/Params.h @@ -1,27 +1,44 @@ #pragma once +#include "base/Utility.h" + struct Params{ - size_t K; //k-mer size - size_t readLen_mean; - size_t readLen_max; + //general parameters size_t thread_num; + + //k-mer related parameters + size_t K; //k-mer size + int kmer_abundance_min; //k-mer with lower abundance won't be used + int solid_kmer_abundance_min; + int solid_kmer_abundance_max; + + //input and output + string readFileList; + FILE_TYPE ftype; + FILE_MODE fmode; + string cqfFile; + string output; + + // //read related + // size_t readLen_mean; + // size_t readLen_max; + + // //contig related parameters + // int CONTIG_min_cov;//minimum coverage for a K-mer to use + // int CONTIG_min_SR;//minimum number of supportive reads for two unitigs near hub to be paired. and also for reliable connection between two unitigs. + // int CONTIG_min_len;//minimum length of contigs to output. + // int CONTIG_tip_len;//branches with len no longer than contig_tip_len are removed while there is alternative branch with length larger than contig_tip_len, which is regarded as true path. + + // //For solve repeat + // int REPEAT_p;//reliable distance with at least p(ratio) supportive connections - //contig related parameters - int CONTIG_min_cov;//minimum coverage for a K-mer to use - int CONTIG_min_SR;//minimum number of supportive reads for two unitigs near hub to be paired. and also for reliable connection between two unitigs. - int CONTIG_min_len;//minimum length of contigs to output. - int CONTIG_tip_len;//branches with len no longer than contig_tip_len are removed while there is alternative branch with length larger than contig_tip_len, which is regarded as true path. - - //For solve repeat - int REPEAT_p;//reliable distance with at least p(ratio) supportive connections - - //scaffold related parameters - int SCAF_min_MAPQ;//minimum mapping quality for an alignment to use - //int SCAF_min_MAPLEN;//minimum aligned length of an qualified alignment + // //scaffold related parameters + // int SCAF_min_MAPQ;//minimum mapping quality for an alignment to use + // //int SCAF_min_MAPLEN;//minimum aligned length of an qualified alignment - //For output - bool OUT_repeat; - int OUT_CONTIG_min_len; + // //For output + // bool OUT_repeat; + // int OUT_CONTIG_min_len; bool OUT_haploid;//whether output in haploid or diploid mode //For recording statistics diff --git a/base/Utility.h b/base/Utility.h index c2e3bf7..41268d5 100644 --- a/base/Utility.h +++ b/base/Utility.h @@ -14,6 +14,23 @@ typedef vector vec_double; typedef vector> matrix_int; typedef vector vec_int; +/*Compute median from a vector of int +*/ +double median(vector& nums){ + if(nums.size()==0){ + return 0; + }else if(nums.size()==1){ + return nums[0]; + } + sort(nums.begin(), nums.end()); + int tmp = nums.size()/2; + if(nums.size()%2==0){ + return (nums[tmp-1]+nums[tmp])/2.0; + }else{ + return nums[tmp]; + } +} + /** Get reverse complement of DNA sequence */ inline string RC_DNA(const string seq){ @@ -915,3 +932,43 @@ inline void contig_summary(vector contigs, int ref_len=4800000){ cout<<"Contig N50 "< contigs, int ref_len=4800000){ + size_t contig_num = 0; + contig_num += contigs.size(); + + int total_len = 0; + int NG50, N50; NG50 = N50 = 0; + vector contig_lens(contig_num); + + int tmp_idx = 0; + for(auto contig:contigs){ + contig_lens[tmp_idx] = contig.seq.length(); + total_len += contig_lens[tmp_idx]; + tmp_idx++; + } + + std::sort(contig_lens.rbegin(), contig_lens.rend()); + int sum_tmp = 0; + for(auto len:contig_lens){ + sum_tmp += len; + if(sum_tmp>=total_len/2){ + N50 = len; + break; + } + } + + sum_tmp = 0; + for(auto len:contig_lens){ + sum_tmp += len; + if(sum_tmp>=ref_len/2){ + NG50 = len; + break; + } + } + cout< #include #include -#include +//#include #include #include #include @@ -27,6 +27,7 @@ #include #include #include +#include //#include //#include //#include @@ -80,9 +81,20 @@ typedef boost::tokenizer> tokenizer; enum SEQ_LIB_TYPE{MP, PE, SE}; -//char DNA_bases[4] = {'A', 'C', 'G', 'T'}; +const char DNA_bases[4] = {'A', 'C', 'G','T'}; const std::string currentDateTime(); + +struct Contig +{ + string seq; + double median_abundance;//median abundances of k-mers in + Contig(string s="", double a=0){ + seq = s; + median_abundance = a; + } +}; + #if 0 std::ostream& operator<<( std::ostream& dest, __int128_t value ){ std::ostream::sentry s( dest ); diff --git a/cqf/CQF_mt.h b/cqf/CQF_mt.h index d71fef1..dacebcc 100644 --- a/cqf/CQF_mt.h +++ b/cqf/CQF_mt.h @@ -348,8 +348,8 @@ struct seqFile_batch{ while(this->num_files){ if (this->ip_files.pop(fp)) { if (fastq_read_parts(fp->fmode, fp)) { + data.set(fp->part, fp->size); //assign the pointer, instead of copy the data this->ip_files.push(fp); - data.set(fp->part, fp->size); return true; }else{ /* close the file */ @@ -444,7 +444,11 @@ class CQF_mt{ uint64_t count(uint64_t key){ return qf_count_key_value(qf, key, default_value); } - + + void reset_iterator(){ + qf_iterator(qf, &qfi, 0); + } + bool set_iterator_2pos(uint64_t pos){ return qf_iterator(qf, &qfi, pos); } @@ -480,7 +484,7 @@ class CQF_mt{ bool next_untraveled(){ return qfi_next_untraveled(&qfi)?false:true; } - //return whether it is traveled before setting + //return whether it is traveled(true) or untraveled(false) before setting bool count_key_value_set_traveled(uint64_t key, uint64_t& count){ return qf_count_key_value_set_traveled(qf, key, default_value, &count); } diff --git a/cqf/cqf_deNoise.h b/cqf/cqf_deNoise.h index 8d2f749..47de8c7 100644 --- a/cqf/cqf_deNoise.h +++ b/cqf/cqf_deNoise.h @@ -829,7 +829,7 @@ static bool fastq_to_uint64kmers_prod(flush_object* obj, seqFile_batch* seqFiles if (seqFiles->ip_files.pop(fp)) { if (fastq_read_parts(fp->fmode, fp)) { seqFiles->ip_files.push(fp); - chunk c(fp->part, fp->size); + chunk c(fp->part, fp->size);//assign pointer "part" to chunk, instead of copy the data. reads_to_kmers(c, obj); if (obj->count) { dump_local_qf_to_main(obj); diff --git a/cqf/gqf.c b/cqf/gqf.c index 784232b..95c30b6 100644 --- a/cqf/gqf.c +++ b/cqf/gqf.c @@ -1432,7 +1432,8 @@ static inline bool insert1(QF *qf, __uint128_t hash, bool lock, bool spin) shift_runends(qf, insert_index, empty_slot_index1-1, 2); //METADATA_WORD(qf, runends, (insert_index+1)) &= ~(1ULL << (((insert_index+1)%SLOTS_PER_BLOCK)%64)); } - if(abs(runend_index-insert_index) < 2){ + if((runend_index>insert_index?(runend_index-insert_index):(insert_index - runend_index)) < 2){ + //if(abs(runend_index-insert_index) < 2){ METADATA_WORD(qf, runends, runend_index) &= ~(1ULL << ((runend_index%SLOTS_PER_BLOCK)%64)); } @@ -1732,7 +1733,8 @@ static inline bool insert1_advance(QF *qf, __uint128_t hash, bool lock, bool spi shift_runends(qf, insert_index, empty_slot_index1-1, 2); //METADATA_WORD(qf, runends, (insert_index+1)) &= ~(1ULL << (((insert_index+1)%SLOTS_PER_BLOCK)%64)); } - if(abs(runend_index-insert_index) < 2){ + //if(abs(runend_index-insert_index) < 2){ + if((runend_index>insert_index?(runend_index-insert_index):(insert_index - runend_index)) < 2){ METADATA_WORD(qf, runends, runend_index) &= ~(1ULL << ((runend_index%SLOTS_PER_BLOCK)%64)); } @@ -3068,15 +3070,15 @@ int qfi_next_untraveled(QFi * qfi){ return isEnd; } -//return whether is traveled -//and set it to be traveled +//return whether is traveled (true) or not traveled (false) +//and then set it to be traveled bool qf_count_key_value_set_traveled(const QF *qf, uint64_t key, uint64_t value, uint64_t* count){ __uint128_t hash = key; uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; if (!is_occupied(qf, hash_bucket_index)){ - count = 0; + *count = 0; return false; } @@ -3104,7 +3106,7 @@ bool qf_count_key_value_set_traveled(const QF *qf, uint64_t key, uint64_t value, runstart_index = current_end + 1; } while (!is_runend(qf, current_end)); - count = 0; + *count = 0; return false; } @@ -3116,7 +3118,7 @@ bool qf_count_key_value_is_traveled(const QF *qf, uint64_t key, uint64_t value, int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; if (!is_occupied(qf, hash_bucket_index)){ - count = 0; + *count = 0; return false; } @@ -3139,7 +3141,7 @@ bool qf_count_key_value_is_traveled(const QF *qf, uint64_t key, uint64_t value, runstart_index = current_end + 1; } while (!is_runend(qf, current_end)); - count = 0; + *count = 0; return false; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7d61263..c830da3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,13 @@ include_directories(${PROJECT_BINARY_DIR}) set(SH-lib base-lib) add_executable(Contiger contig_assembly.cpp) -target_link_libraries(Contiger cqf-core-lib base-lib z bz2 boost_system-mt boost_thread-mt boost_iostreams boost_program_options-mt) +target_link_libraries(Contiger cqf-core-lib base-lib z bz2 boost_system boost_thread boost_iostreams boost_program_options pthread rt boost_timer boost_chrono tbb) + +add_executable(CQF-deNoise CQF-deNoise.cpp) +target_link_libraries(CQF-deNoise cqf-core-lib base-lib z bz2 boost_system boost_thread boost_iostreams boost_program_options pthread rt boost_timer boost_chrono) + +add_executable(compare_contigs compare_contigs.cpp) +target_link_libraries(compare_contigs cqf-core-lib base-lib boost_program_options) # #gkm-kernel # add_executable(gkm-kernel main_calKernel.cpp) diff --git a/src/CQF-deNoise.cpp b/src/CQF-deNoise.cpp new file mode 100644 index 0000000..97f22f3 --- /dev/null +++ b/src/CQF-deNoise.cpp @@ -0,0 +1,209 @@ +#define GRAPH_TRAVERSE + +#include "cqf/CQF_mt.h" +#include "cqf/true2falseKmer_DP.h" + +boost::program_options::variables_map get_opts(int argc, char* argv[]){ + namespace po = boost::program_options; + po::options_description desc(string(argv[0])+" \nOptions:"); + desc.add_options() + ("help,h", "print help messages") + (",k", po::value()->required(), "kmer length") + ("trueKmer,n", po::value()->required(), "number of true(or non singleton) k-mers") + (",N", po::value()->required(), "total number of k-mers to process") + ("alpha,e", po::value()->default_value(-1), "provide estimated base error rate of data for allocation of memory.") + ("errorProfile", po::value()->default_value(""), "error profile, each line with error rate for corresponding base.") + ("fr", po::value()->default_value(0), "overall probability of removing non-singleton true k-mers, default: 1/#trueKmer") + ("deNoise", po::value()->default_value(-1), "number of times the deNoise is called, when specified, 'fr' is ignored.") + ("endDeNoise", po::bool_switch()->default_value(false), "call deNoise after processing all the k-mers(not counted into the #deNoise)") + (",t", po::value()->default_value(16), "number of threads") + ("format,f", po::value()->required(), "format of the input: g(gzip); b(bzip2); f(plain fastq)") + ("input,i", po::value()->required(), "a file containing list of input file name(s), should be in the same directory as the fastq file(s)") + ("output,o", po::value(), "output file name"); + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + + if (argc==1 || vm.count("help") || (!vm.count("alpha") && !vm.count("errorProfile"))) { + cerr << desc << "\n"; + exit(0); + } + + po::notify(vm); + + return vm; +} + +int main(int argc, char* argv[]){ + auto opts = get_opts(argc, argv); + + time_t start_time; + vector files; + + string flist = opts["input"].as(); + string file_prefix=""; + auto pos = flist.find_last_of("/\\"); + if(pos != string::npos){ + file_prefix = flist.substr(0, pos+1); + } + + fstream fin; + fin.open(flist, ios::in); + string line; + if(fin.is_open()){ + while(!fin.eof()){ + getline(fin, line); + if(line.empty()){ + continue; + } + files.push_back(file_prefix+line); + } + fin.close(); + }else{ + cerr<<"Failed to open file: "<(); + int K = opts["-k"].as(); + double alpha = opts["alpha"].as(); + string errorProfile = opts["errorProfile"].as(); + uint64_t n_true_kmers = opts["trueKmer"].as(); + uint64_t total_kmers = opts["-N"].as(); + int num_deNoise = opts["deNoise"].as(); + double fr = opts["fr"].as(); + bool end_deNoise = opts["endDeNoise"].as(); + uint64_t num_slots; + uint64_t n_distinct_elts_for_DeNoise; + + uint64_t num_true_kmers; + uint64_t num_false_kmers; + if(alpha==-1){ + double tmp = true2falseKmer_DP(errorProfile, K); + num_true_kmers = total_kmers * tmp/(1+tmp); + num_false_kmers = total_kmers - num_true_kmers; + }else{ + num_true_kmers = total_kmers * pow(1-alpha, K); + num_false_kmers = total_kmers - num_true_kmers; + } + + if(num_deNoise < 0 ){ + if(!fr){ + fr = 1.0/n_true_kmers; + } + num_deNoise = mean_CDF2deNoise(num_true_kmers/n_true_kmers, fr); + } + + int ave_num_slot_for_encode=0; + uint64_t tmp=num_true_kmers/n_true_kmers+1; + while(tmp){ + tmp >>= 7; + ave_num_slot_for_encode++; + } + num_slots = n_true_kmers * (ave_num_slot_for_encode + (double)3/2) + num_false_kmers * 10 /((num_deNoise+1) * 9); + + uint64_t qb; + uint64_t hb; + qb=1; + uint64_t base=2; + while(base < num_slots){ + qb++; + base <<= 1; + } + uint64_t num_deNoise_upper_bound, num_deNoise_lower_bound, num_slots_tmp; + num_deNoise_upper_bound = num_deNoise_lower_bound = num_deNoise; + + num_slots_tmp = num_slots; + while(num_deNoise && num_slots_tmp < (1ULL<= (1ULL< (1ULL<<(qb-1))){ + num_deNoise_upper_bound = 0; + }else{ + num_slots_tmp = num_slots; + //cerr<<"finding upper boud..."<= (1ULL<<(qb-1))){ + //cerr<(); + } + + FILE_MODE ftype; + char tmp_str = opts["format"].as(); + if(tmp_str == 'g'){ + ftype = FILE_MODE::GZIP; + }else if(tmp_str == 'b'){ + ftype = FILE_MODE::BZIP2; + }else if(tmp_str == 'f'){ + ftype = FILE_MODE::TEXT; + }else{ + cerr<<"Unrecognized file type "<\nOptions:"); + desc.add_options() + ("help,h", "print help messages") + (",k", po::value()->required(), "kmer length, kmers are used as seed to index sequences.") + ("input1,1", po::value()->required(), "first DNA sequence file in fasta format") + ("input2,2", po::value()->required(), "second DNA sequence file in fasta format") + ("output,o", po::value()->default_value("unitigs.fa"), "output shared DNA sequences"); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + + if (argc==1 || vm.count("help")) { + cerr << desc << "\n"; + exit(0); + } + + po::notify(vm); + return vm; +} + +int main(int argc, char* argv[]){ + namespace po = boost::program_options; + po::variables_map options = get_opts(argc, argv); + + int k = options["-k"].as(); + string input1 = options["input1"].as(); + string input2 = options["input2"].as(); + string output = options["output"].as(); + + ifstream fin1, fin2; + ofstream fout; + fin1.open(input1, ios::in); + fin2.open(input2, ios::in); + fout.open(output, ios::out); + + //load all sequences + std::vector seqs1, seqs2; + string line, seq; + getline(fin1, line); + while(getline(fin1, line)){ + seq = line; + while(getline(fin1, line)){ + if(line[0]=='>') break; + seq += line; + } + seqs1.push_back(seq); + } + getline(fin2, line); + while(getline(fin2, line)){ + seq = line; + while(getline(fin2, line)){ + if(line[0]=='>') break; + seq += line; + } + seqs2.push_back(seq); + } + + fin1.close(); + fin2.close(); + fout.close(); + + return 0; +} \ No newline at end of file diff --git a/src/contig_assembly.cpp b/src/contig_assembly.cpp index e1b9845..ac32245 100644 --- a/src/contig_assembly.cpp +++ b/src/contig_assembly.cpp @@ -5,16 +5,42 @@ #include "base/unordered_map_mt.h" #include "base/vector_mt.h" #include "base/Params.h" +#include "base/Hash.h" //#include "base/nthash.h" #include #include #include +#include +#include +#include + +//using namespace tbb; +using tbb::concurrent_vector; +using tbb::concurrent_queue; + +struct str_hasher +{ + size_t operator()(const string& val)const + { + return MurmurHash2(val); + } +}; + +struct str_equalto +{ + bool operator()(const string& one, const string& two) const + { + return (one == two); + } +}; + +typedef tbb::concurrent_unordered_set unordered_set_mt; //using namespace boost::program_options; //namespace po = boost::program_options; //using boost::program_options::variables_map; -boost::program_options::variables_map get_opts(int argc, char* argv[]){ +Params get_opts(int argc, char* argv[]){ namespace po = boost::program_options; po::options_description desc(string(argv[0])+" \nOptions:"); desc.add_options() @@ -22,10 +48,12 @@ boost::program_options::variables_map get_opts(int argc, char* argv[]){ (",k", po::value()->required(), "kmer length") ("input,i", po::value()->required(), "a file containing list of input file name(s), should be absolute address or file names when in the running directory.") ("format,f", po::value()->default_value('f'), "format of the input: g(gzip); b(bzip2); f(plain fastq)") - ("cqf,f", po::value()->required(), "the counting quotient filter built with the same 'k'") - ("abundance_min, s", po::value()->default_value(2), "minimum coverage of a solid k-mer to start the assembly of a contig") + ("cqf,c", po::value()->required(), "the counting quotient filter built with the same 'k'") + ("abundance_min,s", po::value()->default_value(2), "minimum coverage of k-mers used to extend the assembly") + ("solid_abundance_min,x", po::value()->default_value(2), "minimum coverage of a solid k-mer to start the assembly") + ("solid_abundance_max,X", po::value()->default_value(100), "maximum coverage of a solid k-mer to start the assembly") (",t", po::value()->default_value(16), "number of threads") - ("output,o", po::value(), "output contig file name (fasta)"); + ("output,o", po::value()->default_value("unitigs.fa"), "output contig file name (fasta)"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -37,34 +65,63 @@ boost::program_options::variables_map get_opts(int argc, char* argv[]){ po::notify(vm); - return vm; + Params options; + options.K = vm["-k"].as(); + options.readFileList = vm["input"].as(); + options.cqfFile = vm["cqf"].as(); + options.kmer_abundance_min = vm["abundance_min"].as(); + options.solid_kmer_abundance_min = vm["solid_abundance_min"].as(); + options.solid_kmer_abundance_max = vm["solid_abundance_max"].as(); + options.thread_num = vm["-t"].as(); + options.output = vm["output"].as(); + switch(vm["format"].as()){ + case 'g': + options.fmode = FILE_MODE::GZIP; + break; + case 'b': + options.fmode = FILE_MODE::BZIP2; + break; + case 'f': + options.fmode = FILE_MODE::TEXT; + break; + default: + std::cerr<<"[Error] unrecognized file format "<()<& seqFiles, const Params& params, vector_mt& contigs); -void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const boost::program_options::variables_map& options, vector_mt& unitigs); +//void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& options, vector_mt& unitigs); +void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& options, concurrent_vector& unitigs); //void find_unitigs_mt_worker(CQF_mt& cqf, const Params& params, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue); -void find_unitigs_mt_worker(CQF_mt& cqf, const boost::program_options::variables_map& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue); +//void find_unitigs_mt_worker(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue); + +void find_unitigs_mt_worker(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& options, concurrent_vector& contigs, unordered_set_mt& startKmer2unitig, WorkQueue* work_queue); //void get_unitig_forward(CQF_mt& cqf, const Params& params, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id); -void get_unitig_forward(CQF_mt& cqf, const boost::program_options::variables_map& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id); +//void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, const int& contig_id); + +//void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_set_mt& startKmer2unitig, WorkQueue* work_queue, const int& contig_id); +void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vector& contigs, unordered_set_mt& startKmer2unitig, WorkQueue* work_queue, concurrent_vector::iterator& contigIter); + +//void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, Contig& contig); + +//void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_set_mt& startKmer2unitig, WorkQueue* work_queue, Contig& contig); //void get_unitig_backward(CQF_mt& cqf, const Params& params, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id); -void get_unitig_backward(CQF_mt& cqf, const boost::program_options::variables_map& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id); +//void get_unitig_backward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id); int main(int argc, char* argv[]){ - //string fileList = "/research/kevinyip11/hshi/NCBI/F.vesca/files.list1"; - //string fileList = "/research/kevinyip11/hshi/Tools/art_src_MountRainier_Linux/44_data/chr6_cox_hap2/MSv1_C50_L250/files.list"; - //string cqfFile = "/research/kevinyip11/hshi/Tools/squeakr/F.vesca.k55.s33.nthash.ser"; - //string prefix = "/research/kevinyip11/hshi/Tools/art_src_MountRainier_Linux/44_data/chr6_cox_hap2/MSv1_C50_L250/single_end"; - //string dir="/research/kevinyip11/hshi/Tools/art_src_MountRainier_Linux/44_data/chr6_cox_hap2/MSv1_C50_L250/"; - //tring dir="/research/kevinyip11/hshi/NCBI/F.vesca/"; - boost::program_options::variables_map options = get_opts(argc, argv); - + Params options = get_opts(argc, argv); + vector seqFileNames; ifstream fin; - fin.open(options["input"].as(), ios::in); + fin.open(options.readFileList, ios::in); string line; while(getline(fin, line)){ if(line.empty()) @@ -72,40 +129,178 @@ int main(int argc, char* argv[]){ seqFileNames.push_back(line); } FILE_TYPE ftype=FILE_TYPE::FASTQ; - FILE_MODE fmode; - switch(options["format"].as()){ - case 'g': - fmode = FILE_MODE::GZIP; - break; - case 'b': - fmode = FILE_MODE::BZIP2; - break; - case 'f': - fmode = FILE_MODE::TEXT; - break; - default: - break; - } - seqFile_batch seqFiles(seqFileNames, ftype, fmode); + seqFile_batch seqFiles(seqFileNames, ftype, options.fmode); DisplayCurrentDateTime(); cout<<"[CQF] loading cqf from disk"<()); + cqf_mt.load(options.cqfFile); cout<<"[CQF] cqf loaded!"<metadata->nelts< contigs; + concurrent_vector contigs; contigs.resize(1); find_unitigs_mt_master(cqf_mt, seqFiles, options, contigs); + //bulid the graph + uint64_t totalUnitigs_len, duplicateUnitigs_len; totalUnitigs_len = duplicateUnitigs_len=0; + int duplicateUnitig_num=0; + auto contig_num = contigs.size(); + unordered_map startKmer2unitig(contig_num*2); + vector isDuplicate(contig_num, false); + vector id2id_afterRemoveDuplicate(contig_num, -1); + + string first_kmer, last_kmer_RC; + int noduplicate_contig_num=0; + for(int contig_id = 1; contig_idsecond); + // while(id2id_afterRemoveDuplicate[tmp] != abs(it->second)){ + // tmp++; + // } + + // cout<<"[Warning] contig "<<(it->second>0?tmp:-tmp)<<" and "<second>0?tmp:-tmp)<<": "; + // if(it->second < 0){ + // cout<second); + // while(id2id_afterRemoveDuplicate[id] < abs(it->second)){ + // id++; + // } + // constructed_len += (contigs[id].seq.length() - options.K +1); + // cout<<"[Test] find kmer: "<second<<" of length "<second<<": "; + // if(it->second < 0) + // cout<= refSeq.length()){ + // cout<<"[Test] reference seq is encoded in the DBG of unitigs"<metadata->range); + // cout<<"[Test] count: "<"<<(id2id_afterRemoveDuplicate[contig_id]-1)<<" LN:i:"<second; + if(tmp>0){ + fout<<" L:+:"<second; + if(tmp>0){ + fout<<" L:-:"< next_work; volatile atomic total_work; @@ -114,19 +309,22 @@ struct WorkQueue{ boost::mutex mut; WorkQueue(){ - next_work = 0; + next_work = 1; total_work = 0; + work_done = 0; master_done = false; } WorkQueue(uint32_t n, uint32_t t){ + assert(n>0 && t>0); next_work = n; total_work = t; + work_done = n-1; master_done = false; } bool get_next_work(uint32_t& work_id){ boost::unique_lock lock(mut); - if(next_work < total_work){ + if(next_work <= total_work){ work_id = next_work; next_work++; lock.unlock(); @@ -147,9 +345,48 @@ struct WorkQueue{ boost::unique_lock lock(mut); next_work += work_num; total_work += work_num; + work_done += work_num; lock.unlock(); } }; +*/ + +struct WorkQueue{ + concurrent_queue::iterator> jobQueue; + volatile atomic total_work; + volatile atomic work_done; + volatile atomic master_done; + //boost::mutex mut; + + WorkQueue(){ + total_work = 0; + work_done = 0; + master_done = false; + } + //WorkQueue(uint32_t t){ + // assert(t>0); + //next_work = n; + // total_work = t; + // work_done = ; + // master_done = false; + // } + + bool get_next_work(concurrent_vector::iterator& jobIter){ + if(jobQueue.try_pop(jobIter)){ + return true; + }else{ + return false; + } + } + void report_work_done(int num=1){ + work_done+=num; + } + + void add_work(concurrent_vector::iterator jobIter){ + jobQueue.push(jobIter); + total_work ++; + } +}; /* void find_unitigs_mt_master(CQF_mt& cqf, const vector& seqFiles, const Params& params, vector_mt& contigs){ @@ -233,18 +470,13 @@ void find_unitigs_mt_master(CQF_mt& cqf, const vector& seqFiles, const P prod_threads.join_all(); } */ - -void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const boost::program_options::variables_map& options, vector_mt& contigs){ - int K = options["-k"].as(); - int thread_num = options["-t"].as(); - +/* +void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& options, vector_mt& contigs){ WorkQueue* work_queue = new WorkQueue(); unordered_map_mt startKmer2unitig(1000); - int abundance_min = options["abundance_min"].as(); - boost::thread_group prod_threads; - for(int t = 0; t master_contigs; chunk dataChunk; while(seqFiles.getDataChunk(dataChunk)){ std::string line, seq; @@ -267,47 +500,224 @@ void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const boost::p break; } //cout<metadata->range, kmer_count) && cqf.count_key_value_set_traveled(kmer_RC_hash%cqf.qf->metadata->range, kmer_RC_count)){ + kmer_hash = NTPC64(kmer.c_str(), options.K, kmer_hash, kmer_RC_hash); + + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ continue; - }else if(kmer_count < abundance_min || kmer_RC_count < abundance_min){ + }else if(kmer_count < options.kmer_abundance_min){ continue; } - contig_id = contigs.push_back_mt(seq); - work_queue->add_skip_work(1); - startKmer2unitig.insert_mt(kmer, contig_id); //may not be the start k-mer - startKmer2unitig.insert_mt(kmer_RC, -contig_id); //may not be the end k-mer - - get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); - get_unitig_backward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); + //contig_id = contigs.push_back_mt(Contig(kmer, kmer_count)); + //work_queue->add_skip_work(1); + //startKmer2unitig.insert_mt(kmer, contig_id); //may not be the start k-mer + startKmer2unitig.insert_mt(kmer, 0);//fake id + //startKmer2unitig.insert_mt(kmer_RC, -contig_id); //may not be the end k-mer + Contig master_contig(kmer, kmer_count); + get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, master_contig); + master_contig.seq = RC_DNA(master_contig.seq); + get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, master_contig); + master_contigs.push_back(master_contig); + //std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + break; + //get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); + //contigs.set_mt(contig_id, Contig(RC_DNA(contigs[contig_id].seq), contigs[contig_id].median_abundance)); + //get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); + //get_unitig_backward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); //wait for the end of this run. - while(work_queue->work_done != work_queue->total_work){ - std::this_thread::sleep_for(std::chrono::milliseconds(1000));//sleep for 1 seconds + //while(work_queue->work_done != work_queue->total_work){ + // std::this_thread::sleep_for(std::chrono::milliseconds(1000));//sleep for 1 seconds + //} + } + //skip two lines + dataChunk.skipLines(2); + } + //if load detected is less than half. start processing a new chunk + while(work_queue->work_done != work_queue->total_work){ + //while(work_queue->total_work/work_queue->work_done) > ){ + if(work_queue->next_work > work_queue->total_work){ + cout<<"[test] stucked.."<next_work<<":"<work_done<<":"<total_work<master_done = true; + prod_threads.join_all(); + + contigs.insert(contigs.end(), master_contigs.begin(), master_contigs.end()); +} +*/ + +void processDataChunk(CQF_mt& cqf, const Params& options, concurrent_vector& contigs, unordered_set_mt& startKmers, WorkQueue* work_queue, chunk& dataChunk){ + string kmer; + uint64_t kmer_hash, kmer_RC_hash; + uint64_t kmer_count, kmer_RC_count; + std::string line, seq; + while(dataChunk.readLine(line)){ + //skip the header line + if(line.empty()){ + continue; + }else if(line[0] != '@'){ + continue; + } + if(!dataChunk.readLine(seq)){ + break; + } + if(seq.length()metadata->range, kmer_count)){ + continue; + }else if(kmer_count < options.solid_kmer_abundance_min || kmer_count > options.solid_kmer_abundance_max){ + continue; + } + + auto iter = contigs.push_back(Contig(kmer, kmer_count)); + to_upper_DNA(kmer); + startKmers.insert(kmer);//fake id + + get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); + iter->seq = RC_DNA(iter->seq); + get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); + } + //skip two lines + dataChunk.skipLines(2); + } + //free data chunk + free(dataChunk.get_reads()); +} + +void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& options, concurrent_vector& contigs){ + WorkQueue* work_queue = new WorkQueue(); + unordered_set_mt startKmers; + + boost::thread_group prod_threads; + for(int t = 0; t master_contigs; + chunk dataChunk; + while(seqFiles.getDataChunk(dataChunk)){ + std::string line, seq; + while(dataChunk.readLine(line)){ + //skip the header line + if(line.empty()){ + continue; + }else if(line[0] != '@'){ + continue; + } + if(!dataChunk.readLine(seq)){ + break; + } + //cout<metadata->range, kmer_count)){ + continue; + }else if(kmer_count < options.solid_kmer_abundance_min || kmer_count > options.solid_kmer_abundance_max){ + continue; } + + //contig_id = contigs.push_back_mt(Contig(kmer, kmer_count)); + //work_queue->add_skip_work(1); + auto iter = contigs.push_back(Contig(kmer, kmer_count)); + //startKmer2unitig.insert_mt(kmer, contig_id); //may not be the start k-mer + startKmers.insert(kmer);//fake id + //startKmer2unitig.insert_mt(kmer_RC, -contig_id); //may not be the end k-mer + // Contig master_contig(kmer, kmer_count); + // get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, master_contig); + // master_contig.seq = RC_DNA(master_contig.seq); + // get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, master_contig); + // master_contigs.push_back(master_contig); + //std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + //break; + get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); + //contigs.set(contig_id, Contig(RC_DNA(contigs[contig_id].seq), contigs[contig_id].median_abundance)); + iter->seq = RC_DNA(iter->seq); + get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); + //get_unitig_backward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); + + //std::this_thread::sleep_for(std::chrono::milliseconds(300)); + //wait for the end of this run. } //skip two lines dataChunk.skipLines(2); + //auto tmp_total = work_queue->total_work; + //auto tmp_done = work_queue->work_done; + while((work_queue->total_work - work_queue->work_done) > options.thread_num){ + //cout<<" "<work_done<<"/"<total_work<<"("<total_work-work_queue->work_done<<")"<work_done != work_queue->total_work){ + //while(work_queue->total_work/work_queue->work_done) > ){ + //if(work_queue->next_work > work_queue->total_work){ + // cout<<"[test] stucked.."<next_work<<":"<work_done<<":"<total_work<total_work > work_queue->work_done){ + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } work_queue->master_done = true; prod_threads.join_all(); + + //contigs.insert(contigs.end(), master_contigs.begin(), master_contigs.end()); } /* @@ -345,8 +755,8 @@ void find_unitigs_mt_worker(CQF_mt& cqf, const Params& params, vector_mt } } */ - -void find_unitigs_mt_worker(CQF_mt& cqf, const boost::program_options::variables_map& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue){ +/* +void find_unitigs_mt_worker(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue){ uint32_t contig_id=0; while(!work_queue->master_done){ if(!work_queue->get_next_work(contig_id)){ @@ -357,7 +767,36 @@ void find_unitigs_mt_worker(CQF_mt& cqf, const boost::program_options::variables work_queue->report_work_done(); } } - +*/ +/* +void find_unitigs_mt_worker(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_set_mt& startKmers, WorkQueue* work_queue){ + uint32_t contig_id=0; + while(!work_queue->master_done){ + if(!work_queue->get_next_work(contig_id)){ + std::this_thread::sleep_for(std::chrono::milliseconds(500));//sleep for 0.5 seconds + continue; + } + get_unitig_forward(cqf, options, contigs, startKmers, work_queue, contig_id); + work_queue->report_work_done(); + } +} +*/ +void find_unitigs_mt_worker(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& options, concurrent_vector& contigs, unordered_set_mt& startKmers, WorkQueue* work_queue){ + concurrent_vector::iterator contigIter; + chunk dataChunk; + while(!work_queue->master_done){ + if(!work_queue->get_next_work(contigIter)){ + if(seqFiles.getDataChunk(dataChunk)){ + processDataChunk(cqf, options, contigs, startKmers, work_queue, dataChunk); + }else{ + std::this_thread::sleep_for(std::chrono::milliseconds(500));//sleep for 0.5 seconds + } + }else{ + get_unitig_forward(cqf, options, contigs, startKmers, work_queue, contigIter); + work_queue->report_work_done(); + } + } +} /* void get_unitig_forward(CQF_mt& cqf, const Params& params, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id){ array candidates_before({false, false, false, false}), candidates_after({false, false, false, false}); @@ -469,70 +908,94 @@ void get_unitig_forward(CQF_mt& cqf, const Params& params, vector_mt& co } } */ - -void get_unitig_forward(CQF_mt& cqf, const boost::program_options::variables_map& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id){ +/* +void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, const int& contig_id){ array candidates_before({false, false, false, false}), candidates_after({false, false, false, false}); - array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; - auto abundance_min = options["abundance_min"].as(); - auto K = options["k"].as(); + //array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + array kmer_abundance_befores, kmer_abundance_afters; + auto abundance_min = options.kmer_abundance_min; + auto K = options.K; int candidates_before_num, candidates_after_num; int nodes_before_num, nodes_after_num; uint64_t kmer_hash, kmer_RC_hash, current_kmer_hash, current_kmer_RC_hash; - string kmer, kmer_RC, current_kmer; + string kmer, kmer_RC, current_kmer, current_kmer_RC, current_kmer_fix; uint64_t kmer_count, kmer_RC_count; int idx; - string contig_seq = contigs.at_mt(contig_id); + string contig_seq = contigs.at_mt(contig_id).seq; current_kmer = contig_seq.substr(contig_seq.length()-K); + current_kmer_RC = RC_DNA(current_kmer); + + std::vector abundances; + abundances.push_back(int(contigs.at_mt(contig_id).median_abundance)); + NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + int node_after_x, node_before_x;//useful only when there is only one node after and without candidates after. while(true){ - kmer_afters = kmers_after(current_kmer); - kmer_befores = kmers_before(kmer_afters[0]); + //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + current_kmer_fix = current_kmer.substr(1); + //kmer_afters = kmers_after(current_kmer); + //kmer_befores = kmers_before(kmer_afters[0]); candidates_before = candidates_after = {{false, false, false, false}}; candidates_before_num = candidates_after_num = 0; nodes_before_num = nodes_after_num = 0; + //kmers with current_kmer_fix as prefix for(int x = 0; x<4; x++){ - kmer = kmer_afters[x]; - kmer_RC = RC_DNA(kmer); - kmer_afters_RC[x] = kmer_RC; - - NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + //kmer = kmer_afters[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_afters_RC[x] = kmer_RC; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64(current_kmer[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); //kmer_hash = MurmurHash2(kmer); //kmer_RC_hash = MurmurHash2(kmer_RC); - if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count) && cqf.count_key_value_set_traveled(kmer_RC_hash%cqf.qf->metadata->range, kmer_RC_count)){ - if(startKmer2unitig.find_mt(kmer, idx)){ + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmer2unitig.find_mt(current_kmer_fix+DNA_bases[x], idx)){ nodes_after_num++; - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + node_after_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; candidates_after[x] = true; //possible because of hash collisions candidates_after_num ++; } - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; candidates_after[x] = true; candidates_after_num ++; } } + //kmers with RC(current_kmer_fix) as prefix + NTPC64(current_kmer[0], 'A', options.K, current_kmer_hash, current_kmer_RC_hash); + kmer = current_kmer_RC; for(int x = 0; x<4; x++){ - if(kmer_befores[x] == current_kmer){ + if(DNA_bases[x] == current_kmer_RC[K-1]){ continue; } - kmer = kmer_befores[x]; - kmer_RC = RC_DNA(kmer); - kmer_befores_RC[x] = kmer_RC; - - NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + //kmer = kmer_befores[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_befores_RC[x] = kmer_RC; + //kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + //kmer_hash = NTPC64(current_kmer_RC[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer = current_kmer_RC; + kmer[K-1] = DNA_bases[x]; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64('T', DNA_bases[x], options.K, kmer_RC_hash, kmer_hash); + //kmer_hash = NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); //kmer_hash = MurmurHash2(kmer); //kmer_RC_hash = MurmurHash2(kmer_RC); - if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count) && cqf.count_key_value_set_traveled(kmer_RC_hash%cqf.qf->metadata->range, kmer_RC_count)){ - if(startKmer2unitig.find_mt(kmer_RC, idx)){ - nodes_before_num++; - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmer2unitig.find_mt(kmer, idx)){ + nodes_before_num++; node_before_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; candidates_before[x] = true; candidates_before_num++; } - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; candidates_before[x] = true; candidates_before_num++; } @@ -548,24 +1011,27 @@ void get_unitig_forward(CQF_mt& cqf, const boost::program_options::variables_map // }else{ // throw logic_error("Unexpectedly not found start kmer!"); // } - startKmer2unitig.insert_mt(RC_DNA(current_kmer), -contig_id); - contigs.set_mt(contig_id, contig_seq); + startKmer2unitig.insert_mt(current_kmer_RC, -contig_id); + contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); for(int x= 0; x < 4; x++){ if(candidates_after[x]){ - if(!startKmer2unitig.find_mt(kmer_afters[x], idx)){ - int new_unitig_idx = contigs.push_back_mt(kmer_afters[x]); - startKmer2unitig.insert_mt(kmer_afters[x], new_unitig_idx); + kmer = current_kmer_fix+DNA_bases[x]; + if(!startKmer2unitig.find_mt(kmer, idx)){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_afters[x])); + startKmer2unitig.insert_mt(kmer, new_unitig_idx); work_queue->add_work(1); } } } + kmer = current_kmer_RC; for(int x = 0; x < 4; x++){ if(candidates_before[x]){ - if(!startKmer2unitig.find_mt(kmer_befores_RC[x], idx)){ - int new_unitig_idx = contigs.push_back_mt(kmer_befores_RC[x]); + kmer[K-1] = DNA_bases[x]; + if(!startKmer2unitig.find_mt(kmer, idx)){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_befores[x])); //startKmer2unitig[kmer_befores_RC[x]] = -new_unitig_idx; - startKmer2unitig.insert_mt(kmer_befores_RC[x], -new_unitig_idx); + startKmer2unitig.insert_mt(kmer, new_unitig_idx); work_queue->add_work(1); } } @@ -574,48 +1040,695 @@ void get_unitig_forward(CQF_mt& cqf, const boost::program_options::variables_map }else if(candidates_after_num==1){ //only one candidate k-mer after for(int x = 0; x<4; x++){ if(candidates_after[x]){ - current_kmer = kmer_afters[x]; - contig_seq += current_kmer.back(); + current_kmer = current_kmer_fix+DNA_bases[x]; + current_kmer_RC = RC_DNAbase(DNA_bases[x])+current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[x]; + abundances.push_back(kmer_abundance_afters[x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[x], options.K, current_kmer_hash, current_kmer_RC_hash); break; } } continue; + }else if (nodes_after_num==1){ + current_kmer = current_kmer_fix + DNA_bases[node_after_x]; + current_kmer_RC = RC_DNAbase(DNA_bases[node_after_x]) + current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[node_after_x]; + abundances.push_back(kmer_abundance_afters[node_after_x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[node_after_x], options.K, current_kmer_hash, current_kmer_RC_hash); }else{ //stop - startKmer2unitig.insert_mt(RC_DNA(current_kmer), -contig_id); - contigs.set_mt(contig_id, contig_seq); + startKmer2unitig.insert_mt(current_kmer_RC, -contig_id); + contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); break; } } } +void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_set_mt& startKmers, WorkQueue* work_queue, const int& contig_id){ + array candidates_before({false, false, false, false}), candidates_after({false, false, false, false}); + //array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + array kmer_abundance_befores, kmer_abundance_afters; + auto abundance_min = options.kmer_abundance_min; + auto K = options.K; -/* -void get_unitig_backward(CQF_mt& cqf, const Params& params, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id){ - array candidates_before, candidates_after; - array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; int candidates_before_num, candidates_after_num; int nodes_before_num, nodes_after_num; - uint64_t kmer_hash, kmer_RC_hash; - string kmer, kmer_RC, current_kmer; + uint64_t kmer_hash, kmer_RC_hash, current_kmer_hash, current_kmer_RC_hash; + string kmer, kmer_RC, current_kmer, current_kmer_RC, current_kmer_fix; uint64_t kmer_count, kmer_RC_count; int idx; + + string contig_seq = contigs.at_mt(contig_id).seq; + current_kmer = contig_seq.substr(contig_seq.length()-K); + current_kmer_RC = RC_DNA(current_kmer); - string contig_seq = contigs.at_mt(contig_id); - current_kmer = contig_seq.substr(0, params.K); + std::vector abundances; + abundances.push_back(int(contigs.at_mt(contig_id).median_abundance)); + + NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + int node_after_x, node_before_x;//useful only when there is only one node after and without candidates after. while(true){ + //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + current_kmer_fix = current_kmer.substr(1); + //kmer_afters = kmers_after(current_kmer); + //kmer_befores = kmers_before(kmer_afters[0]); candidates_before = candidates_after = {{false, false, false, false}}; - kmer_befores = kmers_before(current_kmer); - kmer_afters = kmers_after(kmer_befores[0]); candidates_before_num = candidates_after_num = 0; nodes_before_num = nodes_after_num = 0; - + + //kmers with current_kmer_fix as prefix for(int x = 0; x<4; x++){ - if(kmer_afters[x]==current_kmer){ - continue; - } - kmer = kmer_afters[x]; - kmer_RC = RC_DNA(kmer); - kmer_afters_RC[x] = kmer_RC; - + //kmer = kmer_afters[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_afters_RC[x] = kmer_RC; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64(current_kmer[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmers.find(current_kmer_fix+DNA_bases[x]) != startKmers.end()){ + nodes_after_num++; + kmer_abundance_afters[x] = kmer_count; + node_after_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; //possible because of hash collisions + candidates_after_num ++; + } + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; + candidates_after_num ++; + } + } + + //kmers with RC(current_kmer_fix) as prefix + NTPC64(current_kmer[0], 'A', options.K, current_kmer_hash, current_kmer_RC_hash); + kmer = current_kmer_RC; + for(int x = 0; x<4; x++){ + if(DNA_bases[x] == current_kmer_RC[K-1]){ + continue; + } + //kmer = kmer_befores[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_befores_RC[x] = kmer_RC; + //kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + //kmer_hash = NTPC64(current_kmer_RC[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer = current_kmer_RC; + kmer[K-1] = DNA_bases[x]; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64('T', DNA_bases[x], options.K, kmer_RC_hash, kmer_hash); + //kmer_hash = NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmers.find(kmer) != startKmers.end()){ + nodes_before_num++; node_before_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + } + + if((nodes_before_num + candidates_before_num) || (nodes_after_num+candidates_after_num)>1){ //no-linear extension + // if(startKmer2unitig.find_mt(contig_seq.substr(0, params.K), idx)){ + // //A contig has been constructed by another program from RC way + // if(abs(idx) != contig_id){ + // contigs.set_mt(contig_id, ""); + // break; + // } + // }else{ + // throw logic_error("Unexpectedly not found start kmer!"); + // } + startKmers.insert(current_kmer_RC); + contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + + for(int x= 0; x < 4; x++){ + if(candidates_after[x]){ + kmer = current_kmer_fix+DNA_bases[x]; + if(startKmers.find(kmer)==startKmers.end()){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_afters[x])); + startKmers.insert(kmer); + work_queue->add_work(1); + } + } + } + kmer = current_kmer_RC; + for(int x = 0; x < 4; x++){ + if(candidates_before[x]){ + kmer[K-1] = DNA_bases[x]; + if(startKmers.find(kmer) == startKmers.end()){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_befores[x])); + //startKmer2unitig[kmer_befores_RC[x]] = -new_unitig_idx; + startKmers.insert(kmer); + work_queue->add_work(1); + } + } + } + break; + }else if(candidates_after_num==1){ //only one candidate k-mer after + for(int x = 0; x<4; x++){ + if(candidates_after[x]){ + current_kmer = current_kmer_fix+DNA_bases[x]; + current_kmer_RC = RC_DNAbase(DNA_bases[x])+current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[x]; + abundances.push_back(kmer_abundance_afters[x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[x], options.K, current_kmer_hash, current_kmer_RC_hash); + break; + } + } + continue; + }else if (nodes_after_num==1){//be careful about circles + current_kmer = current_kmer_fix + DNA_bases[node_after_x]; + current_kmer_RC = RC_DNAbase(DNA_bases[node_after_x]) + current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[node_after_x]; + abundances.push_back(kmer_abundance_afters[node_after_x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[node_after_x], options.K, current_kmer_hash, current_kmer_RC_hash); + }else{ //stop + startKmers.insert(current_kmer_RC); + contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + break; + } + } +} + +void get_unitig_forward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, Contig& contig){ + int contig_id = 0;//fake contig id + + array candidates_before({false, false, false, false}), candidates_after({false, false, false, false}); + //array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + array kmer_abundance_befores, kmer_abundance_afters; + auto abundance_min = options.kmer_abundance_min; + auto K = options.K; + + int candidates_before_num, candidates_after_num; + int nodes_before_num, nodes_after_num; + uint64_t kmer_hash, kmer_RC_hash, current_kmer_hash, current_kmer_RC_hash; + string kmer, kmer_RC, current_kmer, current_kmer_RC, current_kmer_fix; + uint64_t kmer_count, kmer_RC_count; + int idx; + + string contig_seq = contig.seq; + current_kmer = contig_seq.substr(contig_seq.length()-K); + current_kmer_RC = RC_DNA(current_kmer); + + std::vector abundances; + abundances.push_back(int(contig.median_abundance)); + + NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + int node_after_x, node_before_x;//useful only when there is only one node after and without candidates after. + while(true){ + //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + current_kmer_fix = current_kmer.substr(1); + //kmer_afters = kmers_after(current_kmer); + //kmer_befores = kmers_before(kmer_afters[0]); + candidates_before = candidates_after = {{false, false, false, false}}; + candidates_before_num = candidates_after_num = 0; + nodes_before_num = nodes_after_num = 0; + + //kmers with current_kmer_fix as prefix + for(int x = 0; x<4; x++){ + //kmer = kmer_afters[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_afters_RC[x] = kmer_RC; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64(current_kmer[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmer2unitig.find_mt(current_kmer_fix+DNA_bases[x], idx)){ + nodes_after_num++; + kmer_abundance_afters[x] = kmer_count; + node_after_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; //possible because of hash collisions + candidates_after_num ++; + } + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; + candidates_after_num ++; + } + } + + //kmers with RC(current_kmer_fix) as prefix + NTPC64(current_kmer[0], 'A', options.K, current_kmer_hash, current_kmer_RC_hash); + kmer = current_kmer_RC; + for(int x = 0; x<4; x++){ + if(DNA_bases[x] == current_kmer_RC[K-1]){ + continue; + } + //kmer = kmer_befores[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_befores_RC[x] = kmer_RC; + //kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + //kmer_hash = NTPC64(current_kmer_RC[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer = current_kmer_RC; + kmer[K-1] = DNA_bases[x]; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64('T', DNA_bases[x], options.K, kmer_RC_hash, kmer_hash); + //kmer_hash = NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmer2unitig.find_mt(kmer, idx)){ + nodes_before_num++; node_before_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + } + + if((nodes_before_num + candidates_before_num) || (nodes_after_num+candidates_after_num)>1){ //no-linear extension + // if(startKmer2unitig.find_mt(contig_seq.substr(0, params.K), idx)){ + // //A contig has been constructed by another program from RC way + // if(abs(idx) != contig_id){ + // contigs.set_mt(contig_id, ""); + // break; + // } + // }else{ + // throw logic_error("Unexpectedly not found start kmer!"); + // } + startKmer2unitig.insert_mt(current_kmer_RC, -contig_id); + //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + contig.seq = contig_seq; + contig.median_abundance = median(abundances); + + for(int x= 0; x < 4; x++){ + if(candidates_after[x]){ + kmer = current_kmer_fix+DNA_bases[x]; + if(!startKmer2unitig.find_mt(kmer, idx)){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_afters[x])); + startKmer2unitig.insert_mt(kmer, new_unitig_idx); + work_queue->add_work(1); + } + } + } + kmer = current_kmer_RC; + for(int x = 0; x < 4; x++){ + if(candidates_before[x]){ + kmer[K-1] = DNA_bases[x]; + if(!startKmer2unitig.find_mt(kmer, idx)){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_befores[x])); + //startKmer2unitig[kmer_befores_RC[x]] = -new_unitig_idx; + startKmer2unitig.insert_mt(kmer, new_unitig_idx); + work_queue->add_work(1); + } + } + } + break; + }else if(candidates_after_num==1){ //only one candidate k-mer after + for(int x = 0; x<4; x++){ + if(candidates_after[x]){ + current_kmer = current_kmer_fix+DNA_bases[x]; + current_kmer_RC = RC_DNAbase(DNA_bases[x])+current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[x]; + abundances.push_back(kmer_abundance_afters[x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[x], options.K, current_kmer_hash, current_kmer_RC_hash); + break; + } + } + continue; + }else if (nodes_after_num==1){ + current_kmer = current_kmer_fix + DNA_bases[node_after_x]; + current_kmer_RC = RC_DNAbase(DNA_bases[node_after_x]) + current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[node_after_x]; + abundances.push_back(kmer_abundance_afters[node_after_x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[node_after_x], options.K, current_kmer_hash, current_kmer_RC_hash); + }else{ //stop + startKmer2unitig.insert_mt(current_kmer_RC, -contig_id); + //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + contig.seq = contig_seq; + contig.median_abundance = median(abundances); + break; + } + } +} +*/ +void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vector& contigs, unordered_set_mt& startKmers, WorkQueue* work_queue, concurrent_vector::iterator& contigIter){ + array candidates_before({false, false, false, false}), candidates_after({false, false, false, false}); + //array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + array kmer_abundance_befores, kmer_abundance_afters; + auto abundance_min = options.kmer_abundance_min; + auto K = options.K; + + int candidates_before_num, candidates_after_num; + int nodes_before_num, nodes_after_num; + uint64_t kmer_hash, kmer_RC_hash, current_kmer_hash, current_kmer_RC_hash; + string kmer, kmer_RC, current_kmer, current_kmer_RC, current_kmer_fix; + uint64_t kmer_count, kmer_RC_count; + int idx; + + string contig_seq = contigIter->seq; + current_kmer = contig_seq.substr(contig_seq.length()-K); + current_kmer_RC = RC_DNA(current_kmer); + + std::vector abundances; + abundances.push_back(int(contigIter->median_abundance)); + + NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + int node_after_x, node_before_x;//useful only when there is only one node after and without candidates after. + while(true){ + //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + current_kmer_fix = current_kmer.substr(1); + //kmer_afters = kmers_after(current_kmer); + //kmer_befores = kmers_before(kmer_afters[0]); + candidates_before = candidates_after = {{false, false, false, false}}; + candidates_before_num = candidates_after_num = 0; + nodes_before_num = nodes_after_num = 0; + + //kmers with current_kmer_fix as prefix + for(int x = 0; x<4; x++){ + //kmer = kmer_afters[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_afters_RC[x] = kmer_RC; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64(current_kmer[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + bool isTraveled=cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count); + if(kmer_count>= abundance_min){//if not traveled, then go + if(isTraveled && startKmers.find(current_kmer_fix+DNA_bases[x]) != startKmers.end()){ + nodes_after_num++; + kmer_abundance_afters[x] = kmer_count; + node_after_x = x; + //}else if(kmer_count >= abundance_min){ + }else{ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; //possible because of hash collisions + candidates_after_num ++; + } + } + } + + //kmers with RC(current_kmer_fix) as prefix + NTPC64(current_kmer[0], 'A', options.K, current_kmer_hash, current_kmer_RC_hash); + kmer = current_kmer_RC; + for(int x = 0; x<4; x++){ + if(DNA_bases[x] == current_kmer_RC[K-1]){ + continue; + } + //kmer = kmer_befores[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_befores_RC[x] = kmer_RC; + //kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + //kmer_hash = NTPC64(current_kmer_RC[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer = current_kmer_RC; + kmer[K-1] = DNA_bases[x]; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64('T', DNA_bases[x], options.K, kmer_RC_hash, kmer_hash); + //kmer_hash = NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + bool isTraveled = cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count); + if(kmer_count >= abundance_min){ + if(isTraveled && startKmers.find(kmer) != startKmers.end()){ + nodes_before_num++; node_before_x = x; + //}else if(kmer_count >= abundance_min){ + }else{ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + } + } + + if((nodes_before_num + candidates_before_num) || (nodes_after_num+candidates_after_num)>1){ //no-linear extension + // if(startKmer2unitig.find_mt(contig_seq.substr(0, params.K), idx)){ + // //A contig has been constructed by another program from RC way + // if(abs(idx) != contig_id){ + // contigs.set_mt(contig_id, ""); + // break; + // } + // }else{ + // throw logic_error("Unexpectedly not found start kmer!"); + // } + startKmers.insert(current_kmer_RC); + //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + contigIter->seq = contig_seq; + contigIter->median_abundance = median(abundances); + + for(int x= 0; x < 4; x++){ + if(candidates_after[x]){ + kmer = current_kmer_fix+DNA_bases[x]; + if(startKmers.find(kmer)==startKmers.end()){ + startKmers.insert(kmer); + auto it = contigs.push_back(Contig(kmer, kmer_abundance_afters[x])); + work_queue->add_work(it); + } + } + } + kmer = current_kmer_RC; + for(int x = 0; x < 4; x++){ + if(candidates_before[x]){ + kmer[K-1] = DNA_bases[x]; + if(startKmers.find(kmer) == startKmers.end()){ + startKmers.insert(kmer); + auto it = contigs.push_back(Contig(kmer, kmer_abundance_befores[x])); + //startKmer2unitig[kmer_befores_RC[x]] = -new_unitig_idx; + work_queue->add_work(it); + } + } + } + break; + }else if(candidates_after_num==1){ //only one candidate k-mer after + for(int x = 0; x<4; x++){ + if(candidates_after[x]){ + current_kmer = current_kmer_fix+DNA_bases[x]; + current_kmer_RC = RC_DNAbase(DNA_bases[x])+current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[x]; + abundances.push_back(kmer_abundance_afters[x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[x], options.K, current_kmer_hash, current_kmer_RC_hash); + break; + } + } + continue; + }else if (nodes_after_num==1){//be careful about circles + current_kmer = current_kmer_fix + DNA_bases[node_after_x]; + current_kmer_RC = RC_DNAbase(DNA_bases[node_after_x]) + current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[node_after_x]; + abundances.push_back(kmer_abundance_afters[node_after_x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[node_after_x], options.K, current_kmer_hash, current_kmer_RC_hash); + }else{ //stop + startKmers.insert(current_kmer_RC); + contigIter->seq = contig_seq; + contigIter->median_abundance = median(abundances); + //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + break; + } + } +} +/* +void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vector& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, Contig& contig){ + int contig_id = 0;//fake contig id + + array candidates_before({false, false, false, false}), candidates_after({false, false, false, false}); + //array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + array kmer_abundance_befores, kmer_abundance_afters; + auto abundance_min = options.kmer_abundance_min; + auto K = options.K; + + int candidates_before_num, candidates_after_num; + int nodes_before_num, nodes_after_num; + uint64_t kmer_hash, kmer_RC_hash, current_kmer_hash, current_kmer_RC_hash; + string kmer, kmer_RC, current_kmer, current_kmer_RC, current_kmer_fix; + uint64_t kmer_count, kmer_RC_count; + int idx; + + string contig_seq = contig.seq; + current_kmer = contig_seq.substr(contig_seq.length()-K); + current_kmer_RC = RC_DNA(current_kmer); + + std::vector abundances; + abundances.push_back(int(contig.median_abundance)); + + NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + int node_after_x, node_before_x;//useful only when there is only one node after and without candidates after. + while(true){ + //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + current_kmer_fix = current_kmer.substr(1); + //kmer_afters = kmers_after(current_kmer); + //kmer_befores = kmers_before(kmer_afters[0]); + candidates_before = candidates_after = {{false, false, false, false}}; + candidates_before_num = candidates_after_num = 0; + nodes_before_num = nodes_after_num = 0; + + //kmers with current_kmer_fix as prefix + for(int x = 0; x<4; x++){ + //kmer = kmer_afters[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_afters_RC[x] = kmer_RC; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64(current_kmer[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmer2unitig.find_mt(current_kmer_fix+DNA_bases[x], idx)){ + nodes_after_num++; + kmer_abundance_afters[x] = kmer_count; + node_after_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; //possible because of hash collisions + candidates_after_num ++; + } + }else if(kmer_count >= abundance_min){ + kmer_abundance_afters[x] = kmer_count; + candidates_after[x] = true; + candidates_after_num ++; + } + } + + //kmers with RC(current_kmer_fix) as prefix + NTPC64(current_kmer[0], 'A', options.K, current_kmer_hash, current_kmer_RC_hash); + kmer = current_kmer_RC; + for(int x = 0; x<4; x++){ + if(DNA_bases[x] == current_kmer_RC[K-1]){ + continue; + } + //kmer = kmer_befores[x]; + //kmer_RC = RC_DNA(kmer); + //kmer_befores_RC[x] = kmer_RC; + //kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + //kmer_hash = NTPC64(current_kmer_RC[0], DNA_bases[x], K, kmer_hash, kmer_RC_hash); + //kmer = current_kmer_RC; + kmer[K-1] = DNA_bases[x]; + kmer_hash = current_kmer_hash; kmer_RC_hash = current_kmer_RC_hash; + kmer_hash = NTPC64('T', DNA_bases[x], options.K, kmer_RC_hash, kmer_hash); + //kmer_hash = NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + //kmer_hash = MurmurHash2(kmer); + //kmer_RC_hash = MurmurHash2(kmer_RC); + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ + if(startKmer2unitig.find_mt(kmer, idx)){ + nodes_before_num++; node_before_x = x; + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + }else if(kmer_count >= abundance_min){ + kmer_abundance_befores[x] = kmer_count; + candidates_before[x] = true; + candidates_before_num++; + } + } + + if((nodes_before_num + candidates_before_num) || (nodes_after_num+candidates_after_num)>1){ //no-linear extension + // if(startKmer2unitig.find_mt(contig_seq.substr(0, params.K), idx)){ + // //A contig has been constructed by another program from RC way + // if(abs(idx) != contig_id){ + // contigs.set_mt(contig_id, ""); + // break; + // } + // }else{ + // throw logic_error("Unexpectedly not found start kmer!"); + // } + startKmer2unitig.insert_mt(current_kmer_RC, -contig_id); + //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + contig.seq = contig_seq; + contig.median_abundance = median(abundances); + + for(int x= 0; x < 4; x++){ + if(candidates_after[x]){ + kmer = current_kmer_fix+DNA_bases[x]; + if(!startKmer2unitig.find_mt(kmer, idx)){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_afters[x])); + startKmer2unitig.insert_mt(kmer, new_unitig_idx); + work_queue->add_work(1); + } + } + } + kmer = current_kmer_RC; + for(int x = 0; x < 4; x++){ + if(candidates_before[x]){ + kmer[K-1] = DNA_bases[x]; + if(!startKmer2unitig.find_mt(kmer, idx)){ + int new_unitig_idx = contigs.push_back_mt(Contig(kmer, kmer_abundance_befores[x])); + //startKmer2unitig[kmer_befores_RC[x]] = -new_unitig_idx; + startKmer2unitig.insert_mt(kmer, new_unitig_idx); + work_queue->add_work(1); + } + } + } + break; + }else if(candidates_after_num==1){ //only one candidate k-mer after + for(int x = 0; x<4; x++){ + if(candidates_after[x]){ + current_kmer = current_kmer_fix+DNA_bases[x]; + current_kmer_RC = RC_DNAbase(DNA_bases[x])+current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[x]; + abundances.push_back(kmer_abundance_afters[x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[x], options.K, current_kmer_hash, current_kmer_RC_hash); + break; + } + } + continue; + }else if (nodes_after_num==1){ + current_kmer = current_kmer_fix + DNA_bases[node_after_x]; + current_kmer_RC = RC_DNAbase(DNA_bases[node_after_x]) + current_kmer_RC.substr(0, K-1); + contig_seq += DNA_bases[node_after_x]; + abundances.push_back(kmer_abundance_afters[node_after_x]); + NTPC64('T', 'A', options.K, current_kmer_RC_hash, current_kmer_hash); + NTPC64('T', DNA_bases[node_after_x], options.K, current_kmer_hash, current_kmer_RC_hash); + }else{ //stop + startKmer2unitig.insert_mt(current_kmer_RC, -contig_id); + //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); + contig.seq = contig_seq; + contig.median_abundance = median(abundances); + break; + } + } +} +*/ + +/* +void get_unitig_backward(CQF_mt& cqf, const Params& params, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id){ + array candidates_before, candidates_after; + array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + int candidates_before_num, candidates_after_num; + int nodes_before_num, nodes_after_num; + uint64_t kmer_hash, kmer_RC_hash; + string kmer, kmer_RC, current_kmer; + uint64_t kmer_count, kmer_RC_count; + int idx; + + string contig_seq = contigs.at_mt(contig_id); + current_kmer = contig_seq.substr(0, params.K); + while(true){ + candidates_before = candidates_after = {{false, false, false, false}}; + kmer_befores = kmers_before(current_kmer); + kmer_afters = kmers_after(kmer_befores[0]); + candidates_before_num = candidates_after_num = 0; + nodes_before_num = nodes_after_num = 0; + + for(int x = 0; x<4; x++){ + if(kmer_afters[x]==current_kmer){ + continue; + } + kmer = kmer_afters[x]; + kmer_RC = RC_DNA(kmer); + kmer_afters_RC[x] = kmer_RC; + NTPC64(kmer.c_str(), params.K, kmer_hash, kmer_RC_hash); //kmer_hash = MurmurHash2(kmer); //kmer_RC_hash = MurmurHash2(kmer_RC); @@ -698,25 +1811,30 @@ void get_unitig_backward(CQF_mt& cqf, const Params& params, vector_mt& c } */ -void get_unitig_backward(CQF_mt& cqf, const boost::program_options::variables_map& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id){ - auto abundance_min = options["abundance_min"].as(); - auto K = options["k"].as(); +/* +void get_unitig_backward(CQF_mt& cqf, const Params& options, vector_mt& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id){ + auto abundance_min = options.kmer_abundance_min; + auto K = options.K; array candidates_before, candidates_after; - array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; + //array kmer_befores, kmer_afters, kmer_befores_RC, kmer_afters_RC; int candidates_before_num, candidates_after_num; int nodes_before_num, nodes_after_num; uint64_t kmer_hash, kmer_RC_hash; string kmer, kmer_RC, current_kmer; - uint64_t kmer_count, kmer_RC_count; + uint64_t kmer_count, kmer_RC_count; int idx; string contig_seq = contigs.at_mt(contig_id); current_kmer = contig_seq.substr(0, K); + current_kmer_RC = RC_DNA(current_kmer); + while(true){ + + candidates_before = candidates_after = {{false, false, false, false}}; - kmer_befores = kmers_before(current_kmer); - kmer_afters = kmers_after(kmer_befores[0]); + //kmer_befores = kmers_before(current_kmer); + //kmer_afters = kmers_after(kmer_befores[0]); candidates_before_num = candidates_after_num = 0; nodes_before_num = nodes_after_num = 0; @@ -729,16 +1847,19 @@ void get_unitig_backward(CQF_mt& cqf, const boost::program_options::variables_ma kmer_afters_RC[x] = kmer_RC; NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + if(kmer_hash > kmer_RC_hash){ + kmer_hash = kmer_RC_hash; + } //kmer_hash = MurmurHash2(kmer); //kmer_RC_hash = MurmurHash2(kmer_RC); - if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count) && cqf.count_key_value_set_traveled(kmer_RC_hash%cqf.qf->metadata->range, kmer_RC_count)){ + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ if(startKmer2unitig.find_mt(kmer, idx)){ nodes_after_num++; - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + }else if(kmer_count >= abundance_min){ candidates_after[x] = true; candidates_after_num ++; } - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + }else if(kmer_count >= abundance_min){ candidates_after[x] = true; candidates_after_num ++; } @@ -749,16 +1870,19 @@ void get_unitig_backward(CQF_mt& cqf, const boost::program_options::variables_ma kmer_befores_RC[x] = kmer_RC; NTPC64(kmer.c_str(), K, kmer_hash, kmer_RC_hash); + if(kmer_hash > kmer_RC_hash){ + kmer_hash = kmer_RC_hash; + } //kmer_hash = MurmurHash2(kmer); //kmer_RC_hash = MurmurHash2(kmer_RC); - if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count) && cqf.count_key_value_set_traveled(kmer_RC_hash%cqf.qf->metadata->range, kmer_RC_count)){ + if(cqf.count_key_value_set_traveled(kmer_hash%cqf.qf->metadata->range, kmer_count)){ if(startKmer2unitig.find_mt(kmer_RC, idx)){ nodes_before_num++; - }else if (kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + }else if (kmer_count >= abundance_min){ candidates_before[x] = true; candidates_before_num++; } - }else if(kmer_count >= abundance_min && kmer_RC_count >= abundance_min){ + }else if(kmer_count >= abundance_min){ candidates_before[x] = true; candidates_before_num++; } @@ -811,6 +1935,7 @@ void get_unitig_backward(CQF_mt& cqf, const boost::program_options::variables_ma } } } +*/ #if 0 void find_unitigs_hash_mt(CQF_mt& cqf, const Sequences& seqs, const Params& params, vector& contigs){