kNumHits = 6;
88 const TGisubjid[kNumHits] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
89 const doublescores[kNumHits] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
91 for(
intindex=0; index<kNumHits; index++)
93pair<uint32_t, double> retval(index, scores[index]);
94prelim_vector.push_back(retval);
107 for(TSeqLocVector::iterator iter=tsl.begin(); iter != tsl.end(); ++iter)
109BOOST_REQUIRE_EQUAL((*iter).seqloc->GetId()->GetGi(), subjid[index]);
115 for(TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
118BOOST_REQUIRE_EQUAL(sid->
GetGi(), subjid[index]);
119BOOST_REQUIRE_EQUAL((*iter).second, scores[index]);
130 const intkNumQueries = 2;
131 const intkNumHits1 = 6;
132 const intkNumHits2 = 2;
134 const TGisubjid1[kNumHits1] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
135 const TGisubjid2[kNumHits2] = { 129295, 677974076};
136 const doublescores1[kNumHits1] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
137 const doublescores2[kNumHits1] = {0.359375, 0.710938};
140 for(
intindex=0; index<kNumHits1; index++)
142pair<uint32_t, double> retval(index, scores1[index]);
143prelim_vector1.push_back(retval);
146 for(
intindex=0; index<kNumHits2; index++)
148pair<uint32_t, double> retval(index+4, scores2[index]);
149prelim_vector2.push_back(retval);
152vec_set.push_back(prelim_vector1);
153vec_set.push_back(prelim_vector2);
157id_vec.push_back(qid1);
158id_vec.push_back(qid2);
166stats_vec.push_back(stats1);
167stats_vec.push_back(stats2);
169errs.resize(kNumQueries);
173BOOST_REQUIRE_EQUAL(result_set.
GetNumQueries(), kNumQueries);
179 for(TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
182BOOST_REQUIRE_EQUAL(sid->
GetGi(), subjid1[index]);
183BOOST_REQUIRE_EQUAL((*iter).second, scores1[index]);
195 for(TBlastKmerScoreVector::const_iterator iter=kmerscores2.begin(); iter != kmerscores2.end(); ++iter)
198BOOST_REQUIRE_EQUAL(sid->
GetGi(), subjid2[index]);
199BOOST_REQUIRE_EQUAL((*iter).second, scores2[index]);
205BOOST_REQUIRE(resultsNotNULL.
IsNull() ==
false);
210BOOST_REQUIRE(resultsNull.
IsNull() ==
true);
215 const intkNumQueries = 2;
216 const intkNumHits1 = 6;
217 const intkNumHits2 = 2;
219 const TGisubjid1[kNumHits1] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
220 const TGisubjid2[kNumHits2] = { 129295, 677974076};
221 const doublescores1[kNumHits1] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
222 const doublescores2[kNumHits1] = {0.359375, 0.710938};
225 for(
intindex=0; index<kNumHits1; index++)
227pair<uint32_t, double> retval(index, scores1[index]);
228prelim_vector1.push_back(retval);
231 for(
intindex=0; index<kNumHits2; index++)
233pair<uint32_t, double> retval(index+4, scores2[index]);
234prelim_vector2.push_back(retval);
255BOOST_REQUIRE_EQUAL(result_set.
GetNumQueries(), kNumQueries);
261 for(TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
264BOOST_REQUIRE_EQUAL(sid->
GetGi(), subjid1[index]);
265BOOST_REQUIRE_EQUAL((*iter).second, scores1[index]);
277 for(TBlastKmerScoreVector::const_iterator iter=kmerscores2.begin(); iter != kmerscores2.end(); ++iter)
280BOOST_REQUIRE_EQUAL(sid->
GetGi(), subjid2[index]);
281BOOST_REQUIRE_EQUAL((*iter).second, scores2[index]);
403 for(
int i=0;
i<numHashes;
i++)
409 while(
a[
i] == 0);
418 stringqueryseq_eaa =
419 "MDSISVTNAKFCFDVFNEMKVHHVNENILYCPLSILTALAMVYLGARGNTESQMKKVLHFDSITGAGSTTDSQCGSSEYV" 420 "HNLFKELLSEITRPNATYSLEIADKLYVDKTFSVLPEYLSCARKFYTGGVEEVNFKTAAEEARQLINSWVEKETNGQIKD" 421 "LLVSSSIDFGTTMVFINTIYFKGIWKIAFNTEDTREMPFSMTKEESKPVQMMCMNNSFNVATLPAEKMKILELPYASGDL";
423 stringqueryseq_stdaa;
427 const intkNumHashes=128;
429 const intkKmerNum=5;
430 const intkAlphabet=0;
436vector < vector <uint32_t> > seq_hash;
440BOOST_REQUIRE_EQUAL(seq_hash.size(), 2);
441BOOST_REQUIRE_EQUAL(seq_hash[0].
size(), kNumHashes);
442BOOST_REQUIRE_EQUAL(seq_hash[0][0], 529895);
443BOOST_REQUIRE_EQUAL(seq_hash[0][1], 798115);
444BOOST_REQUIRE_EQUAL(seq_hash[0][63], 90979);
445BOOST_REQUIRE_EQUAL(seq_hash[0][83], 336201);
448 const intlsh_hash_length = 13;
449 const intlsh_hash_vals[lsh_hash_length] = { 973119,1097197,1157729,1681152,1913970,1933659,2018075,2123893,2355301,2800673,2940688,2941967,3535701};
451 const intkRowsPerBand=2;
452 const intkNumBands = kNumHashes/kRowsPerBand;
453vector< vector <uint32_t> > lsh_hash_vec;
456 intnumChunks = lsh_hash_vec.size();
457 for(
int i=0;
i<numChunks && index<lsh_hash_length;
i++)
459 for(vector<uint32_t>::iterator iter=lsh_hash_vec[
i].begin(); iter != lsh_hash_vec[
i].end(); ++iter)
461BOOST_REQUIRE_EQUAL(*iter, lsh_hash_vals[index]);
463 if(index == lsh_hash_length)
471 stringqueryseq_eaa =
472 "MDSISVTNAKFCFDVFNEMKVHHVNENILYCPLSILTALAMVYLGARGNTESQMKKVLHFDSITGAGSTTDSQCGSSEYV" 473 "HNLFKELLSEITRPNATYSLEIADKLYVDKTFSVLPEYLSCARKFYTGGVEEVNFKTAAEEARQLINSWVEKETNGQIKD" 474 "LLVSSSIDFGTTMVFINTIYFKGIWKIAFNTEDTREMPFSMTKEESKPVQMMCMNNSFNVATLPAEKMKILELPYASGDL";
476 stringqueryseq_stdaa;
480 const intkNumHashes=32;
481 const intkKmerNum=5;
482 const intkAlphabet=0;
486vector < vector <uint32_t> > seq_hash;
490BOOST_REQUIRE_EQUAL(seq_hash.size(), 2);
491BOOST_REQUIRE_EQUAL(seq_hash[0].
size(), kNumHashes);
492BOOST_REQUIRE_EQUAL(seq_hash[0][0], 2683052);
493BOOST_REQUIRE_EQUAL(seq_hash[0][1], 26519505);
494BOOST_REQUIRE_EQUAL(seq_hash[0][2], 45619224);
495BOOST_REQUIRE_EQUAL(seq_hash[0][15], 396863844);
497BOOST_REQUIRE(seq_hash[1][0] < seq_hash[0][1]);
498BOOST_REQUIRE(seq_hash[1][1] < seq_hash[0][2]);
499BOOST_REQUIRE(seq_hash[1][3] < seq_hash[0][kNumHashes-1]);
502 const intlsh_hash_length = 13;
503 const intlsh_hash_vals[lsh_hash_length] = { 700168,1293774,1377419,1712432,1819660,2314660,2484152,2944352,2951476,3273866,3625878,3709806,3837843};
505 const intkRowsPerBand=2;
506vector< vector <uint32_t> > lsh_hash_vec;
509 intnumChunks = lsh_hash_vec.size();
510 for(
int i=0;
i<numChunks && index<lsh_hash_length;
i++)
512 for(vector<uint32_t>::iterator iter=lsh_hash_vec[
i].begin(); iter != lsh_hash_vec[
i].end(); ++iter)
514BOOST_REQUIRE_EQUAL(*iter, lsh_hash_vals[index]);
516 if(index == lsh_hash_length)
525 for(
intindex=0; index<lshSize; index++)
538 const intkHashFct=32;
539 const intkKmerSize=5;
543 stringindex_name(
"nr_test");
546build_index.
Build();
552BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.
GetKmerSize());
556BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
559BOOST_REQUIRE_EQUAL(187, lsh_counts);
566 const intkHashFct=32;
567 const intkKmerSize=5;
571 stringindex_name(
"XP_001468867");
574build_index.
Build();
579BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.
GetKmerSize());
583BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
586BOOST_REQUIRE_EQUAL(213, lsh_counts);
594 const intkHashFct=32;
595 const intkKmerSize=5;
599 stringindex_name(
"manyXs");
602build_index.
Build();
614 const intkHashFct=32;
615 const intkKmerSize=4;
620 stringindex_name(
"nr_test");
623build_index.
Build();
628BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.
GetKmerSize());
632BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
635BOOST_REQUIRE_EQUAL(172, lsh_counts);
643 const intkHashFct=64;
644 const intkKmerSize=4;
649 stringindex_name(
"nr_test");
652build_index.
Build();
657BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.
GetKmerSize());
661BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
664BOOST_REQUIRE_EQUAL(339, lsh_counts);
672 const intkHashFct=32;
673 const intkKmerSize=5;
675 const intkAlphabet=1;
679 stringindex_name(
"nr_test");
682build_index.
Build();
687BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.
GetKmerSize());
688BOOST_REQUIRE_EQUAL(kAlphabet, mhfile.
GetAlphabet());
692BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
695BOOST_REQUIRE_EQUAL(173, lsh_counts);
703 const intkHashFct=32;
704 const intkKmerSize=5;
705 const intkSamples=30;
707 const intkAlphabet=1;
709 const intkLSHStart=312;
713 stringindex_name(
"nr_test");
716build_index.
Build();
721BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.
GetKmerSize());
722BOOST_REQUIRE_EQUAL(kAlphabet, mhfile.
GetAlphabet());
724BOOST_REQUIRE_EQUAL(kLSHStart, mhfile.
GetLSHStart());
728BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
731BOOST_REQUIRE_EQUAL(563, lsh_counts);
734BOOST_REQUIRE_EQUAL(150, chunkSize);
740 stringindex_name=
"";
757unique_ptr<SSeqLoc> ssl(
new SSeqLoc(*loc, *scope));
759query_vector.push_back(*ssl);
763 CBlastKmerkmersearch(query_vector, options, seqdb);
773BOOST_REQUIRE_EQUAL(opts->
GetThresh(), myThresh);
777BOOST_REQUIRE_EQUAL(opts->
GetMinHits(), hits);
783BOOST_REQUIRE_EQUAL(opts->
Validate(),
true);
786BOOST_REQUIRE_EQUAL(opts->
Validate(),
true);
789BOOST_REQUIRE_EQUAL(opts->
Validate(),
false);
792BOOST_REQUIRE_EQUAL(opts->
Validate(),
false);
810unique_ptr<SSeqLoc> ssl(
new SSeqLoc(*loc, *scope));
812query_vector.push_back(*ssl);
Declares the CBlastAdvancedProteinOptionsHandle class.
Interface for converting sources of sequence data into blast sequence input.
Declares the CBlastProteinOptionsHandle class.
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
vector< pair< CRef< CSeq_id >, double > > TBlastKmerScoreVector
Vector of pairs of seq-ids and scores.
vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector
Vector of pairs of database OIDs and scores.
void get_LSH_hashes(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
void get_LSH_hashes5(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
Gets the LSH hash for one hash function.
bool minhash_query2(const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
Hash the query for the minimum values.
bool minhash_query(const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
void Build(int numThreads=1)
Build the index.
Class of options for the KMER search.
void SetNumTargetSeqs(int matches)
Sets the number of matches (subject sequences) to return.
void SetThresh(double thresh)
Set the threshold.
void SetMinHits(int minhits)
Set the minimum number of LSH hits to initiate a calculation of the Jaccard distance.
int GetNumTargetSeqs() const
Gets the number of matches (subject sequences) to return.
double GetThresh() const
Get the threshold.
int GetMinHits() const
Get the number of LSH hits to initiate the calculation of the Jaccard distance.
bool Validate() const
Checks that options are valid.
This class holds one or more CBlastKmerResults.
size_t GetNumQueries() const
Returns the number of queries.
vector< BlastKmerStats > TBlastKmerStatsVector
Vector of KmerStats.
void push_back(value_type &element)
Add an element to m_Results.
vector< CConstRef< objects::CSeq_id > > TQueryIdVector
List of query ids.
vector< TBlastKmerPrelimScoreVector > TBlastKmerPrelimScoreVectorSet
Vector of TBlastKmerPrelimScoreVector (scores)
This class represents the results for one KMER search (one query).
Class to perform a KMER-BLASTP search.
CRef< CBlastKmerResultsSet > Run()
Performs search on one or more queries.
Access data in Minhash files.
int GetLSHSize(void) const
int GetNumHashes(void) const
Returns the number of values in an array of hashes (probably 32)
uint64_t * GetLSHArray(void) const
int GetVersion(void) const
int GetDataWidth(void) const
int GetChunkSize(void) const
Get number of letters in a chunk (version 3 or higher)
int GetKmerSize(void) const
Returns the length of the KMER.
int GetAlphabet(void) const
One of two alphabets from Shiryev et al.
int GetLSHStart(void) const
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
static CRef< CScope > NewScope(bool with_defaults=true)
Return a new scope, possibly (by default) with default loaders, which will include the Genbank loader...
Class for the messages for an individual query sequence.
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
static void Add(const string &path)
Add the name of a dir entry; it will be deleted on (normal) exit.
@ eSerial_AsnText
ASN.1 text.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
bool IsNull(void) const THROWS_NONE
Check if pointer is null -- same effect as Empty().
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
TGi GetGi(void) const
Get the variant data.
const TId & GetId(void) const
Get the Id member data.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
BOOST_AUTO_TEST_CASE(KmerResults)
void s_GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)
int s_GetNumLSHHits(uint64_t *lsh, int lshSize)
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Structure for ancillary data on KMER search.
int jd_count
How often was the Jaccard distance calculated.
int hit_count
How many hits to the hash array were there?
Structure to represent a single sequence to be fed to BLAST.
Utility stuff for more convenient using of Boost.Test library.
Uniform BLAST Search Interface.
static const char * kWidth
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4