: m_QueryVector(query_vector),
53m_KmerFiles.push_back(kmerfile);
55seqdb->FindVolumePaths(m_KmerFiles,
false);
57 if(options->Validate() ==
false)
86 boolkmerFound =
false;
98 else if(kmerParams.
version== 2)
116vector< set<uint32_t > > candidates;
117candidates.resize(query_hash.size());
148 CSeqVectorseqvect(*(query_vector[queryNum].seqloc), *(query_vector[queryNum].scope));
152seqid->
Assign(*(query_vector[queryNum].seqloc->GetId()));
158 for(TBlastKmerPrelimScoreVector::iterator iter=score_vector.begin(); iter != score_vector.end(); ++iter)
167 returnone.second > two.second;
173 for(TBlastKmerPrelimScoreVector::iterator itr=
results.begin(); itr !=
results.end(); ++itr)
175seqdb->
GetGis((*itr).first, retvalue,
true);
184 intnumFiles =
static_cast<int>(
m_KmerFiles.size());
185 if(numThreads > numQuery)
186numThreads = numFiles;
188vector<SOneBlastKmerSearch> kmerSearchVector;
189kmerSearchVector.reserve(numQuery);
190 for(
int i=0;
i<numQuery;
i++)
197 if(query_seq.length() <
static_cast<string::size_type
>(kmerParams.
kmerNum))
200kmerSearch.
qSeqid= qseqid;
201 x_ProcessQuery(query_seq, kmerSearch, kmerParams,
a,
b, kValues, badMers);
202}
catch(
constncbi::CException& e) {
204 string msg= e.GetMsg();
206 if(
msg.find(
"WARNING:") != std::string::npos)
210}
catch(
conststd::exception& e) {
219kmerSearchVector.push_back(kmerSearch);
222 #pragma omp parallel for num_threads(numThreads) 223 for(
intindex=0; index<numFiles; index++)
226 for(
int i=0;
i<numQuery;
i++)
237 for(
int i=0;
i<numQuery;
i++)
244kmerResultSet->push_back(kmerResults);
252 for(
intindex=0; index<numFiles; index++)
253final_size += kmerSearch.
scoreVector[index].size();
254final_results.reserve(final_size);
257 for(
intindex=0; index<numFiles; index++)
265final_results.insert(final_results.end(), score_vector.begin(), score_vector.end());
280 intvec_size =
static_cast<int>( final_results.size() );
282 if(vec_size > num_matches)
283final_results.erase(final_results.begin()+num_matches, final_results.end());
293 if(intersect->
Size() > 0)
296final_results.erase(final_results.begin(), final_results.end());
304 if(intersect->
Size() > 0)
307final_results.erase(final_results.begin(), final_results.end());
313kmerResultSet->push_back(kmerResults);
315 returnkmerResultSet;
333 introws_per_band = mhfile.
GetRows();
338vector<uint32_t>
a(num_hashes);
339vector<uint32_t>
b(num_hashes);
344 a[0] =random_nums[0];
345 b[0] =random_nums[1];
349 for(
int i=0;
i<num_hashes;
i++)
350 a[
i] = random_nums[
i];
351 for(
int i=0;
i<num_hashes;
i++)
352 b[
i] = random_nums[
i+num_hashes];
355vector < vector<int> > kValues;
359 unsigned char* kvaluesArray = mhfile.
GetKValues();
360 for(
int i=0;
i<samples;
i++)
363 for(
intj=0; j<rows_per_band; j++)
364temp.push_back(kvaluesArray[total++]);
365kValues.push_back(temp);
369 SBlastKmerParameterskmerParams(num_hashes, rows_per_band, samples, kmerNum, alphabetChoice, kmerVer);
377 returnkmerResultsSet;
void s_GetAllGis(vector< TGi > &retvalue, TBlastKmerPrelimScoreVector results, CRef< CSeqDB > seqdb)
static void s_AdjustPrelimScoreVectorOID(TBlastKmerPrelimScoreVector &score_vector, int offset)
bool s_SortFinalResults(const pair< uint32_t, double > &one, const pair< uint32_t, double > &two)
static void s_GetQuerySequence(const TSeqLocVector &query_vector, string &query_seq, CRef< CSeq_id > &seqid, int queryNum)
CRef< CBlastKmerResults > MakeEmptyResults(TSeqLocVector &queryVector, int queryNum, const string &errMsg, EBlastSeverity severity=eBlastSevError)
Empty results (use on error)
vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector
Vector of pairs of database OIDs and scores.
void neighbor_query(const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVersion)
void get_LSH_hashes(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
void get_LSH_hashes5(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
Gets the LSH hash for one hash function.
bool minhash_query2(const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
Hash the query for the minimum values.
void get_LSH_match_from_hash(const vector< vector< uint32_t > > &lsh_hash_vec, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)
bool minhash_query(const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
void get_LSH_hashes2(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kValues)
Class of options for the KMER search.
int GetNumTargetSeqs() const
Gets the number of matches (subject sequences) to return.
double GetThresh() const
Get the threshold.
int GetMinHits() const
Get the number of LSH hits to initiate the calculation of the Jaccard distance.
bool Validate() const
Checks that options are valid.
This class holds one or more CBlastKmerResults.
This class represents the results for one KMER search (one query).
Class to perform a KMER-BLASTP search.
CRef< CBlastKmerOptions > m_Opts
Specifies values for some options (e.g., threshold)
CRef< CSeqDBNegativeList > m_NegGIList
Negative GIList to limit search by.
TSeqLocVector m_QueryVector
Holds the query seqloc and scope.
CRef< CBlastKmerResultsSet > Run()
Performs search on one or more queries.
void x_RunKmerFile(const vector< vector< uint32_t > > &query_hash, const vector< vector< uint32_t > > &query_LSH_hash, CMinHashFile &mhfile, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats)
Search individual kmer file.
CRef< CBlastKmerResultsSet > x_SearchMultipleQueries(int firstQuery, int numQuery, const SBlastKmerParameters &kmerParams, uint32_t *a, uint32_t *b, vector< vector< int > > &kValues, vector< int > badMers)
Search multiple queries.
CRef< CSeqDBGiList > m_GIList
GIList to limit search by.
CRef< CBlastKmerResultsSet > RunSearches()
CBlastKmer(TSeqLocVector &query_vector, CRef< CBlastKmerOptions > options, CRef< CSeqDB > seqdb, string kmerfile=kEmptyStr)
Constructor. Processes all proteins in TSeqLocVector.
CRef< CSeqDB > m_SeqDB
CSeqDB for BLAST db.
vector< string > m_KmerFiles
Name of the kmer files.
void x_ProcessQuery(const string &query_seq, SOneBlastKmerSearch &kmerSearch, const SBlastKmerParameters &kmerParams, uint32_t *a, uint32_t *b, vector< vector< int > > &kvalues, vector< int > badMers)
Preprocess query to sequence hashes.
GI list containing the intersection of two other lists of GIs.
Access data in Minhash files.
void GetBadMers(vector< int > &badMers) const
Overrepresented KMERs.
int GetNumHashes(void) const
Returns the number of values in an array of hashes (probably 32)
uint64_t * GetLSHArray(void) const
int GetVersion(void) const
int GetNumSeqs(void) const
uint32_t * GetRandomNumbers(void) const
int GetChunkSize(void) const
Get number of letters in a chunk (version 3 or higher)
int GetKmerSize(void) const
Returns the length of the KMER.
int GetSegStatus(void) const
int GetAlphabet(void) const
One of two alphabets from Shiryev et al.
unsigned char * GetKValues(void) const
LSH points for Buhler approach.
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
void GetGis(int oid, vector< TGi > &gis, bool append=false) const
Gets a list of GIs for an OID.
const string & GetDBNameList() const
Get list of database names.
Class for the messages for an individual query sequence.
size_t GetNumberOfThreads(void) const
Accessor for the number of threads to use.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
void SetCoding(TCoding coding)
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty -- pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty -- not pointing to any object, which means having a null value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
@ e_Ncbistdaa
consecutive codes for std aas
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
constexpr auto sort(_Init &&init)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Structure for ancillary data on KMER search.
int jd_count
How often was the Jaccard distance calculated.
int total_matches
How many matches returned.
int num_sequences
Number of database sequences considered (in this volume)
int oids_considered
How many OIDs were considered as candidates.
int hit_count
How many hits to the hash array were there?
int jd_oid_count
How many OIDs was the Jaccard distance calculated for.
int version
Version of index used (0 indicates default).
int chunkSize
size of a query chunk to process (default is 150).
int numHashes
Number of hash functions per signature.
int samples
Number of samples made of the query signature.
int rowsPerBand
Number of values sampled from signature.
int alphabetChoice
15 or 10 letter alphabet (0 for 15, 1 for 10).
int kmerNum
number of letters in KMER.
vector< TBlastKmerPrelimScoreVector > scoreVector
Scores for one query.
EBlastSeverity severity
Error or warning (only use if status is non-zero).
int status
Status of the query (0 is good, otherwise an error has occurred)
vector< vector< uint32_t > > queryLSHHash
LSH Hashes for one query (multiple chunks)
vector< BlastKmerStats > kmerStatsVector
Stats for one query.
vector< vector< uint32_t > > queryHash
Hashes for one query (multiple chunks)
CRef< CSeq_id > qSeqid
Seqid of the query.
string errDescription
Error description.
Structure to represent a single sequence to be fed to BLAST.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4