( (
a*x +
b) % p );
66 const Uint4fnv_prime = 16777619u;
67 const Uint4fnv_offset_basis = 2166136261u;
73 key[1] = (num >> 8) & 0xff;
74 key[2] = (num >> 16) & 0xff;
75 key[3] = (num >> 24) & 0xff;
78 hash= fnv_offset_basis;
79 for(
i= 0;
i< 4;
i++) {
97 for(
inth=0;h<num_hashes;h++)
103 return(
double) score / num_hashes;
117 for(
inth=0;h<num_hashes;h++)
119 while(bindex < num_hashes &&
a[h] >
b[bindex])
123 if(bindex == num_hashes)
125 if(
a[h] ==
b[bindex])
129 return(
double) score / num_hashes;
137vector<Uint1> trans_table;
140 intseq_length =
static_cast<int>(query_sequence.length());
141 const char*
query= query_sequence.c_str();
144 if(seq_length < kmerNum)
150 for(
int i=0;
i<seq_length-kmerNum+1;
i++)
155 for(
intkindex=0; kindex<kmerNum; kindex++)
157 if(
query[
i+kindex] == 21)
162index = (index << 4);
163index += trans_table[
query[
i+kindex]];
169 stringkmer_str=
"";
175kmerCountPriv[kmer_str]++;
177 if(index != 0 &&
i<seq_length-kmerNum &&
query[
i+kmerNum] != 21)
179 intindex_short = index;
180index = (index << 4);
181index += trans_table[
query[
i+kmerNum]];
182 stringkmer_str=
"";
184kmerCountPlusPriv[kmer_str]++;
191kmerCount[(*i).first]++;
193kmerCountPlus[(*i).first]++;
198kmerCount[(*i).first] += (*i).second;
200kmerCountPlus[(*i).first] += (*i).second;
211vector<Uint1> trans_table;
214 intseq_length =
static_cast<int>(query_sequence.length());
215 const char*
query= query_sequence.c_str();
218 if(seq_length < kmerNum)
222 char*query_private=(
char*)
malloc(chunk_length);
224 for(
unsigned int i=range.
GetFrom();
i<=range.
GetTo();
i++, private_index++)
225query_private[private_index] =
query[
i];
232 SeqBufferSeg((
unsigned char*)query_private, chunk_length, 0, sp, &seq_locs);
238 for(
int i=itr->ssr->left; i <= itr->ssr->right;
i++)
239query_private[
i]=21;
245 for(
int i=0;
i<chunk_length-kmerNum+1;
i++)
250 for(
intkindex=0; kindex<kmerNum; kindex++)
252 if(query_private[
i+kindex] == 21)
257index = (index << 4);
258index += trans_table[query_private[
i+kindex]];
265 free(query_private);
275vector<Uint1> trans_table;
278 intseq_length =
static_cast<int>(query_sequence.length());
279 const char*
query= query_sequence.c_str();
282 if(seq_length < kmerNum)
286 char*query_private=(
char*)
malloc(chunk_length);
288 for(
unsigned int i=range.
GetFrom();
i<=range.
GetTo();
i++, private_index++)
289query_private[private_index] =
query[
i];
291 for(
int i=0;
i<chunk_length-kmerNum+1;
i++)
296 for(
intkindex=0; kindex<kmerNum; kindex++)
298 if(query_private[
i+kindex] == 21)
303index = (index << 4);
304index += trans_table[query_private[
i+kindex]];
308 if(
i< chunk_length-kmerNum && !badMers.empty())
310std::vector<int>::iterator it;
311it = std::find(badMers.begin(), badMers.end(), index);
312 if(it != badMers.end() &&
i< chunk_length-1)
314index = (index << 4);
315index += trans_table[query_private[
i+kmerNum]];
323 free(query_private);
332 const unsigned intkAlphabetLen = 28;
335 const char* kCompAlphabets[] = {
337 "ST IJV LM KR EQZ A G BD P N F Y H C W",
339 "IJLMV AST BDENZ KQR G FY P H C W" 343 const char* trans_string = kCompAlphabets[alphabetChoice];
345 Uint4compressed_letter = 1;
347trans_table.resize(kAlphabetLen + 1, 0);
348 for(
Uint4 i= 0;
i< strlen(trans_string);
i++) {
352 else if(
isalpha(trans_string[
i])) {
355 _ASSERT(aa_letter < trans_table.size());
357trans_table[aa_letter] = compressed_letter;
366 const intkOverlap=50;
376numChunks =
MAX(1, (length - kOverlap)/(
ChunkSize- kOverlap));
377newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;
381newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;
386 for(
int i=0;
i<numChunks;
i++)
393range_v.push_back(range);
403 intnum_hashes =
static_cast<int>(minhash1.size());
405 for(
intindex=0; index<num_hashes; index++)
407 if(minhash1[index] != minhash2[index])
416vector < vector <uint32_t> >& seq_hash,
425 boolkmersFound=
false;
426 intseq_length =
static_cast<int>(
query.length());
427vector<TSeqRange> range_v;
429seq_hash.resize(chunk_num);
430 boolseg = (do_seg > 0) ?
true:
false;
432vector<uint32_t> idx_tmp(num_hashes);
433vector<uint32_t> hash_tmp(num_hashes);
435 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
438seq_hash[chunk_iter].resize(num_hashes);
442 if(seq_kmer.
empty())
447 for(
inth=0;h<num_hashes;h++)
449hash_tmp[h]=0xffffffff;
450idx_tmp[h]=0xffffffff;
457 for(
inth=0;h<num_hashes;h++)
461 if(hashval < hash_tmp[h])
463hash_tmp[h] = hashval;
470 for(
inth=0;h<num_hashes;h++)
472seq_hash[chunk_iter][h] = idx_tmp[h];
480vector < vector <uint32_t> >& seq_hash,
487 boolkmersFound=
false;
488 intseq_length =
static_cast<int>(
query.length());
489vector<TSeqRange> range_v;
491seq_hash.resize(chunk_num);
493vector<uint32_t> hash_values;
495 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
498seq_hash[chunk_iter].resize(numHashes);
502 if(seq_kmer.
empty())
511hash_values.push_back(hashval);
515 if(hash_values.size() <
static_cast<size_t>(numHashes))
517 intrem = 1 + numHashes -
static_cast<int>(hash_values.size());
519 for(
int i=0;
i<rem;
i++)
520hash_values.push_back(hashval);
522 std::sort(hash_values.begin(), hash_values.end());
525 for(
inth=0;h<numHashes;h++)
526seq_hash[chunk_iter][h] = hash_values[h];
533vector < vector <uint32_t> >&lsh_hash_vec,
537 intnum_chunks=
static_cast<int>(query_hash.size());
539 for(
int n=0;
n<num_chunks;
n++)
541vector <uint32_t> lsh_chunk_vec;
542 for(
int b=0;
b<num_bands;
b++)
546 unsigned char key[9];
547 for(
int r=0;
r<rows_per_band;
r++)
549temp_hash = query_hash[
n][
b*rows_per_band+
r];
550 key[
r*4] = (temp_hash) & 0xff;
551 key[1+
r*4] = ((temp_hash) >> 8) & 0xff;
552 key[2+
r*4] = ((temp_hash) >> 16) & 0xff;
553 key[3+
r*4] = ((temp_hash) >> 24) & 0xff;
555 key[8] = (
unsignedchar)
b;
557lsh_chunk_vec.push_back(foo);
559 std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
560lsh_hash_vec.push_back(lsh_chunk_vec);
567vector < vector <uint32_t> >&lsh_hash_vec,
571 intnum_chunks=
static_cast<int>(query_hash.size());
573 intnumHashMax = numHashes - numRows + 1;
574 for(
int n=0;
n<num_chunks;
n++)
576vector <uint32_t> lsh_chunk_vec;
577 for(
int b=0;
b<numHashMax;
b++)
580 unsigned char key[8];
581 for(
int r=0;
r<numRows;
r++)
583temp_hash = query_hash[
n][
b+
r];
584 key[
r*4] = (temp_hash) & 0xff;
585 key[1+
r*4] = ((temp_hash) >> 8) & 0xff;
586 key[2+
r*4] = ((temp_hash) >> 16) & 0xff;
587 key[3+
r*4] = ((temp_hash) >> 24) & 0xff;
590lsh_chunk_vec.push_back(foo);
592 for(
int b=0;
b<numHashMax-1;
b++)
594 unsigned char key[8];
595 for(
int r=0;
r<numRows;
r++)
597temp_hash = query_hash[
n][
b+2*
r];
598 key[
r*4] = (temp_hash) & 0xff;
599 key[1+
r*4] = ((temp_hash) >> 8) & 0xff;
600 key[2+
r*4] = ((temp_hash) >> 16) & 0xff;
601 key[3+
r*4] = ((temp_hash) >> 24) & 0xff;
604lsh_chunk_vec.push_back(foo);
606 std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
607lsh_hash_vec.push_back(lsh_chunk_vec);
617 for(
int i=0;
i<numHashes;
i++)
623 while(
a[
i] == 0);
629 void GetKValues(vector< vector <int> >& kvector,
intk_value,
intl_value,
intarray_size)
634 for(
int i=0;
i<l_value;
i++)
637 for(
intk=0; k<k_value; k++)
640ktemp.push_back(temp);
642kvector.push_back(ktemp);
652vector < vector <uint32_t> >& lsh_hash_vec,
655vector< vector<int> >& kvector)
658vector<unsigned char>
key(
max);
659 intnum_chunks=
static_cast<int>(query_hash.size());
662 for(
int n=0;
n<num_chunks;
n++)
664vector <uint32_t> lsh_chunk_vec;
665 for(
int r=0;
r<num_l;
r++)
667 for(
int i=0;
i<num_k;
i++)
669temp_index=kvector[
r][
i];
670temp_hash = query_hash[
n][temp_index];
671 key[
i*4] = (temp_hash) & 0xff;
672 key[1+
i*4] = ((temp_hash) >> 8) & 0xff;
673 key[2+
i*4] = ((temp_hash) >> 16) & 0xff;
674 key[3+
i*4] = ((temp_hash) >> 24) & 0xff;
678lsh_chunk_vec.push_back(foo);
680 std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
681lsh_hash_vec.push_back(lsh_chunk_vec);
691 intnum=
static_cast<int>(query_LSH_hash.size());
692 for(
int i=0;
i<num;
i++)
694 for(vector<uint32_t>::const_iterator iter=query_LSH_hash[
i].begin(); iter != query_LSH_hash[
i].end(); ++iter)
696 if(lsh_array[*iter] != 0)
697candidates[
i].insert(*iter);
709 intnum_chunks=
static_cast<int>(query_hash.size());
711 for(
int n=0;
n<num_chunks;
n++)
713vector<uint32_t> tmp_hash;
714 for(vector<uint32_t>::const_iterator
i= query_hash[
n].begin();
i!= query_hash[
n].end(); ++
i)
726tmp_hash.push_back(0xffff);
728tmp_hash.push_back(0xff);
730tmp_hash.push_back(kBig);
733tmp_hash.push_back(hash_value);
736 std::sort(tmp_hash.begin(), tmp_hash.end());
737query_hash_hash.push_back(tmp_hash);
755 intnum_chunks=
static_cast<int>(query_hash.size());
757vector< vector<uint32_t> > candidate_oids;
758candidate_oids.resize(num_chunks);
760 for(
intnum=0; num<num_chunks; num++)
768 while(lsh[index] == 0)
774 while(index < remaining)
776 intoid = *(oid_offset+index);
777candidate_oids[num].push_back(oid);
784 for(
intindex=0; index<num_chunks; index++)
786kmer_stats.
hit_count+= candidates[index].size();
790vector < vector <uint32_t> > query_hash_hash;
793vector <uint32_t> subject_hash;
794subject_hash.resize(num_hashes);
799 for(
int n=0;
n<num_chunks;
n++)
801 std::sort(candidate_oids[
n].begin(), candidate_oids[
n].end());
804 for(vector<uint32_t>::iterator
i= candidate_oids[
n].begin();
i!= candidate_oids[
n].end(); ++
i)
806 if(last_oid == *
i)
809 if(min_hits != oid_count)
824mhfile.
GetMinHits(oid, subject_oid, subject_hash);
825 doublecurrent_score=0;
827current_score =
estimate_jaccard(query_hash_hash[
n], subject_hash, num_hashes);
831 if(current_score < thresh)
835 if(!score_map.count(subject_oid))
836score_map.
insert(score_mapValType(subject_oid, current_score));
837 else if(current_score > score_map[subject_oid])
838score_map[subject_oid] = current_score;
845 doubletotal_score = (*i).second;
846 if(total_score > thresh)
848score_vector.push_back((*
i));
883 for(
i=0;
i<lshSize;
i++)
885 if(lsh_array[
i] > 0)
894vector<uint32_t> hits;
901 if(hits.size() !=
static_cast<size_t>(num_hashes))
909 for(vector<uint32_t>::iterator iter=hits.begin(); iter != hits.end(); ++iter)
913error_msg =
"Signature array for version 3 index not in ascending order in volume "+
NStr::NumericToString(volume);
926vector<string> kmerFiles;
928 intnumFiles =
static_cast<int>(kmerFiles.size());
929 for(
int i=0;
i<numFiles;
i++)
935 catch(
constblast::CMinHashException& e) {
936error_msg = e.GetMsg();
940error_msg = e.GetMsg();
944error_msg =
"Unknown error";
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
BLAST filtering functions.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Int2 SeqBufferSeg(Uint1 *sequence, Int4 length, Int4 offset, SegParameters *sparamsp, BlastSeqLoc **seg_locs)
Runs seg on a protein sequence in ncbistdaa.
SegParameters * SegParametersNewAa(void)
Allocated SeqParameter struct for proteins and fills with default values.
void SegParametersFree(SegParameters *sparamsp)
Free SegParameters structure.
static int s_BlastKmerVerifyVolume(CMinHashFile &mhfile, string &error_msg, int volume)
uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)
void BlastKmerGetCompressedTranslationTable(vector< Uint1 > &trans_table, int alphabetChoice)
Creates translation table for compressed alphabets.
int BlastKmerVerifyIndex(CRef< CSeqDB > seqdb, string &error_msg)
double estimate_jaccard(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int ChunkSize)
Breaks a sequences up into chunks if the sequences is above a certain length.
int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
Calculates the number of differences between two minhash arrays.
void get_LSH_hashes(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
void get_LSH_hashes5(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
Gets the LSH hash for one hash function.
set< uint32_t > BlastKmerGetKmerSetStats(const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery)
Simplified version of BlastKmerGetKmerSet.
DEFINE_STATIC_MUTEX(randMutex)
set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
Get KMERs for a given sequence using a compressed alphabet.
bool minhash_query2(const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
Hash the query for the minimum values;.
static uint32_t FNV_hash(uint32_t num)
FNV hash, see http://www.isthe.com/chongo/tech/comp/fnv/index.html.
void s_HashHashQuery(const vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &query_hash_hash, int compress, int version)
void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)
Get the random numbers for the hash function.
set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
Get KMERs for a given sequence using a compressed alphabet.
void neighbor_query(const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVer)
double estimate_jaccard2(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
bool minhash_query(const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
void get_LSH_match_from_hash(const vector< vector< uint32_t > > &query_LSH_hash, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)
void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
Function to get the k sites to compare for Buhler LSH.
void get_LSH_hashes2(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kvector)
vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector
Vector of pairs of database OIDs and scores.
Access data in Minhash files.
int GetLSHSize(void) const
uint32_t * GetMinHits(int oid) const
int GetNumHashes(void) const
Returns the number of values in an array of hashes (probably 32)
uint64_t * GetLSHArray(void) const
int GetVersion(void) const
int GetNumSeqs(void) const
int GetDataWidth(void) const
int GetKmerSize(void) const
Returns the length of the KMER.
int * GetHits(uint64_t offset) const
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
const_iterator begin() const
const_iterator end() const
iterator_bool insert(const value_type &val)
iterator_bool insert(const value_type &val)
const_iterator begin() const
parent_type::iterator iterator
const_iterator end() const
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
unsigned int TSeqPos
Type for sequence locations and lengths.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
position_type GetLength(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
unsigned int
A callback function used to compare two keys in a database.
pair< int, int > ChunkSize(const CSpliced_exon_chunk &chunk)
constexpr auto sort(_Init &&init)
const string version
version string
const struct ncbi::grid::netcache::search::fields::KEY key
#define MIN(a, b)
returns smaller of a and b.
#define MAX(a, b)
returns larger of a and b.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
uint32_t do_pearson_hash(unsigned char *key, int length)
uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)
Pearson hash an integer into two bytes.
unsigned char pearson_hash_int2byte(uint32_t input, int seed1)
Pearson hash an integer into one byte.
static size_t read_size(CNcbiIstream &stream, const char *name)
Defines BLAST database access classes.
Structure for ancillary data on KMER search.
int jd_count
How often was the Jaccard distance calculated.
int total_matches
How many matches returned.
int oids_considered
How many OIDs were considered as candidates.
int hit_count
How many hits to the hash array were there?
int jd_oid_count
How many OIDs was the Jaccard distance calculated for.
Used to hold a set of positions, mostly used for filtering.
struct BlastSeqLoc * next
next in linked list
Structure to hold parameters for seg search.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4