: m_NumHashFct(numHashFct),
117m_KmerSize(kmerSize),
122m_Alphabet(alphabet),
124m_ChunkSize(chunkSize)
131m_NumBands = m_NumHashFct/m_RowsPerBand;
141 return( (
a*x +
b) % p );
148 const Uint4fnv_prime = 16777619u;
149 const Uint4fnv_offset_basis = 2166136261u;
154 key[0] = num & 0xff;
155 key[1] = (num >> 8) & 0xff;
156 key[2] = (num >> 16) & 0xff;
157 key[3] = (num >> 24) & 0xff;
160 hash= fnv_offset_basis;
161 for(
i= 0;
i< 4;
i++) {
172vector < vector < vector < uint32_t > > > & seq_hash,
184 intfullOID=q_oid+oidOffset;
186vector<TSeqRange> range_v;
188seq_hash[q_oid].resize(chunk_num);
193 boolfirst_time=
true;
195 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
200 if(seq_kmer.
empty())
206vector<uint32_t> idx_tmp(num_hashes);
207vector<uint32_t> hash_tmp(num_hashes);
209 for(
inth=0;h<num_hashes;h++)
211hash_tmp[h]=0xffffffff;
212idx_tmp[h]=0xffffffff;
219 for(
inth=0;h<num_hashes;h++)
223 if(hashval < hash_tmp[h])
225hash_tmp[h] = hashval;
231 if(first_time ==
false)
244seq_hash[q_oid][chunk_counter].resize(num_hashes+1);
245 for(h=0;h<num_hashes;h++)
246seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];
250seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;
252seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;
256 if(chunk_num > chunk_counter+1)
257seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());
258 if(first_time ==
true)
259seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());
266vector < vector < vector < uint32_t > > > & seq_hash,
276 intfullOID=q_oid+oidOffset;
278vector<TSeqRange> range_v;
280seq_hash[q_oid].resize(chunk_num);
285 boolfirst_time=
true;
287 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
292 if(seq_kmer.
empty())
298vector<uint32_t> hash_values;
299vector<uint32_t> idx_tmp(num_hashes);
305hash_values.push_back(hashval);
308 if(hash_values.size() <
static_cast<size_t>(num_hashes))
310 intrem = 1 + num_hashes -
static_cast<int>(hash_values.size());
312 for(
int i=0;
i<rem;
i++)
313hash_values.push_back(hashval);
315 std::sort(hash_values.begin(), hash_values.end());
317 for(
int i=0;
i<num_hashes;
i++)
318idx_tmp[
i] = hash_values[
i];
320 if(first_time ==
false)
333seq_hash[q_oid][chunk_counter].resize(num_hashes+1);
334 for(h=0;h<num_hashes;h++)
335seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];
339seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;
341seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;
345 if(chunk_num > chunk_counter+1)
346seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());
347 if(first_time ==
true)
348seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());
354 intq_oid,
intnumBands,
intnumRows,
int& total_chunks,
uint8_t* uniqueHash)
357 intnum_chunks =
static_cast<int>(seq_hash[q_oid].size());
358 for(
int n=0;
n<num_chunks;
n++)
360 for(
int b=0;
b<numBands;
b++)
362 unsigned char key[9];
363 for(
int r=0;
r<numRows;
r++)
365 key[
r*4] = (seq_hash[q_oid][
n][
b*numRows+
r]) & 0xff;
366 key[1+
r*4] = ((seq_hash[q_oid][
n][
b*numRows+
r]) >> 8) & 0xff;
367 key[2+
r*4] = ((seq_hash[q_oid][
n][
b*numRows+
r]) >> 16) & 0xff;
368 key[3+
r*4] = ((seq_hash[q_oid][
n][
b*numRows+
r]) >> 24) & 0xff;
370 key[8] = (
unsignedchar)
b;
374lsh[foo].push_back(total_chunks);
382 intq_oid,
intnum_k,
intnum_l,
intarray_size,
int& total_chunks,
uint8_t* uniqueHash,
383vector < vector <int> >& kvector)
387vector<unsigned char>
key(
max);
388 intnum_chunks=
static_cast<int>(seq_hash[q_oid].size());
391 for(
int n=0;
n<num_chunks;
n++)
393 for(
int r=0;
r<num_l;
r++)
395 for(
int i=0;
i<num_k;
i++)
397temp_index = kvector[
r][
i];
398temp_hash = seq_hash[q_oid][
n][temp_index];
399 key[
i*4] = (temp_hash) & 0xff;
400 key[1+
i*4] = ((temp_hash) >> 8) & 0xff;
401 key[2+
i*4] = ((temp_hash) >> 16) & 0xff;
402 key[3+
i*4] = ((temp_hash) >> 24) & 0xff;
408lsh[foo].push_back(total_chunks);
416 intq_oid,
intnumHashes,
intnumRows,
int& total_chunks,
uint8_t* uniqueHash)
419 intnum_chunks =
static_cast<int>(seq_hash[q_oid].size());
420 intnumHashMax = numHashes - numRows + 1;
422 for(
int n=0;
n<num_chunks;
n++)
424 for(
int b=0;
b<numHashMax;
b++)
426 unsigned char key[12];
427 for(
int r=0;
r<numRows;
r++)
429 key[
r*4] = (seq_hash[q_oid][
n][
b+
r]) & 0xff;
430 key[1+
r*4] = ((seq_hash[q_oid][
n][
b+
r]) >> 8) & 0xff;
431 key[2+
r*4] = ((seq_hash[q_oid][
n][
b+
r]) >> 16) & 0xff;
432 key[3+
r*4] = ((seq_hash[q_oid][
n][
b+
r]) >> 24) & 0xff;
437lsh[foo].push_back(total_chunks);
439 for(
int b=0;
b<numHashMax-1;
b++)
441 unsigned char key[8];
442 for(
int r=0;
r<numRows;
r++)
444temp_hash = seq_hash[q_oid][
n][
b+2*
r];
445 key[
r*4] = (temp_hash) & 0xff;
446 key[1+
r*4] = (temp_hash >> 8) & 0xff;
447 key[2+
r*4] = (temp_hash >> 16) & 0xff;
448 key[3+
r*4] = (temp_hash >> 24) & 0xff;
453lsh[foo].push_back(total_chunks);
465vector<string> paths;
468vector<TSeqRange> range_vec;
469vector<string> volname_vec;
471 for(vector<string>::iterator iter=paths.begin(); iter != paths.end(); ++iter)
476 stringvolName = base + ext;
477volname_vec.push_back(volName);
482range.
SetTo(oid_offset);
483range_vec.push_back(range);
487 intnumVols =
static_cast<int>(paths.size());
488 #pragma omp parallel for num_threads(numThreads) 489 for(
intindex=0; index<numVols; index++)
491 x_BuildIndex(volname_vec[index], range_vec[index].GetFrom(), range_vec[index].GetTo());
499 char* loadBadMers = getenv(
"LOADBADMERS");
510badMers.push_back(badKmer);
511cerr << badKmer <<
'\n';
515 char* noBadMers = getenv(
"NOBADMERS");
517 returnvector<int>();
521 const intkLength=10;
522 int array[] = {139810, 69905, 70161, 70177, 74257,
52369921, 69906, 74001, 135441, 69922};
528 returnvector<int>();
537 intvectorRandNums=0;
548 stringindexFile = name +
".pki";
549 stringdataFile = name +
".pkd";
561num_seqs = stop - start;
566index_file.write((
char*) &(
m_Version), 4);
567index_file.write((
char*) &(num_seqs), 4);
569index_file.write((
char*) &(
m_Samples), 4);
570index_file.write((
char*) &(
m_KmerSize), 4);
572index_file.write((
char*) &(
m_Compress), 4);
573index_file.write((
char*) &(
m_Alphabet), 4);
574index_file.write((
char*) &(StartLSH), 4);
575index_file.write((
char*) &(kSizeLSH), 4);
585 for(
intq_oid=0;q_oid<3*num_seqs;q_oid++)
592vector < vector < vector < uint32_t > > > seq_hash(num_seqs);
595 for(
intq_oid=0;q_oid<num_seqs;q_oid++)
625 const uint32_tkUniqueHash = 0x1000000;
627 for(
uint32_tindex=0; index<kUniqueHash; index++)
628uniqueHash[index] = 0;
630vector< vector<uint32_t> > lsh(kSizeLSH);
633vector < vector <int> > kvector;
638 for(
intq_oid=0;q_oid<num_seqs;q_oid++)
645total_chunks, uniqueHash, kvector);
653 for(
uint32_tindex=0; index<kUniqueHash; index++)
655 if(uniqueHash[index] > 0)
658 delete[] uniqueHash;
662 for(
intindex=0; index<kSizeLSH-1; index++)
664LSHMatchSize += lsh[index].size();
668 const uint64_tkLSHMatchEnd = 4*LSHMatchSize + StartLSH + 8*kSizeLSH;
669index_file.write((
char*) &(kLSHMatchEnd), 8);
676 const intkFutureUse=0;
677index_file.write((
char*) &(kFutureUse), 4);
679index_file.write((
char*) &(kFutureUse), 4);
686index_file.write((
char*) &(
a[
i]), 4);
688index_file.write((
char*) &(
b[
i]), 4);
693 intnum=
static_cast<int>(badMers.size());
696index_file.write((
char*) &(num), 4);
697 for(vector<int>::iterator iter=badMers.begin(); iter != badMers.end() &&
i<2*
m_NumHashFct; ++iter, ++
i)
698index_file.write((
char*) &(*iter), 4);
703index_file.write((
char*) &(kZero), 4);
713vector<int> temp = kvector[
i];
715index_file.write((
char*) &(temp[j]), 1);
720 for(
int i=0;
i<extra;
i++)
721index_file.write((
char*) &(temp), 1);
725 uint64_tlsh_offset = StartLSH + 8*kSizeLSH;
727 for(
intindex=0; index<kSizeLSH-1; index++)
729 if(lsh[index].
size() == 0)
730index_file.write((
char*) &(kNoValue), 8);
732index_file.write((
char*) &(lsh_offset), 8);
733lsh_offset += 4*(lsh[index].size());
735index_file.write((
char*) &(lsh_offset), 8);
738 for(
intindex=0; index<kSizeLSH-1; index++)
740 for(vector<uint32_t>::iterator
i=lsh[index].begin();
i!= lsh[index].end(); ++
i)
742index_file.write((
char*) &(*i), 4);
767 for(
intq_oid=0;q_oid<num_seqs;q_oid++)
769 intnum_chunks =
static_cast<int>(seq_hash[q_oid].size());
770 for(
int n=0;
n<num_chunks;
n++)
772vector<uint32_t> tmp_hash;
778tmp_hash.push_back(hash_val);
784tmp_hash.push_back(hash_val);
790tmp_hash.push_back(seq_hash[q_oid][
n][
b]);
794 std::sort(tmp_hash.begin(), tmp_hash.end());
797data_file.write((
char*) &(tmp_hash[
b]), width);
799data_file.write((
char*) &(seq_hash[q_oid][
n][
m_NumHashFct]), 4);
static void s_Get_LSH_index_hashes5(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numHashes, int numRows, int &total_chunks, uint8_t *uniqueHash)
uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)
static Uint4 FNV_hash(uint32_t num)
FNV Hash. See http://www.isthe.com/chongo/tech/comp/fnv/index.html.
void s_MinhashSequences(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, const uint32_t *a, const uint32_t *b, bool do_seg, int kmerNum, int oidOffset, int alphabetChoice, int version, int chunkSize)
static void s_Get_LSH_index_hashes2(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int num_k, int num_l, int array_size, int &total_chunks, uint8_t *uniqueHash, vector< vector< int > > &kvector)
static void s_Get_LSH_index_hashes(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numBands, int numRows, int &total_chunks, uint8_t *uniqueHash)
void s_MinhashSequences2(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, int kmerNum, int oidOffset, int alphabetChoice, int version, vector< int > badMers, int chunkSize)
vector< int > s_BlastKmerLoadBadMers(int alphabet)
int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
Calculates the number of differences between two minhash arrays.
set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
Get KMERs for a given sequence using a compressed alphabet.
void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)
Get the random numbers for the hash function.
set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
Get KMERs for a given sequence using a compressed alphabet.
int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int chunkSize)
Breaks a sequence up into chunks if the sequence is above a certain length.
void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
Function to get the k sites to compare for Buhler LSH.
CRef< CSeqDB > m_SeqDB
Residues in kmer.
int m_ChunkSize
version of index file
int m_KmerSize
Number of rows per band.
int m_Compress
Number of samples (Buhler only)
bool m_DoSeg
BLAST database.
int m_Version
0 for 15 letters, 1 for 10 letters.
int m_Samples
Should Seg be run on sequences.
void x_BuildIndex(string &name, int start=0, int number=0)
Build index for an individual BLAST volume.
void Build(int numThreads=1)
Build the index.
int m_RowsPerBand
Number of LSH bands.
int m_NumBands
Number of hash functions.
int m_Alphabet
Compress the arrays for Jaccard matches.
void x_WriteDataFile(vector< vector< vector< uint32_t > > > &seq_hash, int num_seqs, CNcbiOfstream &data_file)
Writes out the data file.
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
void GetSequenceAsString(int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const
Get a sequence in a given encoding.
const string & GetDBNameList() const
Get list of database names.
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
int GetNumSeqs() const
Returns the number of sequences available.
void SetNumberOfThreads(int num_threads, bool force_mt=false)
Setting the number of threads.
const_iterator begin() const
parent_type::iterator iterator
const_iterator end() const
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
void SetFrom(TFrom value)
Assign a value to From data member.
void SetTo(TTo value)
Assign a value to To data member.
#define KMER_LSH_ARRAY_SIZE
constexpr auto sort(_Init &&init)
const string version
version string
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
uint32_t do_pearson_hash(unsigned char *key, int length)
uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)
Pearson hash an integer into two bytes.
unsigned char pearson_hash_int2byte(uint32_t input, int seed1)
Pearson hash an integer into one byte.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4