RetroSearch Browse

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Showing content from http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/blastkmerindex_8cpp_source.html below:

NCBI C++ ToolKit: src/algo/blast/proteinkmer/blastkmerindex.cpp Source File

116

: m_NumHashFct(numHashFct),

117

m_KmerSize(kmerSize),

122

m_Alphabet(alphabet),

124

m_ChunkSize(chunkSize)

131

m_NumBands = m_NumHashFct/m_RowsPerBand;

141 return

( (

*x +

) % p );

148 const Uint4

fnv_prime = 16777619u;

149 const Uint4

fnv_offset_basis = 2166136261u;

154 key

[0] = num & 0xff;

155 key

[1] = (num >> 8) & 0xff;

156 key

[2] = (num >> 16) & 0xff;

157 key

[3] = (num >> 24) & 0xff;

160 hash

= fnv_offset_basis;

161 for

(

= 0;

< 4;

++) {

172

vector < vector < vector < uint32_t > > > & seq_hash,

184 int

fullOID=q_oid+oidOffset;

186

vector<TSeqRange> range_v;

188

seq_hash[q_oid].resize(chunk_num);

193 bool

first_time=

true

;

195 for

(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)

200 if

(seq_kmer.

empty

())

206

vector<uint32_t> idx_tmp(num_hashes);

207

vector<uint32_t> hash_tmp(num_hashes);

209 for

(

int

h=0;h<num_hashes;h++)

211

hash_tmp[h]=0xffffffff;

212

idx_tmp[h]=0xffffffff;

219 for

(

int

h=0;h<num_hashes;h++)

223 if

(hashval < hash_tmp[h])

225

hash_tmp[h] = hashval;

231 if

(first_time ==

false

)

244

seq_hash[q_oid][chunk_counter].resize(num_hashes+1);

245 for

(h=0;h<num_hashes;h++)

246

seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];

250

seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;

252

seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;

256 if

(chunk_num > chunk_counter+1)

257

seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());

258 if

(first_time ==

true

)

259

seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());

266

vector < vector < vector < uint32_t > > > & seq_hash,

276 int

fullOID=q_oid+oidOffset;

278

vector<TSeqRange> range_v;

280

seq_hash[q_oid].resize(chunk_num);

285 bool

first_time=

true

;

287 for

(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)

292 if

(seq_kmer.

empty

())

298

vector<uint32_t> hash_values;

299

vector<uint32_t> idx_tmp(num_hashes);

305

hash_values.push_back(hashval);

308 if

(hash_values.size() <

static_cast<size_t>

(num_hashes))

310 int

rem = 1 + num_hashes -

static_cast<int>

(hash_values.size());

312 for

(

int i

=0;

<rem;

++)

313

hash_values.push_back(hashval);

315 std::sort

(hash_values.begin(), hash_values.end());

317 for

(

int i

=0;

<num_hashes;

++)

318

idx_tmp[

] = hash_values[

];

320 if

(first_time ==

false

)

333

seq_hash[q_oid][chunk_counter].resize(num_hashes+1);

334 for

(h=0;h<num_hashes;h++)

335

seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];

339

seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;

341

seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;

345 if

(chunk_num > chunk_counter+1)

346

seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());

347 if

(first_time ==

true

)

348

seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());

354 int

q_oid,

int

numBands,

int

numRows,

int

& total_chunks,

uint8_t

* uniqueHash)

357 int

num_chunks =

static_cast<int>

(seq_hash[q_oid].size());

358 for

(

int n

=0;

<num_chunks;

++)

360 for

(

int b

=0;

<numBands;

++)

362 unsigned char key

[9];

363 for

(

int r

=0;

<numRows;

++)

365 key

[

*4] = (seq_hash[q_oid][

][

*numRows+

]) & 0xff;

366 key

[1+

*4] = ((seq_hash[q_oid][

][

*numRows+

]) >> 8) & 0xff;

367 key

[2+

*4] = ((seq_hash[q_oid][

][

*numRows+

]) >> 16) & 0xff;

368 key

[3+

*4] = ((seq_hash[q_oid][

][

*numRows+

]) >> 24) & 0xff;

370 key

[8] = (

unsigned

char)

;

374

lsh[foo].push_back(total_chunks);

382 int

q_oid,

int

num_k,

int

num_l,

int

array_size,

int

& total_chunks,

uint8_t

* uniqueHash,

383

vector < vector <int> >& kvector)

387

vector<unsigned char>

key

(

max

);

388 int

num_chunks=

static_cast<int>

(seq_hash[q_oid].size());

391 for

(

int n

=0;

<num_chunks;

++)

393 for

(

int r

=0;

<num_l;

++)

395 for

(

int i

=0;

<num_k;

++)

397

temp_index = kvector[

][

];

398

temp_hash = seq_hash[q_oid][

][temp_index];

399 key

[

*4] = (temp_hash) & 0xff;

400 key

[1+

*4] = ((temp_hash) >> 8) & 0xff;

401 key

[2+

*4] = ((temp_hash) >> 16) & 0xff;

402 key

[3+

*4] = ((temp_hash) >> 24) & 0xff;

408

lsh[foo].push_back(total_chunks);

416 int

q_oid,

int

numHashes,

int

numRows,

int

& total_chunks,

uint8_t

* uniqueHash)

419 int

num_chunks =

static_cast<int>

(seq_hash[q_oid].size());

420 int

numHashMax = numHashes - numRows + 1;

422 for

(

int n

=0;

<num_chunks;

++)

424 for

(

int b

=0;

<numHashMax;

++)

426 unsigned char key

[12];

427 for

(

int r

=0;

<numRows;

++)

429 key

[

*4] = (seq_hash[q_oid][

][

]) & 0xff;

430 key

[1+

*4] = ((seq_hash[q_oid][

][

]) >> 8) & 0xff;

431 key

[2+

*4] = ((seq_hash[q_oid][

][

]) >> 16) & 0xff;

432 key

[3+

*4] = ((seq_hash[q_oid][

][

]) >> 24) & 0xff;

437

lsh[foo].push_back(total_chunks);

439 for

(

int b

=0;

<numHashMax-1;

++)

441 unsigned char key

[8];

442 for

(

int r

=0;

<numRows;

++)

444

temp_hash = seq_hash[q_oid][

][

+2*

];

445 key

[

*4] = (temp_hash) & 0xff;

446 key

[1+

*4] = (temp_hash >> 8) & 0xff;

447 key

[2+

*4] = (temp_hash >> 16) & 0xff;

448 key

[3+

*4] = (temp_hash >> 24) & 0xff;

453

lsh[foo].push_back(total_chunks);

465

vector<string> paths;

468

vector<TSeqRange> range_vec;

469

vector<string> volname_vec;

471 for

(vector<string>::iterator iter=paths.begin(); iter != paths.end(); ++iter)

476 string

volName = base + ext;

477

volname_vec.push_back(volName);

482

range.

SetTo

(oid_offset);

483

range_vec.push_back(range);

487 int

numVols =

static_cast<int>

(paths.size());

488 #pragma omp parallel for num_threads(numThreads) 489 for

(

int

index=0; index<numVols; index++)

491 x_BuildIndex

(volname_vec[index], range_vec[index].GetFrom(), range_vec[index].GetTo());

499 char

* loadBadMers = getenv(

"LOADBADMERS"

);

510

badMers.push_back(badKmer);

511

cerr << badKmer <<

'\n'

;

515 char

* noBadMers = getenv(

"NOBADMERS"

);

517 return

vector<int>();

521 const int

kLength=10;

522 int array

[] = {139810, 69905, 70161, 70177, 74257,

523

69921, 69906, 74001, 135441, 69922};

528 return

vector<int>();

537 int

vectorRandNums=0;

548 string

indexFile = name +

".pki"

;

549 string

dataFile = name +

".pkd"

;

561

num_seqs = stop - start;

566

index_file.write((

char

*) &(

m_Version

), 4);

567

index_file.write((

char

*) &(num_seqs), 4);

569

index_file.write((

char

*) &(

m_Samples

), 4);

570

index_file.write((

char

*) &(

m_KmerSize

), 4);

572

index_file.write((

char

*) &(

m_Compress

), 4);

573

index_file.write((

char

*) &(

m_Alphabet

), 4);

574

index_file.write((

char

*) &(StartLSH), 4);

575

index_file.write((

char

*) &(kSizeLSH), 4);

585 for

(

int

q_oid=0;q_oid<3*num_seqs;q_oid++)

592

vector < vector < vector < uint32_t > > > seq_hash(num_seqs);

595 for

(

int

q_oid=0;q_oid<num_seqs;q_oid++)

625 const uint32_t

kUniqueHash = 0x1000000;

627 for

(

uint32_t

index=0; index<kUniqueHash; index++)

628

uniqueHash[index] = 0;

630

vector< vector<uint32_t> > lsh(kSizeLSH);

633

vector < vector <int> > kvector;

638 for

(

int

q_oid=0;q_oid<num_seqs;q_oid++)

645

total_chunks, uniqueHash, kvector);

653 for

(

uint32_t

index=0; index<kUniqueHash; index++)

655 if

(uniqueHash[index] > 0)

658 delete

[] uniqueHash;

662 for

(

int

index=0; index<kSizeLSH-1; index++)

664

LSHMatchSize += lsh[index].size();

668 const uint64_t

kLSHMatchEnd = 4*LSHMatchSize + StartLSH + 8*kSizeLSH;

669

index_file.write((

char

*) &(kLSHMatchEnd), 8);

676 const int

kFutureUse=0;

677

index_file.write((

char

*) &(kFutureUse), 4);

679

index_file.write((

char

*) &(kFutureUse), 4);

686

index_file.write((

char

*) &(

[

]), 4);

688

index_file.write((

char

*) &(

[

]), 4);

693 int

num=

static_cast<int>

(badMers.size());

696

index_file.write((

char

*) &(num), 4);

697 for

(vector<int>::iterator iter=badMers.begin(); iter != badMers.end() &&

<2*

m_NumHashFct

; ++iter, ++

)

698

index_file.write((

char

*) &(*iter), 4);

703

index_file.write((

char

*) &(kZero), 4);

713

vector<int> temp = kvector[

];

715

index_file.write((

char

*) &(temp[j]), 1);

720 for

(

int i

=0;

<extra;

++)

721

index_file.write((

char

*) &(temp), 1);

725 uint64_t

lsh_offset = StartLSH + 8*kSizeLSH;

727 for

(

int

index=0; index<kSizeLSH-1; index++)

729 if

(lsh[index].

size

() == 0)

730

index_file.write((

char

*) &(kNoValue), 8);

732

index_file.write((

char

*) &(lsh_offset), 8);

733

lsh_offset += 4*(lsh[index].size());

735

index_file.write((

char

*) &(lsh_offset), 8);

738 for

(

int

index=0; index<kSizeLSH-1; index++)

740 for

(vector<uint32_t>::iterator

=lsh[index].begin();

!= lsh[index].end(); ++

)

742

index_file.write((

char

*) &(*i), 4);

767 for

(

int

q_oid=0;q_oid<num_seqs;q_oid++)

769 int

num_chunks =

static_cast<int>

(seq_hash[q_oid].size());

770 for

(

int n

=0;

<num_chunks;

++)

772

vector<uint32_t> tmp_hash;

778

tmp_hash.push_back(hash_val);

784

tmp_hash.push_back(hash_val);

790

tmp_hash.push_back(seq_hash[q_oid][

][

]);

794 std::sort

(tmp_hash.begin(), tmp_hash.end());

797

data_file.write((

char

*) &(tmp_hash[

]), width);

799

data_file.write((

char

*) &(seq_hash[q_oid][

][

m_NumHashFct

]), 4);

static void s_Get_LSH_index_hashes5(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numHashes, int numRows, int &total_chunks, uint8_t *uniqueHash)

uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)

static Uint4 FNV_hash(uint32_t num)

FNV Hash. See http://www.isthe.com/chongo/tech/comp/fnv/index.html.

void s_MinhashSequences(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, const uint32_t *a, const uint32_t *b, bool do_seg, int kmerNum, int oidOffset, int alphabetChoice, int version, int chunkSize)

static void s_Get_LSH_index_hashes2(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int num_k, int num_l, int array_size, int &total_chunks, uint8_t *uniqueHash, vector< vector< int > > &kvector)

static void s_Get_LSH_index_hashes(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numBands, int numRows, int &total_chunks, uint8_t *uniqueHash)

void s_MinhashSequences2(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, int kmerNum, int oidOffset, int alphabetChoice, int version, vector< int > badMers, int chunkSize)

vector< int > s_BlastKmerLoadBadMers(int alphabet)

int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)

Calculates the number of differences between two minhash arrays.

set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)

Get KMERs for a given sequence using a compressed alphabet.

void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)

Get the random numbers for the hash function.

set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)

Get KMERs for a given sequence using a compressed alphabet.

int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int chunkSize)

Breaks a sequence up into chunks if the sequence is above a certain length.

void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)

Function to get the k sites to compare for Buhler LSH.

CRef< CSeqDB > m_SeqDB

Residues in kmer.

int m_ChunkSize

version of index file

int m_KmerSize

Number of rows per band.

int m_Compress

Number of samples (Buhler only)

bool m_DoSeg

BLAST database.

int m_Version

0 for 15 letters, 1 for 10 letters.

int m_Samples

Should Seg be run on sequences.

void x_BuildIndex(string &name, int start=0, int number=0)

Build index for an individual BLAST volume.

void Build(int numThreads=1)

Build the index.

int m_RowsPerBand

Number of LSH bands.

int m_NumBands

Number of hash functions.

int m_Alphabet

Compress the arrays for Jaccard matches.

void x_WriteDataFile(vector< vector< vector< uint32_t > > > &seq_hash, int num_seqs, CNcbiOfstream &data_file)

Writes out the data file.

static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)

Find volume paths.

void GetSequenceAsString(int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const

Get a sequence in a given encoding.

const string & GetDBNameList() const

Get list of database names.

int GetSeqLength(int oid) const

Returns the sequence length in base pairs or residues.

int GetNumSeqs() const

Returns the number of sequences available.

void SetNumberOfThreads(int num_threads, bool force_mt=false)

Setting the number of threads.

const_iterator begin() const

parent_type::iterator iterator

const_iterator end() const

std::ofstream out("events_result.xml")

main entry point for tests

static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)

#define NCBI_THROW(exception_class, err_code, message)

Generic macro to throw an exception, given the exception class, error code and message string.

static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)

Split a path string into its basic components.

uint8_t Uint1

1-byte (8-bit) unsigned integer

int32_t Int4

4-byte (32-bit) signed integer

uint32_t Uint4

4-byte (32-bit) unsigned integer

#define END_NCBI_SCOPE

End previously defined NCBI scope.

#define END_SCOPE(ns)

End the previously defined scope.

#define BEGIN_NCBI_SCOPE

Define ncbi namespace.

#define BEGIN_SCOPE(ns)

Define a new scope.

IO_PREFIX::ofstream CNcbiOfstream

Portable alias for ofstream.

IO_PREFIX::ifstream CNcbiIfstream

Portable alias for ifstream.

void SetFrom(TFrom value)

Assign a value to From data member.

void SetTo(TTo value)

Assign a value to To data member.

#define KMER_LSH_ARRAY_SIZE

constexpr auto sort(_Init &&init)

const string version

version string

const struct ncbi::grid::netcache::search::fields::SIZE size

const struct ncbi::grid::netcache::search::fields::KEY key

Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...

std::istream & in(std::istream &in_, double &x_)

double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)

uint32_t do_pearson_hash(unsigned char *key, int length)

uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)

Pearson hash an integer into two bytes.

unsigned char pearson_hash_int2byte(uint32_t input, int seed1)

Pearson hash an integer into one byte.

RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4