A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/blastkmerindex_8cpp_source.html below:

NCBI C++ ToolKit: src/algo/blast/proteinkmer/blastkmerindex.cpp Source File

116

: m_NumHashFct(numHashFct),

117

m_KmerSize(kmerSize),

122

m_Alphabet(alphabet),

124

m_ChunkSize(chunkSize)

131

m_NumBands = m_NumHashFct/m_RowsPerBand;

141  return

( (

a

*x +

b

) % p );

148  const Uint4

fnv_prime = 16777619u;

149  const Uint4

fnv_offset_basis = 2166136261u;

154  key

[0] = num & 0xff;

155  key

[1] = (num >> 8) & 0xff;

156  key

[2] = (num >> 16) & 0xff;

157  key

[3] = (num >> 24) & 0xff;

160  hash

= fnv_offset_basis;

161  for

(

i

= 0;

i

< 4;

i

++) {

172

vector < vector < vector < uint32_t > > > & seq_hash,

184  int

fullOID=q_oid+oidOffset;

186

vector<TSeqRange> range_v;

188

seq_hash[q_oid].resize(chunk_num);

193  bool

first_time=

true

;

195  for

(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)

200  if

(seq_kmer.

empty

())

206

vector<uint32_t> idx_tmp(num_hashes);

207

vector<uint32_t> hash_tmp(num_hashes);

209  for

(

int

h=0;h<num_hashes;h++)

211

hash_tmp[h]=0xffffffff;

212

idx_tmp[h]=0xffffffff;

219  for

(

int

h=0;h<num_hashes;h++)

223  if

(hashval < hash_tmp[h])

225

hash_tmp[h] = hashval;

231  if

(first_time ==

false

)

244

seq_hash[q_oid][chunk_counter].resize(num_hashes+1);

245  for

(h=0;h<num_hashes;h++)

246

seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];

250

seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;

252

seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;

256  if

(chunk_num > chunk_counter+1)

257

seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());

258  if

(first_time ==

true

)

259

seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());

266

vector < vector < vector < uint32_t > > > & seq_hash,

276  int

fullOID=q_oid+oidOffset;

278

vector<TSeqRange> range_v;

280

seq_hash[q_oid].resize(chunk_num);

285  bool

first_time=

true

;

287  for

(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)

292  if

(seq_kmer.

empty

())

298

vector<uint32_t> hash_values;

299

vector<uint32_t> idx_tmp(num_hashes);

305

hash_values.push_back(hashval);

308  if

(hash_values.size() <

static_cast<size_t>

(num_hashes))

310  int

rem = 1 + num_hashes -

static_cast<int>

(hash_values.size());

312  for

(

int i

=0;

i

<rem;

i

++)

313

hash_values.push_back(hashval);

315  std::sort

(hash_values.begin(), hash_values.end());

317  for

(

int i

=0;

i

<num_hashes;

i

++)

318

idx_tmp[

i

] = hash_values[

i

];

320  if

(first_time ==

false

)

333

seq_hash[q_oid][chunk_counter].resize(num_hashes+1);

334  for

(h=0;h<num_hashes;h++)

335

seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];

339

seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;

341

seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;

345  if

(chunk_num > chunk_counter+1)

346

seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());

347  if

(first_time ==

true

)

348

seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());

354  int

q_oid,

int

numBands,

int

numRows,

int

& total_chunks,

uint8_t

* uniqueHash)

357  int

num_chunks =

static_cast<int>

(seq_hash[q_oid].size());

358  for

(

int n

=0;

n

<num_chunks;

n

++)

360  for

(

int b

=0;

b

<numBands;

b

++)

362  unsigned char key

[9];

363  for

(

int r

=0;

r

<numRows;

r

++)

365  key

[

r

*4] = (seq_hash[q_oid][

n

][

b

*numRows+

r

]) & 0xff;

366  key

[1+

r

*4] = ((seq_hash[q_oid][

n

][

b

*numRows+

r

]) >> 8) & 0xff;

367  key

[2+

r

*4] = ((seq_hash[q_oid][

n

][

b

*numRows+

r

]) >> 16) & 0xff;

368  key

[3+

r

*4] = ((seq_hash[q_oid][

n

][

b

*numRows+

r

]) >> 24) & 0xff;

370  key

[8] = (

unsigned

char)

b

;

374

lsh[foo].push_back(total_chunks);

382  int

q_oid,

int

num_k,

int

num_l,

int

array_size,

int

& total_chunks,

uint8_t

* uniqueHash,

383

vector < vector <int> >& kvector)

387

vector<unsigned char>

key

(

max

);

388  int

num_chunks=

static_cast<int>

(seq_hash[q_oid].size());

391  for

(

int n

=0;

n

<num_chunks;

n

++)

393  for

(

int r

=0;

r

<num_l;

r

++)

395  for

(

int i

=0;

i

<num_k;

i

++)

397

temp_index = kvector[

r

][

i

];

398

temp_hash = seq_hash[q_oid][

n

][temp_index];

399  key

[

i

*4] = (temp_hash) & 0xff;

400  key

[1+

i

*4] = ((temp_hash) >> 8) & 0xff;

401  key

[2+

i

*4] = ((temp_hash) >> 16) & 0xff;

402  key

[3+

i

*4] = ((temp_hash) >> 24) & 0xff;

408

lsh[foo].push_back(total_chunks);

416  int

q_oid,

int

numHashes,

int

numRows,

int

& total_chunks,

uint8_t

* uniqueHash)

419  int

num_chunks =

static_cast<int>

(seq_hash[q_oid].size());

420  int

numHashMax = numHashes - numRows + 1;

422  for

(

int n

=0;

n

<num_chunks;

n

++)

424  for

(

int b

=0;

b

<numHashMax;

b

++)

426  unsigned char key

[12];

427  for

(

int r

=0;

r

<numRows;

r

++)

429  key

[

r

*4] = (seq_hash[q_oid][

n

][

b

+

r

]) & 0xff;

430  key

[1+

r

*4] = ((seq_hash[q_oid][

n

][

b

+

r

]) >> 8) & 0xff;

431  key

[2+

r

*4] = ((seq_hash[q_oid][

n

][

b

+

r

]) >> 16) & 0xff;

432  key

[3+

r

*4] = ((seq_hash[q_oid][

n

][

b

+

r

]) >> 24) & 0xff;

437

lsh[foo].push_back(total_chunks);

439  for

(

int b

=0;

b

<numHashMax-1;

b

++)

441  unsigned char key

[8];

442  for

(

int r

=0;

r

<numRows;

r

++)

444

temp_hash = seq_hash[q_oid][

n

][

b

+2*

r

];

445  key

[

r

*4] = (temp_hash) & 0xff;

446  key

[1+

r

*4] = (temp_hash >> 8) & 0xff;

447  key

[2+

r

*4] = (temp_hash >> 16) & 0xff;

448  key

[3+

r

*4] = (temp_hash >> 24) & 0xff;

453

lsh[foo].push_back(total_chunks);

465

vector<string> paths;

468

vector<TSeqRange> range_vec;

469

vector<string> volname_vec;

471  for

(vector<string>::iterator iter=paths.begin(); iter != paths.end(); ++iter)

476  string

volName = base + ext;

477

volname_vec.push_back(volName);

482

range.

SetTo

(oid_offset);

483

range_vec.push_back(range);

487  int

numVols =

static_cast<int>

(paths.size());

488 #pragma omp parallel for num_threads(numThreads) 489  for

(

int

index=0; index<numVols; index++)

491  x_BuildIndex

(volname_vec[index], range_vec[index].GetFrom(), range_vec[index].GetTo());

499  char

* loadBadMers = getenv(

"LOADBADMERS"

);

510

badMers.push_back(badKmer);

511

cerr << badKmer <<

'\n'

;

515  char

* noBadMers = getenv(

"NOBADMERS"

);

517  return

vector<int>();

521  const int

kLength=10;

522  int array

[] = {139810, 69905, 70161, 70177, 74257,

523

69921, 69906, 74001, 135441, 69922};

528  return

vector<int>();

537  int

vectorRandNums=0;

548  string

indexFile = name +

".pki"

;

549  string

dataFile = name +

".pkd"

;

561

num_seqs = stop - start;

566

index_file.write((

char

*) &(

m_Version

), 4);

567

index_file.write((

char

*) &(num_seqs), 4);

569

index_file.write((

char

*) &(

m_Samples

), 4);

570

index_file.write((

char

*) &(

m_KmerSize

), 4);

572

index_file.write((

char

*) &(

m_Compress

), 4);

573

index_file.write((

char

*) &(

m_Alphabet

), 4);

574

index_file.write((

char

*) &(StartLSH), 4);

575

index_file.write((

char

*) &(kSizeLSH), 4);

585  for

(

int

q_oid=0;q_oid<3*num_seqs;q_oid++)

592

vector < vector < vector < uint32_t > > > seq_hash(num_seqs);

595  for

(

int

q_oid=0;q_oid<num_seqs;q_oid++)

625  const uint32_t

kUniqueHash = 0x1000000;

627  for

(

uint32_t

index=0; index<kUniqueHash; index++)

628

uniqueHash[index] = 0;

630

vector< vector<uint32_t> > lsh(kSizeLSH);

633

vector < vector <int> > kvector;

638  for

(

int

q_oid=0;q_oid<num_seqs;q_oid++)

645

total_chunks, uniqueHash, kvector);

653  for

(

uint32_t

index=0; index<kUniqueHash; index++)

655  if

(uniqueHash[index] > 0)

658  delete

[] uniqueHash;

662  for

(

int

index=0; index<kSizeLSH-1; index++)

664

LSHMatchSize += lsh[index].size();

668  const uint64_t

kLSHMatchEnd = 4*LSHMatchSize + StartLSH + 8*kSizeLSH;

669

index_file.write((

char

*) &(kLSHMatchEnd), 8);

676  const int

kFutureUse=0;

677

index_file.write((

char

*) &(kFutureUse), 4);

679

index_file.write((

char

*) &(kFutureUse), 4);

686

index_file.write((

char

*) &(

a

[

i

]), 4);

688

index_file.write((

char

*) &(

b

[

i

]), 4);

693  int

num=

static_cast<int>

(badMers.size());

696

index_file.write((

char

*) &(num), 4);

697  for

(vector<int>::iterator iter=badMers.begin(); iter != badMers.end() &&

i

<2*

m_NumHashFct

; ++iter, ++

i

)

698

index_file.write((

char

*) &(*iter), 4);

703

index_file.write((

char

*) &(kZero), 4);

713

vector<int> temp = kvector[

i

];

715

index_file.write((

char

*) &(temp[j]), 1);

720  for

(

int i

=0;

i

<extra;

i

++)

721

index_file.write((

char

*) &(temp), 1);

725  uint64_t

lsh_offset = StartLSH + 8*kSizeLSH;

727  for

(

int

index=0; index<kSizeLSH-1; index++)

729  if

(lsh[index].

size

() == 0)

730

index_file.write((

char

*) &(kNoValue), 8);

732

index_file.write((

char

*) &(lsh_offset), 8);

733

lsh_offset += 4*(lsh[index].size());

735

index_file.write((

char

*) &(lsh_offset), 8);

738  for

(

int

index=0; index<kSizeLSH-1; index++)

740  for

(vector<uint32_t>::iterator

i

=lsh[index].begin();

i

!= lsh[index].end(); ++

i

)

742

index_file.write((

char

*) &(*i), 4);

767  for

(

int

q_oid=0;q_oid<num_seqs;q_oid++)

769  int

num_chunks =

static_cast<int>

(seq_hash[q_oid].size());

770  for

(

int n

=0;

n

<num_chunks;

n

++)

772

vector<uint32_t> tmp_hash;

778

tmp_hash.push_back(hash_val);

784

tmp_hash.push_back(hash_val);

790

tmp_hash.push_back(seq_hash[q_oid][

n

][

b

]);

794  std::sort

(tmp_hash.begin(), tmp_hash.end());

797

data_file.write((

char

*) &(tmp_hash[

b

]), width);

799

data_file.write((

char

*) &(seq_hash[q_oid][

n

][

m_NumHashFct

]), 4);

static void s_Get_LSH_index_hashes5(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numHashes, int numRows, int &total_chunks, uint8_t *uniqueHash)

uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)

static Uint4 FNV_hash(uint32_t num)

FNV Hash. See http://www.isthe.com/chongo/tech/comp/fnv/index.html.

void s_MinhashSequences(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, const uint32_t *a, const uint32_t *b, bool do_seg, int kmerNum, int oidOffset, int alphabetChoice, int version, int chunkSize)

static void s_Get_LSH_index_hashes2(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int num_k, int num_l, int array_size, int &total_chunks, uint8_t *uniqueHash, vector< vector< int > > &kvector)

static void s_Get_LSH_index_hashes(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numBands, int numRows, int &total_chunks, uint8_t *uniqueHash)

void s_MinhashSequences2(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, int kmerNum, int oidOffset, int alphabetChoice, int version, vector< int > badMers, int chunkSize)

vector< int > s_BlastKmerLoadBadMers(int alphabet)

int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)

Calculates the number of differences between two minhash arrays.

set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)

Get KMERs for a given sequence using a compressed alphabet.

void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)

Get the random numbers for the hash function.

set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)

Get KMERs for a given sequence using a compressed alphabet.

int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int chunkSize)

Breaks a sequences up into chunks if the sequences is above a certain length.

void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)

Function to get the k sites to compare for Buhler LSH.

CRef< CSeqDB > m_SeqDB

Residues in kmer.

int m_ChunkSize

version of index file

int m_KmerSize

Number of rows per band.

int m_Compress

Number of samples (Buhler only)

bool m_DoSeg

BLAST database.

int m_Version

0 for 15 letters, 1 for 10 letters.

int m_Samples

Should Seg be run on sequences.

void x_BuildIndex(string &name, int start=0, int number=0)

BUild index for an individual BLAST volume.

void Build(int numThreads=1)

Build the index.

int m_RowsPerBand

Number of LSH bands.

int m_NumBands

Number of hash functions.

int m_Alphabet

Compress the arrays for Jaccard matches.

void x_WriteDataFile(vector< vector< vector< uint32_t > > > &seq_hash, int num_seqs, CNcbiOfstream &data_file)

Writes out the data file.

static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)

Find volume paths.

void GetSequenceAsString(int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const

Get a sequence in a given encoding.

const string & GetDBNameList() const

Get list of database names.

int GetSeqLength(int oid) const

Returns the sequence length in base pairs or residues.

int GetNumSeqs() const

Returns the number of sequences available.

void SetNumberOfThreads(int num_threads, bool force_mt=false)

Setting the number of threads.

const_iterator begin() const

parent_type::iterator iterator

const_iterator end() const

std::ofstream out("events_result.xml")

main entry point for tests

static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)

#define NCBI_THROW(exception_class, err_code, message)

Generic macro to throw an exception, given the exception class, error code and message string.

static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)

Split a path string into its basic components.

uint8_t Uint1

1-byte (8-bit) unsigned integer

int32_t Int4

4-byte (32-bit) signed integer

uint32_t Uint4

4-byte (32-bit) unsigned integer

#define END_NCBI_SCOPE

End previously defined NCBI scope.

#define END_SCOPE(ns)

End the previously defined scope.

#define BEGIN_NCBI_SCOPE

Define ncbi namespace.

#define BEGIN_SCOPE(ns)

Define a new scope.

IO_PREFIX::ofstream CNcbiOfstream

Portable alias for ofstream.

IO_PREFIX::ifstream CNcbiIfstream

Portable alias for ifstream.

void SetFrom(TFrom value)

Assign a value to From data member.

void SetTo(TTo value)

Assign a value to To data member.

#define KMER_LSH_ARRAY_SIZE

constexpr auto sort(_Init &&init)

const string version

version string

const struct ncbi::grid::netcache::search::fields::SIZE size

const struct ncbi::grid::netcache::search::fields::KEY key

Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...

std::istream & in(std::istream &in_, double &x_)

double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)

uint32_t do_pearson_hash(unsigned char *key, int length)

uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)

Pearson hash an integer into two bytes.

unsigned char pearson_hash_int2byte(uint32_t input, int seed1)

Pearson hash an integer into one byte.


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4