A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/blastkmerutils_8cpp_source.html below:

NCBI C++ ToolKit: src/algo/blast/proteinkmer/blastkmerutils.cpp Source File

59  return

( (

a

*x +

b

) % p );

66  const Uint4

fnv_prime = 16777619u;

67  const Uint4

fnv_offset_basis = 2166136261u;

73  key

[1] = (num >> 8) & 0xff;

74  key

[2] = (num >> 16) & 0xff;

75  key

[3] = (num >> 24) & 0xff;

78  hash

= fnv_offset_basis;

79  for

(

i

= 0;

i

< 4;

i

++) {

97  for

(

int

h=0;h<num_hashes;h++)

103  return

(

double

) score / num_hashes;

117  for

(

int

h=0;h<num_hashes;h++)

119  while

(bindex < num_hashes &&

a

[h] >

b

[bindex])

123  if

(bindex == num_hashes)

125  if

(

a

[h] ==

b

[bindex])

129  return

(

double

) score / num_hashes;

137

vector<Uint1> trans_table;

140  int

seq_length =

static_cast<int>

(query_sequence.length());

141  const char

*

query

= query_sequence.c_str();

144  if

(seq_length < kmerNum)

150  for

(

int i

=0;

i

<seq_length-kmerNum+1;

i

++)

155  for

(

int

kindex=0; kindex<kmerNum; kindex++)

157  if

(

query

[

i

+kindex] == 21)

162

index = (index << 4);

163

index += trans_table[

query

[

i

+kindex]];

169  string

kmer_str=

""

;

175

kmerCountPriv[kmer_str]++;

177  if

(index != 0 &&

i

<seq_length-kmerNum &&

query

[

i

+kmerNum] != 21)

179  int

index_short = index;

180

index = (index << 4);

181

index += trans_table[

query

[

i

+kmerNum]];

182  string

kmer_str=

""

;

184

kmerCountPlusPriv[kmer_str]++;

191

kmerCount[(*i).first]++;

193

kmerCountPlus[(*i).first]++;

198

kmerCount[(*i).first] += (*i).second;

200

kmerCountPlus[(*i).first] += (*i).second;

211

vector<Uint1> trans_table;

214  int

seq_length =

static_cast<int>

(query_sequence.length());

215  const char

*

query

= query_sequence.c_str();

218  if

(seq_length < kmerNum)

222  char

*query_private=(

char

*)

malloc

(chunk_length);

224  for

(

unsigned int i

=range.

GetFrom

();

i

<=range.

GetTo

();

i

++, private_index++)

225

query_private[private_index] =

query

[

i

];

232  SeqBufferSeg

((

unsigned char

*)query_private, chunk_length, 0, sp, &seq_locs);

238  for

(

int i

=itr->ssr->left; i <= itr->ssr->right;

i

++)

239

query_private[

i

]=21;

245  for

(

int i

=0;

i

<chunk_length-kmerNum+1;

i

++)

250  for

(

int

kindex=0; kindex<kmerNum; kindex++)

252  if

(query_private[

i

+kindex] == 21)

257

index = (index << 4);

258

index += trans_table[query_private[

i

+kindex]];

265  free

(query_private);

275

vector<Uint1> trans_table;

278  int

seq_length =

static_cast<int>

(query_sequence.length());

279  const char

*

query

= query_sequence.c_str();

282  if

(seq_length < kmerNum)

286  char

*query_private=(

char

*)

malloc

(chunk_length);

288  for

(

unsigned int i

=range.

GetFrom

();

i

<=range.

GetTo

();

i

++, private_index++)

289

query_private[private_index] =

query

[

i

];

291  for

(

int i

=0;

i

<chunk_length-kmerNum+1;

i

++)

296  for

(

int

kindex=0; kindex<kmerNum; kindex++)

298  if

(query_private[

i

+kindex] == 21)

303

index = (index << 4);

304

index += trans_table[query_private[

i

+kindex]];

308  if

(

i

< chunk_length-kmerNum && !badMers.empty())

310

std::vector<int>::iterator it;

311

it = std::find(badMers.begin(), badMers.end(), index);

312  if

(it != badMers.end() &&

i

< chunk_length-1)

314

index = (index << 4);

315

index += trans_table[query_private[

i

+kmerNum]];

323  free

(query_private);

332  const unsigned int

kAlphabetLen = 28;

335  const char

* kCompAlphabets[] = {

337  "ST IJV LM KR EQZ A G BD P N F Y H C W"

,

339  "IJLMV AST BDENZ KQR G FY P H C W" 343  const char

* trans_string = kCompAlphabets[alphabetChoice];

345  Uint4

compressed_letter = 1;

347

trans_table.resize(kAlphabetLen + 1, 0);

348  for

(

Uint4 i

= 0;

i

< strlen(trans_string);

i

++) {

352  else if

(

isalpha

(trans_string[

i

])) {

355  _ASSERT

(aa_letter < trans_table.size());

357

trans_table[aa_letter] = compressed_letter;

366  const int

kOverlap=50;

376

numChunks =

MAX

(1, (length - kOverlap)/(

ChunkSize

- kOverlap));

377

newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;

381

newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;

386  for

(

int i

=0;

i

<numChunks;

i

++)

393

range_v.push_back(range);

403  int

num_hashes =

static_cast<int>

(minhash1.size());

405  for

(

int

index=0; index<num_hashes; index++)

407  if

(minhash1[index] != minhash2[index])

416

vector < vector <uint32_t> >& seq_hash,

425  bool

kmersFound=

false

;

426  int

seq_length =

static_cast<int>

(

query

.length());

427

vector<TSeqRange> range_v;

429

seq_hash.resize(chunk_num);

430  bool

seg = (do_seg > 0) ?

true

:

false

;

432

vector<uint32_t> idx_tmp(num_hashes);

433

vector<uint32_t> hash_tmp(num_hashes);

435  for

(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)

438

seq_hash[chunk_iter].resize(num_hashes);

442  if

(seq_kmer.

empty

())

447  for

(

int

h=0;h<num_hashes;h++)

449

hash_tmp[h]=0xffffffff;

450

idx_tmp[h]=0xffffffff;

457  for

(

int

h=0;h<num_hashes;h++)

461  if

(hashval < hash_tmp[h])

463

hash_tmp[h] = hashval;

470  for

(

int

h=0;h<num_hashes;h++)

472

seq_hash[chunk_iter][h] = idx_tmp[h];

480

vector < vector <uint32_t> >& seq_hash,

487  bool

kmersFound=

false

;

488  int

seq_length =

static_cast<int>

(

query

.length());

489

vector<TSeqRange> range_v;

491

seq_hash.resize(chunk_num);

493

vector<uint32_t> hash_values;

495  for

(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)

498

seq_hash[chunk_iter].resize(numHashes);

502  if

(seq_kmer.

empty

())

511

hash_values.push_back(hashval);

515  if

(hash_values.size() <

static_cast<size_t>

(numHashes))

517  int

rem = 1 + numHashes -

static_cast<int>

(hash_values.size());

519  for

(

int i

=0;

i

<rem;

i

++)

520

hash_values.push_back(hashval);

522  std::sort

(hash_values.begin(), hash_values.end());

525  for

(

int

h=0;h<numHashes;h++)

526

seq_hash[chunk_iter][h] = hash_values[h];

533

vector < vector <uint32_t> >&lsh_hash_vec,

537  int

num_chunks=

static_cast<int>

(query_hash.size());

539  for

(

int n

=0;

n

<num_chunks;

n

++)

541

vector <uint32_t> lsh_chunk_vec;

542  for

(

int b

=0;

b

<num_bands;

b

++)

546  unsigned char key

[9];

547  for

(

int r

=0;

r

<rows_per_band;

r

++)

549

temp_hash = query_hash[

n

][

b

*rows_per_band+

r

];

550  key

[

r

*4] = (temp_hash) & 0xff;

551  key

[1+

r

*4] = ((temp_hash) >> 8) & 0xff;

552  key

[2+

r

*4] = ((temp_hash) >> 16) & 0xff;

553  key

[3+

r

*4] = ((temp_hash) >> 24) & 0xff;

555  key

[8] = (

unsigned

char)

b

;

557

lsh_chunk_vec.push_back(foo);

559  std::sort

(lsh_chunk_vec.begin(), lsh_chunk_vec.end());

560

lsh_hash_vec.push_back(lsh_chunk_vec);

567

vector < vector <uint32_t> >&lsh_hash_vec,

571  int

num_chunks=

static_cast<int>

(query_hash.size());

573  int

numHashMax = numHashes - numRows + 1;

574  for

(

int n

=0;

n

<num_chunks;

n

++)

576

vector <uint32_t> lsh_chunk_vec;

577  for

(

int b

=0;

b

<numHashMax;

b

++)

580  unsigned char key

[8];

581  for

(

int r

=0;

r

<numRows;

r

++)

583

temp_hash = query_hash[

n

][

b

+

r

];

584  key

[

r

*4] = (temp_hash) & 0xff;

585  key

[1+

r

*4] = ((temp_hash) >> 8) & 0xff;

586  key

[2+

r

*4] = ((temp_hash) >> 16) & 0xff;

587  key

[3+

r

*4] = ((temp_hash) >> 24) & 0xff;

590

lsh_chunk_vec.push_back(foo);

592  for

(

int b

=0;

b

<numHashMax-1;

b

++)

594  unsigned char key

[8];

595  for

(

int r

=0;

r

<numRows;

r

++)

597

temp_hash = query_hash[

n

][

b

+2*

r

];

598  key

[

r

*4] = (temp_hash) & 0xff;

599  key

[1+

r

*4] = ((temp_hash) >> 8) & 0xff;

600  key

[2+

r

*4] = ((temp_hash) >> 16) & 0xff;

601  key

[3+

r

*4] = ((temp_hash) >> 24) & 0xff;

604

lsh_chunk_vec.push_back(foo);

606  std::sort

(lsh_chunk_vec.begin(), lsh_chunk_vec.end());

607

lsh_hash_vec.push_back(lsh_chunk_vec);

617  for

(

int i

=0;

i

<numHashes;

i

++)

623  while

(

a

[

i

] == 0);

629 void GetKValues

(vector< vector <int> >& kvector,

int

k_value,

int

l_value,

int

array_size)

634  for

(

int i

=0;

i

<l_value;

i

++)

637  for

(

int

k=0; k<k_value; k++)

640

ktemp.push_back(temp);

642

kvector.push_back(ktemp);

652

vector < vector <uint32_t> >& lsh_hash_vec,

655

vector< vector<int> >& kvector)

658

vector<unsigned char>

key

(

max

);

659  int

num_chunks=

static_cast<int>

(query_hash.size());

662  for

(

int n

=0;

n

<num_chunks;

n

++)

664

vector <uint32_t> lsh_chunk_vec;

665  for

(

int r

=0;

r

<num_l;

r

++)

667  for

(

int i

=0;

i

<num_k;

i

++)

669

temp_index=kvector[

r

][

i

];

670

temp_hash = query_hash[

n

][temp_index];

671  key

[

i

*4] = (temp_hash) & 0xff;

672  key

[1+

i

*4] = ((temp_hash) >> 8) & 0xff;

673  key

[2+

i

*4] = ((temp_hash) >> 16) & 0xff;

674  key

[3+

i

*4] = ((temp_hash) >> 24) & 0xff;

678

lsh_chunk_vec.push_back(foo);

680  std::sort

(lsh_chunk_vec.begin(), lsh_chunk_vec.end());

681

lsh_hash_vec.push_back(lsh_chunk_vec);

691  int

num=

static_cast<int>

(query_LSH_hash.size());

692  for

(

int i

=0;

i

<num;

i

++)

694  for

(vector<uint32_t>::const_iterator iter=query_LSH_hash[

i

].begin(); iter != query_LSH_hash[

i

].end(); ++iter)

696  if

(lsh_array[*iter] != 0)

697

candidates[

i

].insert(*iter);

709  int

num_chunks=

static_cast<int>

(query_hash.size());

711  for

(

int n

=0;

n

<num_chunks;

n

++)

713

vector<uint32_t> tmp_hash;

714  for

(vector<uint32_t>::const_iterator

i

= query_hash[

n

].begin();

i

!= query_hash[

n

].end(); ++

i

)

726

tmp_hash.push_back(0xffff);

728

tmp_hash.push_back(0xff);

730

tmp_hash.push_back(kBig);

733

tmp_hash.push_back(hash_value);

736  std::sort

(tmp_hash.begin(), tmp_hash.end());

737

query_hash_hash.push_back(tmp_hash);

755  int

num_chunks=

static_cast<int>

(query_hash.size());

757

vector< vector<uint32_t> > candidate_oids;

758

candidate_oids.resize(num_chunks);

760  for

(

int

num=0; num<num_chunks; num++)

768  while

(lsh[index] == 0)

774  while

(index < remaining)

776  int

oid = *(oid_offset+index);

777

candidate_oids[num].push_back(oid);

784  for

(

int

index=0; index<num_chunks; index++)

786

kmer_stats.

hit_count

+= candidates[index].size();

790

vector < vector <uint32_t> > query_hash_hash;

793

vector <uint32_t> subject_hash;

794

subject_hash.resize(num_hashes);

799  for

(

int n

=0;

n

<num_chunks;

n

++)

801  std::sort

(candidate_oids[

n

].begin(), candidate_oids[

n

].end());

804  for

(vector<uint32_t>::iterator

i

= candidate_oids[

n

].begin();

i

!= candidate_oids[

n

].end(); ++

i

)

806  if

(last_oid == *

i

)

809  if

(min_hits != oid_count)

824

mhfile.

GetMinHits

(oid, subject_oid, subject_hash);

825  double

current_score=0;

827

current_score =

estimate_jaccard

(query_hash_hash[

n

], subject_hash, num_hashes);

831  if

(current_score < thresh)

835  if

(!score_map.count(subject_oid))

836

score_map.

insert

(score_mapValType(subject_oid, current_score));

837  else if

(current_score > score_map[subject_oid])

838

score_map[subject_oid] = current_score;

845  double

total_score = (*i).second;

846  if

(total_score > thresh)

848

score_vector.push_back((*

i

));

883  for

(

i

=0;

i

<lshSize;

i

++)

885  if

(lsh_array[

i

] > 0)

894

vector<uint32_t> hits;

901  if

(hits.size() !=

static_cast<size_t>

(num_hashes))

909  for

(vector<uint32_t>::iterator iter=hits.begin(); iter != hits.end(); ++iter)

913

error_msg =

"Signature array for version 3 index not in ascending order in volume "

+

NStr::NumericToString

(volume);

926

vector<string> kmerFiles;

928  int

numFiles =

static_cast<int>

(kmerFiles.size());

929  for

(

int i

=0;

i

<numFiles;

i

++)

935  catch

(

const

blast::CMinHashException& e) {

936

error_msg = e.GetMsg();

940

error_msg = e.GetMsg();

944

error_msg =

"Unknown error"

;

Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...

BLAST filtering functions.

BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)

Deallocate all BlastSeqLoc objects in a chain.

Int2 SeqBufferSeg(Uint1 *sequence, Int4 length, Int4 offset, SegParameters *sparamsp, BlastSeqLoc **seg_locs)

Runs seg on a protein sequence in ncbistdaa.

SegParameters * SegParametersNewAa(void)

Allocated SeqParameter struct for proteins and fills with default values.

void SegParametersFree(SegParameters *sparamsp)

Free SegParameters structure.

static int s_BlastKmerVerifyVolume(CMinHashFile &mhfile, string &error_msg, int volume)

uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)

void BlastKmerGetCompressedTranslationTable(vector< Uint1 > &trans_table, int alphabetChoice)

Creates translation table for compressed alphabets.

int BlastKmerVerifyIndex(CRef< CSeqDB > seqdb, string &error_msg)

double estimate_jaccard(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)

int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int ChunkSize)

Breaks a sequences up into chunks if the sequences is above a certain length.

int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)

Calculates the number of differences between two minhash arrays.

void get_LSH_hashes(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)

void get_LSH_hashes5(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)

Gets the LSH hash for one hash function.

set< uint32_t > BlastKmerGetKmerSetStats(const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery)

Simplified version of BlastKmerGetKmerSet.

DEFINE_STATIC_MUTEX(randMutex)

set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)

Get KMERs for a given sequence using a compressed alphabet.

bool minhash_query2(const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)

Hash the query for the minimum values;.

static uint32_t FNV_hash(uint32_t num)

FNV hash, see http://www.isthe.com/chongo/tech/comp/fnv/index.html.

void s_HashHashQuery(const vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &query_hash_hash, int compress, int version)

void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)

Get the random numbers for the hash function.

set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)

Get KMERs for a given sequence using a compressed alphabet.

void neighbor_query(const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVer)

double estimate_jaccard2(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)

bool minhash_query(const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)

void get_LSH_match_from_hash(const vector< vector< uint32_t > > &query_LSH_hash, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)

void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)

Function to get the k sites to compare for Buhler LSH.

void get_LSH_hashes2(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kvector)

vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector

Vector of pairs of database OIDs and scores.

Access data in Minhash files.

int GetLSHSize(void) const

uint32_t * GetMinHits(int oid) const

int GetNumHashes(void) const

Returns the number of values in an array of hashes (probably 32)

uint64_t * GetLSHArray(void) const

int GetVersion(void) const

int GetNumSeqs(void) const

int GetDataWidth(void) const

int GetKmerSize(void) const

Returns the length of the KMER.

int * GetHits(uint64_t offset) const

static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)

Find volume paths.

const_iterator begin() const

const_iterator end() const

iterator_bool insert(const value_type &val)

iterator_bool insert(const value_type &val)

const_iterator begin() const

parent_type::iterator iterator

const_iterator end() const

static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)

const Uint1 AMINOACID_TO_NCBISTDAA[]

Translates between ncbieaa and ncbistdaa.

unsigned int TSeqPos

Type for sequence locations and lengths.

uint8_t Uint1

1-byte (8-bit) unsigned integer

int32_t Int4

4-byte (32-bit) signed integer

uint32_t Uint4

4-byte (32-bit) unsigned integer

TValue GetRand(void)

Get the next random number in the interval [0..GetMax()] (inclusive)

position_type GetLength(void) const

#define END_NCBI_SCOPE

End previously defined NCBI scope.

#define BEGIN_NCBI_SCOPE

Define ncbi namespace.

static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)

Convert numeric value to string.

void SetFrom(TFrom value)

Assign a value to From data member.

TTo GetTo(void) const

Get the To member data.

TFrom GetFrom(void) const

Get the From member data.

void SetTo(TTo value)

Assign a value to To data member.

unsigned int

A callback function used to compare two keys in a database.

pair< int, int > ChunkSize(const CSpliced_exon_chunk &chunk)

constexpr auto sort(_Init &&init)

const string version

version string

const struct ncbi::grid::netcache::search::fields::KEY key

#define MIN(a, b)

returns smaller of a and b.

#define MAX(a, b)

returns larger of a and b.

Defines the CNcbiApplication and CAppException classes for creating NCBI applications.

double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)

uint32_t do_pearson_hash(unsigned char *key, int length)

uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)

Pearson hash an integer into two bytes.

unsigned char pearson_hash_int2byte(uint32_t input, int seed1)

Pearson hash an integer into one byte.

static size_t read_size(CNcbiIstream &stream, const char *name)

Defines BLAST database access classes.

Structure for ancillary data on KMER search.

int jd_count

How often was the Jaccard distance calculated.

int total_matches

How many matches returned.

int oids_considered

How many OIDs were considered as candidates.

int hit_count

How many hits to the hash array were there?

int jd_oid_count

How many OIDs was the Jaccard distance calculated for.

Used to hold a set of positions, mostly used for filtering.

struct BlastSeqLoc * next

next in linked list

Structure to hold parameters for seg search.


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4