approx_table_entries,
Int4max_q_off,
87 if(approx_table_entries < 250)
95 if(approx_table_entries < 8500)
102 if(approx_table_entries < 1250) {
105}
else if(approx_table_entries < 21000) {
115 if(approx_table_entries < 1250) {
118}
else if(approx_table_entries < 8500) {
121}
else if(approx_table_entries < 18000) {
131 if(approx_table_entries < 12000) {
134}
else if(approx_table_entries < 180000) {
144 if(approx_table_entries < 8500) {
147}
else if(approx_table_entries < 18000) {
150}
else if(approx_table_entries < 60000) {
153}
else if(approx_table_entries < 900000) {
163 if(approx_table_entries < 8500) {
166}
else if(approx_table_entries < 300000) {
182(approx_table_entries >= 32767 || max_q_off >= 32768)) {
202 Int4overflow_cells_needed = 2;
203 Int4overflow_cursor = 2;
204 Int4longest_chain = 0;
205 #ifdef LOOKUP_VERBOSE 206 Int4backbone_occupancy = 0;
207 Int4thick_backbone_occupancy = 0;
208 Int4num_overflows = 0;
216 for(
i= 0;
i<
lookup->backbone_size;
i++) {
217 if(thin_backbone[
i] !=
NULL) {
218 Int4num_hits = thin_backbone[
i][1];
220overflow_cells_needed += num_hits + 1;
221longest_chain =
MAX(longest_chain, num_hits);
231 if(overflow_cells_needed >= 32768) {
232 for(
i= 0;
i<
lookup->backbone_size;
i++)
233 sfree(thin_backbone[
i]);
247 lookup->longest_chain = longest_chain;
250 if(overflow_cells_needed > 0) {
257 for(
i= 0;
i<
lookup->backbone_size;
i++) {
262 if(thin_backbone[
i] ==
NULL) {
263 lookup->final_backbone[
i] = -1;
267 #ifdef LOOKUP_VERBOSE 268backbone_occupancy++;
270num_hits = thin_backbone[
i][1];
276 #ifdef LOOKUP_VERBOSE 277thick_backbone_occupancy++;
279 lookup->final_backbone[
i] = thin_backbone[
i][2];
282 #ifdef LOOKUP_VERBOSE 289 lookup->final_backbone[
i] = -overflow_cursor;
290 for(j = 0; j < num_hits; j++) {
291 lookup->overflow[overflow_cursor++] =
292thin_backbone[
i][j + 2];
298 lookup->overflow[overflow_cursor++] = -1;
302 sfree(thin_backbone[
i]);
305 lookup->overflow_size = overflow_cursor;
307 #ifdef LOOKUP_VERBOSE 308printf(
"SmallNa\n");
309printf(
"backbone size: %d\n",
lookup->backbone_size);
310printf(
"backbone occupancy: %d (%f%%)\n", backbone_occupancy,
311100.0 * backbone_occupancy /
lookup->backbone_size);
312printf(
"thick_backbone occupancy: %d (%f%%)\n",
313thick_backbone_occupancy,
314100.0 * thick_backbone_occupancy /
lookup->backbone_size);
315printf(
"num_overflows: %d\n", num_overflows);
316printf(
"overflow size: %d\n", overflow_cells_needed);
317printf(
"longest chain: %d\n", longest_chain);
341 if(stop - start > 2)
346start = locations->
ssr->
right+1;
347locations = locations->
next;
350stop = locations->
ssr->
left-1;
365 if( !query_options ) {
386 Int4**thin_backbone;
393 lookup->lut_word_length = lut_width;
418 sfree(thin_backbone);
428 if(
lookup->masked_locations)
446 Int4overflow_cells_needed = 0;
447 Int4overflow_cursor = 0;
448 Int4longest_chain = 0;
450 #ifdef LOOKUP_VERBOSE 451 Int4backbone_occupancy = 0;
452 Int4thick_backbone_occupancy = 0;
453 Int4num_overflows = 0;
469 for(
i= 0;
i<
lookup->backbone_size;
i++) {
470 if(thin_backbone[
i] !=
NULL) {
471 Int4num_hits = thin_backbone[
i][1];
473overflow_cells_needed += num_hits;
474longest_chain =
MAX(longest_chain, num_hits);
478 lookup->longest_chain = longest_chain;
481 if(overflow_cells_needed > 0) {
487 for(
i= 0;
i<
lookup->backbone_size;
i++) {
492 if(thin_backbone[
i] ==
NULL)
495 #ifdef LOOKUP_VERBOSE 496backbone_occupancy++;
498num_hits = thin_backbone[
i][1];
499 lookup->thick_backbone[
i].num_used = num_hits;
508 #ifdef LOOKUP_VERBOSE 509thick_backbone_occupancy++;
511 for(j = 0; j < num_hits; j++) {
512 lookup->thick_backbone[
i].payload.entries[j] =
513thin_backbone[
i][j + 2];
517 #ifdef LOOKUP_VERBOSE 520 lookup->thick_backbone[
i].payload.overflow_cursor =
522 for(j = 0; j < num_hits; j++) {
523 lookup->overflow[overflow_cursor] =
524thin_backbone[
i][j + 2];
530 sfree(thin_backbone[
i]);
533 lookup->overflow_size = overflow_cursor;
535 #ifdef LOOKUP_VERBOSE 536printf(
"backbone size: %d\n",
lookup->backbone_size);
537printf(
"backbone occupancy: %d (%f%%)\n", backbone_occupancy,
538100.0 * backbone_occupancy /
lookup->backbone_size);
539printf(
"thick_backbone occupancy: %d (%f%%)\n",
540thick_backbone_occupancy,
541100.0 * thick_backbone_occupancy /
lookup->backbone_size);
542printf(
"num_overflows: %d\n", num_overflows);
543printf(
"overflow size: %d\n", overflow_cells_needed);
544printf(
"longest chain: %d\n", longest_chain);
555 Int4**thin_backbone;
562 lookup->lut_word_length = lut_width;
582 sfree(thin_backbone);
590 if(
lookup->masked_locations)
613}
else if(length == 18) {
618}
else if(length == 21) {
624}
else if(
weight== 12) {
630}
else if(length == 18) {
635}
else if(length == 21) {
670 Int4template_length;
680 const Int4kCompressionFactor=2048;
713 inttemp_int = template_type + 1;
714second_template_type =
723helper_array2 ==
NULL)
749from = loc->
ssr->
left- (template_length - 2);
750to = loc->
ssr->
right- (template_length - 2);
752pos = seq + template_length;
754 for(index = from; index <= to; index++) {
760pos = seq + template_length;
769 #ifdef LOOKUP_VERBOSE 777 #ifdef LOOKUP_VERBOSE 780 PV_SET(pv_array, ecode1, pv_array_bts);
783helper_array[ecode1/kCompressionFactor]++;
795 #ifdef LOOKUP_VERBOSE 798 PV_SET(pv_array, ecode2, pv_array_bts);
801helper_array2[ecode2/kCompressionFactor]++;
809 for(index = 0; index < mb_lt->
hashsize/ kCompressionFactor; index++)
810longest_chain =
MAX(longest_chain, helper_array[index]);
814 sfree(helper_array);
818 for(index = 0; index < mb_lt->
hashsize/ kCompressionFactor; index++)
819longest_chain =
MAX(longest_chain, helper_array2[index]);
823 sfree(helper_array2);
870seq =
query->sequence_start + from;
871pos = seq + kLutWordLength;
875from -= kLutWordLength - 2;
876last_offset = to + 2;
878 for(index = from; index <= last_offset; index++) {
884pos = seq + kLutWordLength;
893 PV_SET(pv_array, ecode, pv_array_bts);
911mb_lt->
hashtable[(
Int8)((1 << (2 * word_size)) - 1)] = 0;
913 if(word_size < 16) {
920 for(
i= 1;
i< 4;
i++) {
922 for(k = 0;k < word_size;k++) {
928 for(
i= 0;
i< 3;
i++) {
929 for(k = 0;k < word_size;k++) {
930word = ((0xffffffff ^ (3 << k*2)) | (
i<< k*2)) & 0xffffffff;
972 const Int4kCompressionFactor=2048;
974 Uint4* helper_array;
988 if(helper_array ==
NULL)
1013 if(lookup_options->
stride> 0) {
1014shift = lookup_options->
stride- 1;
1015pos_shift = kLutWordLength + 1;
1024seq =
query->sequence_start + from;
1025pos = seq + kLutWordLength;
1029from -= kLutWordLength - 2;
1030last_offset = to + 2;
1032 for(index = from; index <= last_offset; index++) {
1038pos = seq + kLutWordLength;
1051 if((counts[ecode / 2] >> 4) >= max_word_count) {
1056 if((counts[ecode / 2] & 0xf) >= max_word_count) {
1077 #ifdef LOOKUP_VERBOSE 1082 #ifdef LOOKUP_VERBOSE 1085 PV_SET(pv_array, ecode, pv_array_bts);
1088helper_array[ecode/kCompressionFactor]++;
1096pos = seq + pos_shift;
1105 for(index = 0; index < mb_lt->
hashsize/ kCompressionFactor; index++)
1106longest_chain =
MAX(longest_chain, helper_array[index]);
1109 sfree(helper_array);
1125 Uint1max_word_count)
1130 Int8word, index, w;
1131 const Int4kNumWords
1138 if(!sequence || !counts || !mb_lt || !pv) {
1147w = (
Int8)s[0] << 24 | (
Int8)s[1] << 16 | (
Int8)s[2] << 8 | s[3];
1148 for(
i= 0;
i< kNumWords;
i++) {
1159word = (w >> shift) &
mask;
1162 if(!
PV_TEST(pv, word, pv_array_bts)) {
1169 if((counts[index] & 0xf) < max_word_count) {
1174 if((counts[index] >> 4) < max_word_count) {
1175counts[index] += 1 << 4;
1194 Uint1max_word_count)
1200 if(!seq_src || !pv || !counts) {
1204memset(&seq_arg, 0,
sizeof(seq_arg));
1231 Int4approx_table_entries,
1238 const Int4kTargetPVSize = 131072;
1239 const Int4kSmallQueryCutoff = 15000;
1240 const Int4kLargeQueryCutoff = 800000;
1251 if(mb_lt ==
NULL) {
1287 if(mb_lt->
hashsize<= 8 * kTargetPVSize)
1299(approx_table_entries <= kSmallQueryCutoff ||
1300approx_table_entries >= kLargeQueryCutoff)) {
1301pv_size = pv_size / 2;
1315 if(counts ==
NULL) {
1344 if(lookup_options->
db_filter&& counts) {
1356 #ifdef LOOKUP_VERBOSE 1357printf(
"lookup table size: %ld (%d letters)\n", mb_lt->
hashsize,
1362printf(
"PV array size: %d bytes (%ld table entries/bit)\n",
1392 const Uint4fnv_prime = 16777619u;
1393 const Uint4fnv_offset_basis = 2166136261u;
1397 hash= fnv_offset_basis;
1398 for(
i= 0;
i< 4;
i++) {
1414 Int4lut_word_length;
1416 const Int4pv_array_bts =
lookup->pv_array_bts;
1420word_length =
lookup->word_length;
1421lut_word_length =
lookup->lut_word_length;
1425 for(loc = locations; loc; loc = loc->
next) {
1443seq =
query->sequence + from;
1444pos = seq + lut_word_length - 1;
1445end =
query->sequence + to + 1;
1447 for(; seq < end; seq++) {
1454pos = seq + lut_word_length;
1479v = v - ((v >> 1) & 0x55555555);
1480v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
1481v = ((v + (v >> 4)) & 0xF0F0F0F);
1507 if(
array->values) {
1511 if(
array->counts) {
1526 if(!retval || !bitfield) {
1539 for(
i= 1;
i< retval->
length;
i++) {
1573bit_count = (idx > 0) ?
array->counts[idx - 1] : 0;
1574 ASSERT(
array->bitfield[idx] & (1 << bit_number));
1577bit_count +=
s_Popcount(
array->bitfield[idx] & ((1 << bit_number) - 1));
1582 returnbit_count - 1;
1596 ASSERT(sparse_index < array->num_elements);
1597 if(sparse_index < 0 || sparse_index >
array->num_elements) {
1601 return array->values + sparse_index;
1616 Uint1max_word_count)
1622 const Int4kNumWords
1630 if(!sequence || !counts || !
lookup|| !pv) {
1642w = (
Int8)s[0] << 24 | (
Int8)s[1] << 16 | (
Int8)s[2] << 8 | s[3];
1643 for(
i= 0;
i< kNumWords;
i++) {
1654word = (w >> shift) &
mask;
1657 if(!
PV_TEST(pv, word, pv_array_bts)) {
1663 if(*pelem < max_word_count) {
1691 if(
th->seq_arg) {
1693 for(
i= 0;
i<
th->num_threads;
i++) {
1701 for(
i= 0;
i<
th->num_threads;
i++) {
1707 if(
th->seq_src) {
1709 for(
i= 0;
i<
th->num_threads;
i++) {
1715 if(
th->word_counts) {
1717 for(
i= 1;
i<
th->num_threads;
i++) {
1718 if(
th->word_counts[
i]) {
1719 if(
th->word_counts[
i]->values) {
1720 free(
th->word_counts[
i]->values);
1722 free(
th->word_counts[
i]);
1728 free(
th->word_counts);
1743 if(num_threads < 1 || !
lookup|| !seq_src) {
1759 if(!retval->
itr) {
1776 for(
i= 0;
i< num_threads;
i++) {
1789 if(!retval->
itr[
i]) {
17961LL << (2 *
lookup->lut_word_length));
1841 Uint4in_num_threads,
1842 Uint1max_word_count)
1846 Int4num_db_seqs, th_batch;
1860num_threads =
MIN(in_num_threads, num_db_seqs);
1872 #pragma omp parallel for if (num_threads > 1) num_threads(num_threads) \ 1873 default(none) shared(num_threads, th_data, lookup, \ 1874 th_batch, max_word_count) private(i) \ 1875 schedule(dynamic, 1) 1877 for(
i= 0;
i< num_threads;
i++) {
1879 for(j = 0;j < th_batch;j++) {
1881 #pragma omp critical (get_sequence_for_word_counts) 1885th_data->
itr[
i]);
1919 for(k = 1;k < num_threads;k++) {
1932 while(i < th_data->word_counts[0]->length) {
1942 ASSERT(k < th_data->word_counts[0]->num_elements);
1983pv_array_bts =
lookup->pv_array_bts;
1984word_size =
lookup->lut_word_length;
1988pv[0xffffffff >> pv_array_bts] &=
1992 for(
i= 1;
i< 4;
i++) {
1994 for(k = 0;k < word_size;k++) {
1995pv[word >> pv_array_bts] &=
2001 for(
i= 0;
i< 3;
i++) {
2002 for(k = 0;k < word_size;k++) {
2003word = ((0xffffffff ^ (3 << k*2)) | (
i<< k*2)) & 0xffffffff;
2005pv[word >> pv_array_bts] &=
2024 Int4overflow_cells_needed = 0;
2025 Int4overflow_cursor = 0;
2026 Int4longest_chain = 0;
2028 const Int4pv_array_bts =
lookup->pv_array_bts;
2029 const Int8kNumWords = 1LL << (2 *
lookup->lut_word_length);
2030 #ifdef LOOKUP_VERBOSE 2031 Int4backbone_occupancy = 0;
2032 Int4thick_backbone_occupancy = 0;
2033 Int4num_overflows = 0;
2034 Int4words_per_hash[5] = {0,};
2048memset(
lookup->pv, 0, (kNumWords >>
lookup->pv_array_bts) *
2061 for(
i= 0;
i<
lookup->backbone_size;
i++) {
2065 if(
b->num_offsets > 0) {
2066 for(;
b;
b=
b->next) {
2067num_hits +=
b->num_offsets;
2075overflow_cells_needed += num_hits + (num_words * 2);
2077longest_chain =
MAX(longest_chain, num_hits);
2080 lookup->longest_chain = longest_chain;
2083 if(overflow_cells_needed > 0) {
2089 for(
i= 0;
i<
lookup->backbone_size;
i++) {
2092 Int4num_offsets = 0;
2098 if(
head->num_offsets == 0) {
2102 #ifdef LOOKUP_VERBOSE 2103thick_backbone_occupancy++;
2108 for(
b=
head;
b;
b=
b->next) {
2110num_offsets +=
b->num_offsets;
2112 #ifdef LOOKUP_VERBOSE 2113backbone_occupancy++;
2118 #ifdef LOOKUP_VERBOSE 2119words_per_hash[((num_words < 6) ? num_words : 5) - 1]++;
2129 for(
b=
head;
b;
b=
b->next, k++) {
2131cell->
words[k] =
b->word;
2148 for(
b=
head;
b;
b=
b->next, k++) {
2149cell->
words[k] =
b->word;
2151is_overflow =
TRUE;
2154is_overflow =
TRUE;
2160 #ifdef LOOKUP_VERBOSE 2163cell->
offsets[0] = overflow_cursor;
2164 for(
b=
head;
b;
b=
b->next) {
2166 lookup->overflow[overflow_cursor++] = *(
Int4*)(&
b->word);
2167 lookup->overflow[overflow_cursor++] =
b->num_offsets;
2172 lookup->overflow[overflow_cursor++] = j - 1;
2176 ASSERT(overflow_cursor <= overflow_cells_needed);
2184 lookup->offsets_size = overflow_cursor;
2186 #ifdef LOOKUP_VERBOSE 2187printf(
"backbone size: %d\n",
lookup->backbone_size);
2188printf(
"backbone occupancy: %d (%f%%)\n", backbone_occupancy,
2189100.0 * backbone_occupancy /
lookup->backbone_size);
2190printf(
"thick_backbone occupancy: %d (%f%%)\n",
2191thick_backbone_occupancy,
2192100.0 * thick_backbone_occupancy /
lookup->backbone_size);
2193printf(
"num_overflows: %d\n", num_overflows);
2194printf(
"\tnumber of words per hash\tcount\n");
2197 for(ii = 0;ii < 5;ii++) {
2198printf(
"\t%d\t%d\n", ii + 1, words_per_hash[ii]);
2201printf(
"overflow size: %d\n", overflow_cells_needed);
2202printf(
"longest chain: %d\n", longest_chain);
2212 if(
lookup->masked_locations)
2235 const Int8kNumWords = (1ULL << 32);
2236 Int4num_hash_bits = 8;
2237 Int4 i, num_unique_words = 0;
2246 lookup->lut_word_length = 16;
2278 for(
i= 0;i < kNumWords >>
lookup->pv_array_bts;
i++) {
2283 while(num_hash_bits < 32 &&
2284(1LL << num_hash_bits) < num_unique_words) {
2288 lookup->backbone_size = 1 << num_hash_bits;
2292 if(!thin_backbone) {
2306 lookup->lut_word_length,
2318 sfree(thin_backbone);
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
BLAST filtering functions.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
BackboneCell * BackboneCellFree(BackboneCell *cell)
#define PV_TEST(lookup, index, shift)
Test the bit at position 'index' in the PV array bitfield within 'lookup'.
#define PV_ARRAY_MASK
amount to mask off.
void BlastLookupIndexQueryExactMatches(Int4 **backbone, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations)
Add all applicable query offsets to a generic lookup table.
void BlastHashLookupIndexQueryExactMatches(BackboneCell *backbone, Int4 *offsets, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations, TNaLookupHashFunction hash_func, Uint4 mask, Uint4 *pv_array)
Add all applicable query offsets to a hashed lookup table.
#define PV_ARRAY_BYTES
number of BYTES in 'native' type.
#define PV_SET(lookup, index, shift)
Set the bit at position 'index' in the PV array bitfield within 'lookup'.
#define PV_ARRAY_TYPE
The pv_array 'native' type.
#define BLASTERR_MEMORY
System error: out of memory condition.
static Int2 s_NaHashLookupCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence, BlastNaHashLookupTable *lookup, BlastSparseUint1Array *counts, Uint1 max_word_count)
Scan a subject sequence and update word counters, for 16-base words with a scan step of 1.
BlastSmallNaLookupTable * BlastSmallNaLookupTableDestruct(BlastSmallNaLookupTable *lookup)
Free a small nucleotide lookup table.
Int4 BlastNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new nucleotide lookup table.
static Int4 s_BlastSmallNaLookupFinalize(Int4 **thin_backbone, BlastSmallNaLookupTable *lookup, BLAST_SequenceBlk *query)
Pack the data structures comprising a small nucleotide lookup table into their final form.
static Int2 s_FillContigMBTable(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options, Uint1 *counts)
Fills in the hashtable and next_pos fields of BlastMBLookupTable* for the contiguous case.
static Boolean s_HasMaskAtHashEnabled(const QuerySetUpOptions *query_options)
Determine whether mask at hash is enabled from the QuerySetUpOptions.
static NaHashLookupThreadData * NaHashLookupThreadDataFree(NaHashLookupThreadData *th)
BlastMBLookupTable * BlastMBLookupTableDestruct(BlastMBLookupTable *mb_lt)
Deallocate memory used by the Mega BLAST lookup table.
static BlastSeqLoc * s_SeqLocListInvert(const BlastSeqLoc *locations, Int4 length)
Changes the list of locations into a list of the intervals between locations (the inverse).
static Int2 s_ScanSubjectForWordCounts(BlastSeqSrc *seq_src, BlastMBLookupTable *mb_lt, Uint1 *counts, Uint1 max_word_count)
Scan database sequences and count query words that appear in the database.
BlastNaHashLookupTable * BlastNaHashLookupTableDestruct(BlastNaHashLookupTable *lookup)
Free a nucleotide lookup table.
static BlastSparseUint1Array * BlastSparseUint1ArrayFree(BlastSparseUint1Array *array)
static EDiscTemplateType s_GetDiscTemplateType(Int4 weight, Uint1 length, EDiscWordType type)
Convert weight, template length and template type from input options into an MBTemplateType enum.
ELookupTableType BlastChooseNaLookupTable(const LookupTableOptions *lookup_options, Int4 approx_table_entries, Int4 max_q_off, Int4 *lut_width)
choose the type of nucleotide lookup table to be used for a blast search
Int2 BlastMBLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable **mb_lt_ptr, const LookupTableOptions *lookup_options, const QuerySetUpOptions *query_options, Int4 approx_table_entries, Int4 lut_width, BlastSeqSrc *seqsrc)
Create the lookup table for Mega BLAST.
Int4 BlastNaHashLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaHashLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, BlastSeqSrc *seqsrc, Uint4 num_threads)
struct NaHashLookupThreadData NaHashLookupThreadData
static BlastSparseUint1Array * BlastSparseUint1ArrayNew(Uint4 *bitfield, Int8 len)
static Int2 s_NaHashLookupScanSubjectForWordCounts(BlastSeqSrc *seq_src, BlastNaHashLookupTable *lookup, Uint4 in_num_threads, Uint1 max_word_count)
Scan database sequences and count query words that appear in the database.
static void s_BlastNaHashLookupFinalize(BackboneCell *thin_backbone, Int4 *offsets, BlastNaHashLookupTable *lookup)
Pack the data structures comprising a nucleotide lookup table into their final form.
static Int2 s_NaHashLookupFillPV(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaHashLookupTable *lookup)
static Uint4 s_Popcount(Uint4 v)
static Uint1 * BlastSparseUint1ArrayGetElement(BlastSparseUint1Array *array, Int8 index)
static Int4 BlastSparseUint1ArrayGetIndex(BlastSparseUint1Array *array, Int8 index)
#define BLAST2NA_MASK
bitfield used to detect ambiguities in uncompressed nucleotide letters
static Int2 s_FillPV(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options)
static void s_BlastNaLookupFinalize(Int4 **thin_backbone, BlastNaLookupTable *lookup)
Pack the data structures comprising a nucleotide lookup table into their final form.
static Int2 s_NaHashLookupRemovePolyAWords(BlastNaHashLookupTable *lookup)
static Int2 s_MBCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence, BlastMBLookupTable *mb_lt, Uint1 *counts, Uint1 max_word_count)
Scan a subject sequence and update word counters, for 16-base words with a scan step of 1.
#define BITS_PER_NUC
number of bits in a compressed nucleotide letter
static Int2 s_FillDiscMBTable(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options)
Fills in the hashtable and next_pos fields of BlastMBLookupTable* for the discontiguous case.
static NaHashLookupThreadData * NaHashLookupThreadDataNew(Int4 num_threads, BlastNaHashLookupTable *lookup, BlastSeqSrc *seq_src)
struct BlastSparseUint1Array BlastSparseUint1Array
Sparse array of Uint1 implemented with a bitfield.
static Int2 s_RemovePolyAWords(BlastMBLookupTable *mb_lt)
Int4 BlastSmallNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastSmallNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new small nucleotide lookup table.
static Uint4 FNV_hash(Uint1 *seq, Uint4 mask)
BlastNaLookupTable * BlastNaLookupTableDestruct(BlastNaLookupTable *lookup)
Free a nucleotide lookup table.
Routines for creating nucleotide BLAST lookup tables.
#define NA_OFFSETS_PER_HASH
EDiscWordType
General types of discontiguous word templates.
#define NA_WORDS_PER_HASH
static NCBI_INLINE Int4 ComputeDiscontiguousIndex(Uint8 accum, EDiscTemplateType template_type)
Given an accumulator containing packed bases, compute the discontiguous word index specified by templ...
EDiscTemplateType
Enumeration of all discontiguous word templates; the enumerated values encode the weight,...
@ eDiscTemplate_12_18_Optimal
@ eDiscTemplate_11_18_Optimal
@ eDiscTemplateContiguous
@ eDiscTemplate_12_16_Optimal
@ eDiscTemplate_12_16_Coding
@ eDiscTemplate_11_21_Coding
@ eDiscTemplate_11_18_Coding
@ eDiscTemplate_12_21_Coding
@ eDiscTemplate_11_16_Optimal
@ eDiscTemplate_11_21_Optimal
@ eDiscTemplate_12_21_Optimal
@ eDiscTemplate_12_18_Coding
@ eDiscTemplate_11_16_Coding
#define NA_HITS_PER_CELL
maximum number of hits in one lookup table cell
Boolean SBlastFilterOptionsMaskAtHash(const SBlastFilterOptions *filter_options)
Queries whether masking should be done only for the lookup table or for the entire search.
ELookupTableType
Types of the lookup table.
@ eSmallNaLookupTable
lookup table for blastn with small query
@ eNaLookupTable
blastn lookup table
@ eMBLookupTable
megablast lookup table (includes both contiguous and discontiguous megablast)
@ eNaHashLookupTable
used for 16-base words
Boolean Blast_ProgramIsMapping(EBlastProgramType p)
Int4 BlastSeqSrcIteratorNext(const BlastSeqSrc *seq_src, BlastSeqSrcIterator *itr)
Increments the BlastSeqSrcIterator.
BlastSeqSrcIterator * BlastSeqSrcIteratorFree(BlastSeqSrcIterator *itr)
Frees the BlastSeqSrcIterator structure.
BlastSeqSrcIterator * BlastSeqSrcIteratorNewEx(unsigned int chunk_sz)
Allocate and initialize an iterator over a BlastSeqSrc.
void BlastSeqSrcReleaseSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Deallocate individual sequence.
BlastSeqSrc * BlastSeqSrcCopy(const BlastSeqSrc *seq_src)
Copy function: needed to guarantee thread safety.
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Int2 BlastSeqSrcGetSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Retrieve an individual sequence.
#define BLAST_SEQSRC_EOF
No more sequences available.
void BlastSeqSrcResetChunkIterator(BlastSeqSrc *seq_src)
Reset the internal "bookmark" of the last chunk for iteration provided by this object.
Various auxiliary BLAST utility functions.
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Int2 BlastCompressBlastnaSequence(BLAST_SequenceBlk *seq_blk)
Adds a specialized representation of sequence data to a sequence block.
ncbi::TMaskedQueryRegions mask
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static int lookup(const char *name, const struct lookup_int *table)
static const char location[]
@ eBlastEncodingProtein
NCBIstdaa.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n th
Utility functions for lookup table generation.
Int4 ilog2(Int8 x)
Integer base two logarithm.
#define MIN(a, b)
returns smaller of a and b.
Uint1 Boolean
bool replacement for C
#define TRUE
bool replacement for C indicating true.
#define FALSE
bool replacement for C indicating false.
#define ASSERT
macro for assert.
#define MAX(a, b)
returns larger of a and b.
static PCRE2_SIZE * offsets
Structure to hold a sequence.
Int4 length
Length of sequence.
Uint1 * sequence
Sequence used for search (could be translation).
Thin backbone cell for nucleotide lookup table with hashed words.
The lookup table structure used for Mega BLAST.
Int4 num_words_added
Number of words added to the l.t.
Int4 lut_word_length
number of letters in a lookup table word
Int4 pv_array_bts
The exponent of 2 by which pv_array is smaller than the backbone.
BlastSeqLoc * masked_locations
masked locations, only non-NULL for soft-masking.
Int4 * next_pos2
Extra positions for the second template.
Int4 * hashtable2
Array of positions for second template.
Int4 * hashtable
Array of positions.
Int4 num_unique_pos_added
Number of positions added to the l.t.
PV_ARRAY_TYPE * pv_array
Presence vector, used for quick presence check.
Boolean stride
is lookup table created with a stride
Int8 hashsize
= 4^(lut_word_length)
EDiscTemplateType template_type
Type of the discontiguous word template.
Int4 scan_step
Step size for scanning the database.
Int4 longest_chain
Largest number of query positions for a given word.
Int4 word_length
number of exact letter matches that will trigger an ungapped extension
Boolean discontiguous
Are discontiguous words used?
Int4 * next_pos
Extra positions stored here.
Boolean two_templates
Use two templates simultaneously.
EDiscTemplateType second_template_type
Type of the second discontiguous word template.
Int4 template_length
Length of the discontiguous word template.
The basic lookup table structure for blastn searches.
Used to hold a set of positions, mostly used for filtering.
SSeqRange * ssr
location data on the sequence.
struct BlastSeqLoc * next
next in linked list
Structure used as the second argument to functions satisfying the GetSeqBlkFnPtr signature,...
Int4 oid
Oid in BLAST database, index in an array of sequences, etc [in].
EBlastEncoding encoding
Encoding of sequence, i.e.
BLAST_SequenceBlk * seq
Sequence to return, if NULL, it should allocated by GetSeqBlkFnPtr (using BlastSeqBlkNew or BlastSetU...
Complete type definition of Blast Sequence Source Iterator.
Complete type definition of Blast Sequence Source ADT.
Lookup table structure for blastn searches with small queries.
Sparse array of Uint1 implemented with a bitfield.
Uint1 * values
array of values for present indices
Uint4 num_elements
number of values present in the array
Int4 * counts
cumulative number of bits set
Uint4 length
length of the bitfield
Uint4 * bitfield
bitfield with bits set for present indices
Options needed to construct a lookup table. Also needed: query sequence and query length.
Int4 word_size
Determines the size of the lookup table.
Uint1 max_db_word_count
words with larger frequency in the database will be masked in the lookup table, if the db_filter opto...
Boolean db_filter
scan the database and include only words that appear in the database between 1 and 9 times (currently...
EBlastProgramType program_number
indicates blastn, blastp, etc.
Int4 mb_template_type
Type of a discontiguous word template.
Uint4 stride
number of words to skip after collecting each word
Int4 mb_template_length
Length of the discontiguous words.
Structure defining one cell of the compacted lookup table.
Int1 num_offsets[3]
number of offsets for each word if there are fewer than 3
Uint4 words[3]
words stored under this hash value
Int4 offsets[9]
offset locations for each word
Int1 num_words
number of words stored under the same hash value
BlastSeqSrcIterator ** itr
BlastSparseUint1Array ** word_counts
BlastSeqSrcGetSeqArg * seq_arg
structure defining one cell of the compacted lookup table
Options required for setting up the query sequence.
char * filter_string
DEPRECATED, filtering options above.
SBlastFilterOptions * filtering_options
structured options for all filtering offered from algo/blast/core for BLAST.
Int4 left
left endpoint of range (zero based)
Int4 right
right endpoint of range (zero based)
voidp calloc(uInt items, uInt size)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4