maskLeftPlusOne = (
mask<< 1)+1;
77prefixPos[
i] = mask2 & ((
tmp>>1) |
mask) &
S[a1];
80 tmp= ((
S[a1]<<1) | maskLeftPlusOne) &
S[a2];
82 tmp= ((
tmp<<1) | maskLeftPlusOne) &
S[a3];
84suffixPos[
i] = ((((
tmp<<1) | maskLeftPlusOne) &
S[a4]) << 1) | maskLeftPlusOne;
101 for(wordIndex = 0; wordIndex < multiword_items->
numWords; wordIndex++) {
103compositeMask = mask1 + (mask1>>1)+(mask1>>2)+(mask1>>3);
116match_mask + (match_mask>>1) + (match_mask>>2) + (match_mask>>3);
147 Int4recReturnValue1, recReturnValue2;
149 Int4thisPlaceMasked;
155 for(
i= 0;
i< length;
i++) {
156thisPlaceMasked = -inputPatternMasked[
i];
157 if(thisPlaceMasked > 0) {
159 for(j = 0; j < length; j++) {
161tempPatternMask[j] = inputPatternMasked[j];
162tempPattern[j] = inputPattern[j];
164recReturnValue2 = recReturnValue1 =
166 if(recReturnValue1 == -1)
168 for(numPos = 0; numPos <= thisPlaceMasked; numPos++) {
171 for(k = 0; k < length; k++) {
173 for(
t= 0;
t< numPos;
t++) {
175 if(recReturnValue1 >= maxLength)
180inputPatternMasked[recReturnValue1] = tempPatternMask[k];
181inputPattern[recReturnValue1++] = tempPattern[k];
182 if(recReturnValue1 >= maxLength)
185 if(recReturnValue1 >= maxLength)
190&inputPattern[recReturnValue2],
192maxLength - recReturnValue2);
193 if(recReturnValue1 == -1)
195recReturnValue2 += recReturnValue1;
196recReturnValue1 = recReturnValue2;
198 returnrecReturnValue1;
214 Int4returnValue = 0;
215 for(
i= 0;
i< length;
i++) {
216 if(inputPattern[
i])
217returnValue += (1 <<
i);
242 for(wordIndex = 0; wordIndex < multiword_items->
numWords; wordIndex++) {
246bitPattern += (1 <<
i);
248multiword_items->
match_maskL[wordIndex] = bitPattern;
250 for(charIndex = 0; charIndex <
BLASTAA_SIZE; charIndex++) {
251 for(wordIndex = 0; wordIndex < multiword_items->
numWords; wordIndex++) {
256bitPattern = bitPattern | (1 <<
i);
291 Int4placeInWord, placeInWord2;
295 doublepatternWordProbability;
296 doublemost_specific;
307patternWordProbability = 1.0;
308 for(placeIndex = 0, wordIndex = 0, placeInWord=0;
309placeIndex <= numPlacesInPattern; placeIndex++, placeInWord++) {
310 if(placeIndex==numPlacesInPattern || inputPatternMasked[placeIndex] < 0
312multiword_items->
match_maskL[wordIndex] = 1 << (placeInWord-1);
313oneWordSLL = multiword_items->
SLL[wordIndex];
314 for(charIndex = 0; charIndex <
BLASTAA_SIZE; charIndex++) {
316 for(placeInWord2 = 0; placeInWord2 < placeInWord; placeInWord2++) {
317 if((1<< charIndex) &
318inputPatternMasked[placeIndex-placeInWord+placeInWord2])
319oneWordMask |= (1 << placeInWord2);
321oneWordSLL[charIndex] = oneWordMask;
324 if(patternWordProbability < most_specific) {
325most_specific = patternWordProbability;
328 if(placeIndex == numPlacesInPattern)
329extra_items->
spacing[wordIndex++] = 0;
330 else if(inputPatternMasked[placeIndex] < 0) {
331extra_items->
spacing[wordIndex++] = -inputPatternMasked[placeIndex];
334extra_items->
spacing[wordIndex++] = 0;
337patternWordProbability = 1.0;
339patternWordProbability *= (double)
343multiword_items->
numWords= wordIndex;
374 ASSERT(pattern_in && pattern_out && length > 0);
376 for(index=0; index<length; index++)
378 if(pattern_in[index] >=
'a'&& pattern_in[index] <=
'z')
379pattern_out[index] =
toupper(pattern_in[index]);
381pattern_out[index] = pattern_in[index];
392 const intkWildcardThreshold = 30;
401 Int4currentSetMask, prevSetMask;
405 Int4minWildcard, maxWildcard;
413 doublepositionProbability;
415 Int4currentWildcardProduct;
417 Int4wildcardProduct;
419 Int4* whichPositionsByCharacter=
NULL;
425 char* pattern =
NULL;
426 intpattern_length = 0;
436currentWildcardProduct = 1;
439pattern_length = (
int)strlen(pattern_in);
444snprintf(message,
sizeof(message),
"Pattern is too long (%ld but only %ld supported)",
452pattern =
calloc(pattern_length+1,
sizeof(
char));
454pattern_blk->
pattern= pattern;
459 for(charIndex = 0, posIndex = 0; charIndex < pattern_length; charIndex++)
461next_char = pattern[charIndex];
462 if(next_char ==
'\0'|| next_char ==
'\r'|| next_char ==
'\n')
464 if(next_char ==
'-'|| next_char ==
'.'||
465next_char ==
'>'|| next_char ==
' '|| next_char ==
'<')
467 if( next_char !=
'['&& next_char !=
'{') {
468 if(next_char ==
'x'|| next_char==
'X') {
471 if(pattern[charIndex+1] ==
'(') {
473secondIndex = charIndex;
476 while(pattern[secondIndex] !=
','&&
477pattern[secondIndex] !=
')')
479 if(pattern[secondIndex] ==
')') {
484positionProbability = 1;
487sscanf(&pattern[++charIndex],
"%d,%d",
488&minWildcard, &maxWildcard);
489maxWildcard = maxWildcard - minWildcard;
490currentWildcardProduct *= (maxWildcard + 1);
491 if(currentWildcardProduct > wildcardProduct)
492wildcardProduct = currentWildcardProduct;
494 while(minWildcard-- > 0) {
505 if(maxWildcard != 0) {
517 while(pattern[++charIndex] !=
')') ;
523positionProbability =1;
527 if(next_char ==
'U') {
529positionProbability = 1;
533prevSetMask = currentSetMask;
535charSetMask = (1 << kOrder[(
Uint1)next_char]);
536 if(!(prevSetMask & currentSetMask))
538currentWildcardProduct = 1;
539positionProbability =
544 if(next_char ==
'[') {
546positionProbability = 0;
549 while((next_char=pattern[++charIndex]) !=
']') {
550 if((next_char <
'A') || (next_char >
'Z') || (next_char ==
'\0')) {
552 "pattern description has a non-alphabetic" 553 "character inside a bracket");
558charSetMask | (1 << kOrder[(
Uint1)next_char]);
559positionProbability +=
562prevSetMask = currentSetMask;
563currentSetMask = charSetMask;
564 if(!(prevSetMask & currentSetMask))
566currentWildcardProduct = 1;
571positionProbability = 1;
572 while((next_char=pattern[++charIndex]) !=
'}') {
573charSetMask = charSetMask -
574(charSetMask & (1 << kOrder[(
Uint1)next_char]));
575positionProbability -=
578prevSetMask = currentSetMask;
579currentSetMask = charSetMask;
580 if(!(prevSetMask & currentSetMask))
582currentWildcardProduct = 1;
586 if(pattern[charIndex+1] ==
'(') {
588numIdentical = atoi(&pattern[++charIndex]);
590 while(pattern[++charIndex] !=
')') ;
591 while((numIdentical--) > 0) {
604 "Pattern is too long");
617 for(charIndex = 0; charIndex < posIndex; charIndex++) {
623 for(secondIndex = charIndex + 1; secondIndex < posIndex;
628 for(; secondIndex < posIndex; secondIndex++, charIndex++) {
632posIndex = charIndex;
635localPattern[posIndex-1] = 1;
639 for(charIndex = 0; charIndex < posIndex; charIndex++) {
640tempInputPatternMasked[charIndex] =
642tempPosIndex = posIndex;
649 for(charIndex = 0; charIndex < tempPosIndex; charIndex++)
651tempInputPatternMasked[charIndex];
669 for(charIndex = 0; charIndex <
BLASTAA_SIZE; charIndex++) {
671 for(charSetMask = 0; charSetMask < (
Uint4)posIndex; charSetMask++) {
673thisMask |= (1 << charSetMask);
675whichPositionsByCharacter[charIndex] = thisMask;
681 if(wildcardProduct > kWildcardThreshold) {
683 "Due to variable wildcards pattern is likely to " 684 "occur too many times in a single sequence\n");
746*offset_ptr = subject_blk->
length;
749kIsDna, pattern_blk);
752 for(index = 0; index < twiceNumHits; index += 2) {
753offset_pairs[
count].phi_offsets.s_start = hitArray[index+1];
754offset_pairs[
count].phi_offsets.s_end = hitArray[index];
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
Int2 Blast_MessageWrite(Blast_Message **blast_msg, EBlastSeverity severity, int context, const char *message)
Writes a message to a structure.
const int kBlastMessageNoContext
Declared in blast_message.h as extern const.
@ ePhiNaLookupTable
nucleotide lookup table for phi-blast
@ ePhiLookupTable
protein lookup table specialized for phi-blast
Blast_ResFreq * Blast_ResFreqFree(Blast_ResFreq *rfp)
Deallocates Blast_ResFreq and prob0 element.
Int2 Blast_ResFreqStdComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Calculates residues frequencies given a standard distribution.
Blast_ResFreq * Blast_ResFreqNew(const BlastScoreBlk *sbp)
Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp.
Various auxiliary BLAST utility functions.
#define NCBI2NA_UNPACK_BASE(x, N)
Macro to extract base N from a byte x (N >= 0, N < 4)
ncbi::TMaskedQueryRegions mask
const Uint1 IUPACNA_TO_NCBI4NA[]
Translates between iupacna and ncbi4na.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
unsigned int
A callback function used to compare two keys in a database.
if(yy_accept[yy_current_state])
Uint1 Boolean
bool replacement for C
#define ASSERT
macro for assert.
@ eVeryLong
Is pattern too long for a simple multi-word processing?
@ eMultiWord
Does pattern consist of multiple words?
@ eOneWord
Does pattern consist of a single word?
#define PHI_ASCII_SIZE
Size of ASCII alphabet.
Int4 FindPatternHits(Int4 *hitArray, const Uint1 *seq, Int4 len, Boolean is_dna, const SPHIPatternSearchBlk *patternSearch)
Find the places where the pattern matches seq; 3 different methods are used depending on the length o...
#define PHI_BITS_PACKED_PER_WORD
Number of bits packed in a word.
#define PHI_MAX_HIT
Maximal size of an array of pattern hits.
#define PHI_MAX_PATTERN_LENGTH
Threshold pattern length.
Auxiliary functions for finding pattern matches in sequence (PHI-BLAST), that are used in multiple so...
static void s_PackLongPattern(Int4 numPlaces, Uint1 *inputPattern, SPHIPatternSearchBlk *pattern_blk)
Pack the bit representation of the inputPattern into the array pattern_blk->match_maskL.
static Int4 s_NumOfOne(Int4 a)
Return the number of 1 bits in the base 2 representation of a number a.
static void s_FindPrefixAndSuffixPos(Int4 *S, Int4 mask, Int4 mask2, Uint4 *prefixPos, Uint4 *suffixPos)
Set up matches for words that encode 4 DNA characters; figure out for each of 256 possible DNA 4-mers...
static SPHIPatternSearchBlk * s_PatternSearchItemsInit()
Allocates the SPHIPatternSearchBlk structure.
static void s_InitDNAPattern(SPHIPatternSearchBlk *pattern_blk)
Initialize mask and other arrays for DNA patterns.
static Int4 s_PackPattern(Uint1 *inputPattern, Int4 length)
Pack the next length bytes of inputPattern into a bit vector where the bit is 1 if and only if the by...
static Int4 s_ExpandPattern(Int4 *inputPatternMasked, Uint1 *inputPattern, Int4 length, Int4 maxLength)
Determine the length of the pattern after it has been expanded for efficient searching.
Int2 SPHIPatternSearchBlkNew(char *pattern_in, Boolean is_dna, BlastScoreBlk *sbp, SPHIPatternSearchBlk **pattern_blk_out, Blast_Message **error_msg)
Initialize the pattern items structure, serving as a "pseudo" lookup table in a PHI BLAST search.
SPHIPatternSearchBlk * SPHIPatternSearchBlkFree(SPHIPatternSearchBlk *lut)
Deallocate memory for the PHI BLAST lookup table.
static void s_PackVeryLongPattern(Int4 *inputPatternMasked, Int4 numPlacesInPattern, SPHIPatternSearchBlk *pattern_blk)
Sets up fields in SPHIPatternSearchBlk structure when pattern is very long.
const int kMaskAaAlphabetBits
Masks all bits corresponding to the aminoacid alphabet, i.e.
Int4 PHIBlastScanSubject(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *query_blk, const BLAST_SequenceBlk *subject_blk, Int4 *offset_ptr, BlastOffsetPair *offset_pairs, Int4 array_size)
Implementation of the ScanSubject function for PHI BLAST.
static void s_MakePatternUpperCase(char *pattern_in, char *pattern_out, int length)
Convert the string representation of a PHIblast pattern to uppercase.
Pseudo lookup table structure and database scanning functions used in PHI-BLAST.
Structure to hold a sequence.
Int4 length
Length of sequence.
Uint1 * sequence
Sequence used for search (could be translation).
Structure used for scoring calculations.
Structure to hold a message from the core of the BLAST engine.
Stores the letter frequency of a sequence or database.
double * prob
letter probs, (possible) non-zero offset.
Wrapper structure for different types of BLAST lookup tables.
void * lut
Pointer to the actual lookup table structure.
ELookupTableType lut_type
What kind of a lookup table it is?
Auxiliary items needed for a DNA pattern search with pattern containing multiple words.
Uint4 DNAprefixSLL[100][256]
Where prefix of DNA 4-mer matches pattern, for multiple-word patterns.
Uint4 DNAsuffixSLL[100][256]
Where suffix of DNA 4-mer matches pattern, for multiple-word patterns.
Structure containing auxiliary items needed for a DNA search with a pattern that fits in a single wor...
Uint4 * DNAwhichPrefixPosPtr
Prefix position array for DNA patterns.
Uint4 DNAwhichSuffixPositions[256]
Where suffix of DNA 4-mer matches pattern.
Uint4 * DNAwhichSuffixPosPtr
Suffix position array for DNA patterns.
Uint4 DNAwhichPrefixPositions[256]
Where prefix of DNA 4-mer matches pattern.
Auxiliary items needed for a PHI BLAST search with pattern containing multiple words.
Int4 match_maskL[100]
Bit mask representation of input pattern for long patterns.
SExtraLongPatternItems * extra_long_items
Additional items necessary if pattern contains pieces longer than a word.
SDNALongPatternItems * dna_items
Additional items necessary for a DNA pattern.
Int4 SLL[100][256]
For each letter in the alphabet and each word in the masked pattern representation,...
Int4 inputPatternMasked[(30 *11)]
Masked input pattern.
Int4 bitPatternByLetter[256][11]
Which positions can a character occur in for long patterns.
Int4 numWords
Number of words needed to hold the bit representation of the pattern.
Structure containing all auxiliary information needed in a pattern search.
SShortPatternItems * one_word_items
Items necessary when pattern fits in one word.
EPatternType flagPatternLength
Indicates if the whole pattern fits in 1 word, each of several parts of the pattern fit in a word,...
double patternProbability
Probability of this letter combination.
Int4 minPatternMatchLength
Minimum length of string to match this pattern.
char * pattern
Pattern used, saved here for error reporting.
SLongPatternItems * multi_word_items
Additional items, when pattern requires multiple words.
Auxiliary items needed for a PHI BLAST search with a pattern that fits in a single word.
Int4 * whichPositionPtr
Array of positions where pattern letters should match, for a single word of the pattern.
SDNAShortPatternItems * dna_items
Additional items for a DNA search.
Int4 match_mask
Bit mask representation of input pattern for patterns that fit in a word.
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
voidp calloc(uInt items, uInt size)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4