data_type_sz)
69 void** retval =
NULL;
72retval = (
void**)
malloc(
sizeof(
void*) * ncols);
77 for(
i= 0;
i< ncols;
i++) {
78retval[
i] = (
void*)
calloc(nrows, data_type_sz);
95 for(
i= 0;
i< ncols;
i++) {
106 #define DEFINE_COPY_MATRIX_FUNCTION(type) \ 107 void _PSICopyMatrix_##type(type** dest, type** src, \ 108 unsigned int ncols, unsigned int nrows) \ 110 unsigned int i = 0; \ 111 unsigned int j = 0; \ 116 for (i = 0; i < ncols; i++) { \ 117 for (j = 0; j < nrows; j++) { \ 118 dest[i][j] = src[i][j]; \ 135 if( !msa || !msa->dimensions || !msa->data ) {
149(
void*) msa->dimensions,
154msa->dimensions->query_length,
156 if( !retval->
data) {
160 for(s = 0; s < msa->dimensions->num_seqs + 1; s++) {
161 for(p = 0; p < msa->dimensions->query_length; p++) {
163retval->
data[s][p].
letter= msa->data[s][p].letter;
175 for(s = 0; s < msa->dimensions->num_seqs + 1; s++) {
211 unsigned intretval = 0;
229 #ifdef DEBUG_PSSM_ENGINE 236PrintMsa(
const char* filename,
const PSIMsa* msa)
241 fp= fopen(filename,
"w");
242PrintMsaFP(
fp, msa);
247PrintMsaFP(FILE*
fp,
const PSIMsa* msa)
254fprintf(
fp,
"%3d\tGI=%10d\tEvalue=%2.0le\tBitScore=%4.1f\t",
i,
255msa->seqinfo[
i].gi,
256msa->seqinfo[
i].evalue,
257msa->seqinfo[
i].bit_score);
265fprintf(
fp,
"\n");
278fprintf(
fp,
"%3d\t",
i);
280fprintf(
fp,
"NOT USED\n");
291fprintf(
fp,
"\n");
296__printPackedMsa(
const char* filename,
const _PSIPackedMsa* msa)
301 fp= fopen(filename,
"w");
302__printPackedMsaFP(
fp, msa);
335 if( !retval->
cell) {
360 if( !retval->
query) {
434retval->
ncols= query_length;
435retval->
nrows= alphabet_size;
440 if( !retval->
pssm) {
474 if(pssm_data->
pssm) {
475pssm_data->
pssm= (
int**)
512 if( !retval->
size) {
524 for(
i= 0;
i< query_length;
i++) {
534 if( !aligned_blocks ) {
538 if(aligned_blocks->
size) {
546 sfree(aligned_blocks);
550 #define EFFECTIVE_ALPHABET 20 578 if( !retval->
sigma) {
629 if( !seq_weights ) {
641 if(seq_weights->
sigma) {
691msa->
query[p] == kGapResidue) {
717 if(msa->
cell[s][p].
letter== kGapResidue) {
732 if(msa->
cell[s][p].
letter== kGapResidue) {
769found_aligned_sequence =
TRUE;
770 if(msa->
cell[s][p].
letter!= kGapResidue) {
771found_non_gap_residue =
TRUE;
776 if( !found_aligned_sequence ) {
779 if( !found_non_gap_residue ) {
841 if( !ignore_unaligned_positions ) {
875 if(cd_msa->
query[p] == kGapResidue) {
887 if(!cd_msa->
msa[s][p].
data) {
895 for(k = 0; k < alphabet_size; k++) {
943 doublemax_percent_identity);
993 Uint4kQueryLength = 0;
994 Uint4kNumberOfSeqs = 0;
1007 for(p = 0; p < kQueryLength; p++) {
1012 for(s = 0; s < kNumberOfSeqs; s++) {
1015 for(p = 0; p < kQueryLength; p++, pos++) {
1052fprintf(stderr,
"Position: %d - State: %s\n", position,
1054fprintf(stderr,
"\tstart: %d\n", traits->
start);
1056fprintf(stderr,
"\tn_x_residues: %d\n", traits->
n_x_residues);
1057fprintf(stderr,
"\tn_identical: %d\n", traits->
n_identical);
1071traits->
start= position;
1079 doublemax_percent_identity)
1089 const doublepercent_identity =
1091 if(percent_identity >= max_percent_identity) {
1092 const unsigned intalign_stop =
1096traits->
start, align_stop);
1189 doublemax_percent_identity)
1202 if( seq_index1 == seq_index2 ||
1209seq1 = msa->
data[seq_index1];
1210seq2 = msa->
data[seq_index2];
1214 for(p = 0; p < kQueryLength; p++, seq1++, seq2++) {
1229 if(!kPos1Aligned && !kPos2Aligned) {
1231max_percent_identity);
1237seq1->
letter!= kXResidue && seq2->
letter!= kXResidue;
1246 if(neither_is_X && (kPos2Aligned && seq1->
is_aligned) &&
1253max_percent_identity);
1302 if( !msa || !aligned_blocks ) {
1327 ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1329sequence_position = msa->
cell[seq_index];
1332sequence_position[
prev].
letter!= kGapResidue) {
1339 if( !sequence_position[curr].
is_aligned) {
1361 ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1363sequence_position = msa->
cell[seq_index];
1367sequence_position[
last].
letter!= kGapResidue) {
1371 for(curr =
last- 1; curr >= 0; curr--,
last--) {
1373 if( !sequence_position[curr].
is_aligned) {
1391 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS 1399 ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1401sequence_position = msa->
cell[seq_index];
1404 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS 1406sequence_position[
i].
letter!= kGapResidue) {
1425 Uint4kQueryLength = 0;
1432 for(
i= 0;
i< kQueryLength;
i++) {
1444 for(
i= 0;
i< kQueryLength;
i++) {
1446 if(msa->
query[
i] == kXResidue) {
1449 for(idx = 0; idx <
i; idx++) {
1451msa->
query[idx] != kXResidue) {
1452aligned_blocks->
size[idx]--;
1457msa->
query[idx] != kXResidue) {
1458aligned_blocks->
size[idx]--;
1529 Booleannsg_compatibility_mode);
1544 Booleannsg_compatibility_mode);
1554 Booleannsg_compatibility_mode,
1563 Uint4kQueryLength = 0;
1566 const Uint4kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1567 Uint4last_calc_pos = 0;
1569 if( !msa || !aligned_blocks || !seq_weights ) {
1575 if( !aligned_seqs || !prev_pos_aligned_seqs ) {
1580 for(pos = 0; pos < kQueryLength; pos++) {
1583 if(aligned_blocks->
size[pos] == 0 ||
1591 if(aligned_seqs->
num_used<= kExpectedNumMatchingSeqs) {
1594last_calc_pos = pos;
1596 if(last_calc_pos != pos - 1 ||
1601memset((
void*)seq_weights->
row_sigma, 0,
1605aligned_seqs, seq_weights);
1608seq_weights->
sigma[pos] = seq_weights->
sigma[pos-1];
1627nsg_compatibility_mode);
1632 #ifndef PSI_IGNORE_GAPS_IN_COLUMNS 1635nsg_compatibility_mode);
1646 sfree(sum_weights);
1656 Uint4kQueryLength = 0;
1659 double* sum_weights =
NULL;
1663 if( !cd_msa || !seq_weights || !sbp || !options) {
1674 if( !sum_weights) {
1683 for(pos = 0; pos < kQueryLength; pos++) {
1684 doubletotal_observations = 0.0;
1689memset(sum_weights, 0, sbp->
alphabet_size*
sizeof(
double));
1703total_observations +=
1709 for(residue = 0; residue < sbp->
alphabet_size; residue++) {
1710sum_weights[residue] +=
1718 if(total_observations > 0.0 && query_residue != kXResidue
1719&& sum_weights[query_residue] == 0.0) {
1721sum_weights[query_residue] = 1.0;
1722total_observations += 1.0;
1726 if(total_observations > 0.0) {
1728 for(residue = 0; residue < sbp->
alphabet_size; residue++) {
1730sum_weights[residue] / total_observations;
1741total_observations);
1783 ASSERT(position < msa->dimensions->query_length);
1794 Uint4num_distinct_residues_for_column = 0;
1795 Uint4num_local_std_letters = 0;
1798 ASSERT(i < msa->dimensions->query_length);
1802 for(asi = 0; asi < aligned_seqs->
num_used; asi++) {
1803 const Uint4kSeqIdx = aligned_seqs->
data[asi];
1806 if(residue_counts_for_column[kResidue] == 0) {
1807num_distinct_residues_for_column++;
1808 if(kResidue != kGapResidue && kResidue != kXResidue)
1809num_local_std_letters++;
1811residue_counts_for_column[kResidue]++;
1814sigma += num_distinct_residues_for_column;
1817 if(num_distinct_residues_for_column > 1) {
1820distinct_residues_found =
TRUE;
1825 for(asi = 0; asi < aligned_seqs->
num_used; asi++) {
1826 const Uint4seq_idx = aligned_seqs->
data[asi];
1834(residue_counts_for_column[residue] *
1835num_distinct_residues_for_column) );
1840seq_weights->
sigma[position] = sigma;
1842 if(distinct_residues_found) {
1843 doubleweight_sum = 0.0;
1845 for(asi = 0; asi < aligned_seqs->
num_used; asi++) {
1846 const Uint4seq_idx = aligned_seqs->
data[asi];
1855 for(asi = 0; asi < aligned_seqs->
num_used; asi++) {
1856 const Uint4seq_idx = aligned_seqs->
data[asi];
1864 for(asi = 0; asi < aligned_seqs->
num_used; asi++) {
1865 const Uint4seq_idx = aligned_seqs->
data[asi];
1867(1.0/(double) aligned_seqs->
num_used);
1888 for(asi = 0; asi < aligned_seqs->
num_used; asi++) {
1889 const Uint4seq_idx = aligned_seqs->
data[asi];
1896 if(residue != kGapResidue) {
1908 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS 1914 ASSERT(position < msa->dimensions->query_length);
1920 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS 1922msa->
cell[
i][position].
letter!= kGapResidue) {
1934 Booleannsg_compatibility_mode)
1940 const Uint4kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1972 #define SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 0 1979 Booleannsg_compatibility_mode)
1983 const Uint4kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1985 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 1994 doublerunning_total = 0.0;
2005 for(residue = 0; residue < msa->
alphabet_size; residue++) {
2006running_total += seq_weights->
match_weights[pos][residue];
2009 if(running_total < 0.99 || running_total > 1.01) {
2012 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 2013check_performed =
TRUE;
2017 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 2020 if( !check_performed &&
2021!nsg_compatibility_mode ) {
2022 assert(!
"Did not perform sequence weights check");
2050 intcolumnNumber,
intqueryLength,
2051 const double*expno);
2062 const double*backgroundProbabilities,
2063 const doubleobservations);
2065 #define MAX_IND_OBSERVATIONS 400 2066 #define PSEUDO_MAX 1000000 2076 Booleannsg_compatibility_mode,
2084 const doublekZeroObsPseudo = 30.0;
2089 if( !msa || !seq_weights || !sbp || !aligned_blocks || !internal_pssm ) {
2099 doublecolumnCounts = 0.0;
2100 doubleobservations = 0.0;
2101 doublepseudoWeight;
2110 if(0 == pseudo_count)
2113columnCounts = pseudo_count;
2116pseudoWeight = kZeroObsPseudo;
2120pseudoWeight = columnCounts;
2135 doublepseudo = 0.0;
2137 const doublekBeta = pseudoWeight;
2138 doublenumerator = 0.0;
2139 doubledenominator = 0.0;
2140 doubleqOverPEstimate = 0.0;
2150freq_ratios->
data[
r][
i]);
2160denominator = observations + kBeta;
2162 if(nsg_compatibility_mode && denominator == 0.0) {
2165 ASSERT(denominator != 0.0);
2167qOverPEstimate = numerator/denominator;
2171internal_pssm->
freq_ratios[p][
r] = qOverPEstimate *
2195 const doublekZeroObsPseudo = 30.0;
2198 const double* backgroundProbabilities =
NULL;
2200 if( !cd_msa || !seq_weights || !sbp || !internal_pssm || pseudo_count < 0) {
2205 if( !freq_ratios ) {
2210 if( !backgroundProbabilities ) {
2216 doublecolumnCounts = 0.0;
2217 doubleobservations = 0.0;
2218 doublepseudoWeight;
2220 if(cd_msa->
query[p] != kXResidue)
2224observations =
MAX(0.0,
2227 if(0 == pseudo_count)
2230columnCounts = pseudo_count;
2234pseudoWeight = kZeroObsPseudo;
2238pseudoWeight = columnCounts;
2245 if(cd_msa->
query[p] == kXResidue ||
2253 doublepseudo = 0.0;
2255 const doublekBeta = pseudoWeight;
2256 doublenumerator = 0.0;
2257 doubledenominator = 0.0;
2258 doubleqOverPEstimate = 0.0;
2265freq_ratios->
data[
r][
i]);
2275denominator = observations + kBeta;
2277 ASSERT(denominator != 0.0);
2279qOverPEstimate = numerator/denominator;
2283internal_pssm->
freq_ratios[p][
r] = qOverPEstimate *
2301 const double* std_prob,
2307 double* retval =
NULL;
2311 if( !std_prob || !score_mat ) {
2315retval = (
double*)
calloc(query_length,
sizeof(
double));
2320 for(p = 0; p < query_length; p++) {
2322 doubleinfo_sum = 0.0;
2324 for(
r= 0;
r< alphabet_sz;
r++) {
2328 doubleexponent = exp(score *
lambda);
2329 double tmp= std_prob[
r] * exponent;
2334retval[p] = info_sum;
2342 double** freq_ratios,
2343 const double* std_prob,
2347 double* retval =
NULL;
2351 if( !std_prob || !freq_ratios ) {
2355retval = (
double*)
calloc(query_length,
sizeof(
double));
2360 for(p = 0; p < query_length; p++) {
2362 doubleinfo_sum = 0.0;
2364 for(
r= 0;
r< alphabet_sz;
r++) {
2369 doubleqOverPEstimate = freq_ratios[p][
r] / std_prob[
r];
2377retval[p] = info_sum;
2395 const double* std_probs)
2404 doubleideal_lambda = 0.0;
2406 if( !internal_pssm || !sbp || !std_probs )
2413 for(
i= 0;
i< internal_pssm->
ncols;
i++) {
2421 doubleqOverPEstimate = 0.0;
2430 if(is_unaligned_column && qOverPEstimate != 0.0) {
2431is_unaligned_column =
FALSE;
2435 if(qOverPEstimate == 0.0 || std_probs[j] <
kEpsilon) {
2438 double tmp=
log(qOverPEstimate)/ideal_lambda;
2443 if( (j == kXResidue || j == kStarResidue) &&
2450 if(is_unaligned_column) {
2455 if(freq_ratios->
data[kResidue][j] != 0.0) {
2482 const double* std_probs,
2488 int** scaled_pssm =
NULL;
2489 int** pssm =
NULL;
2491 doublefactor_low = 1.0;
2492 doublefactor_high = 1.0;
2493 doubleideal_lambda = 0.0;
2495 doublenew_lambda = 0.0;
2498 Uint4query_length = 0;
2501 if( !internal_pssm || !sbp || !
query|| !std_probs )
2508pssm = internal_pssm->
pssm;
2510query_length = internal_pssm->
ncols;
2517 for(
i= 0;
i< internal_pssm->
ncols;
i++) {
2518 for(j = 0; j < internal_pssm->
nrows; j++) {
2532 if(new_lambda > ideal_lambda) {
2535factor = factor_high;
2538first_time =
FALSE;
2540 if(too_high ==
FALSE) {
2543factor_high += (factor_high - 1.0);
2544factor = factor_high;
2546}
else if(new_lambda > 0) {
2550factor = factor_low;
2552first_time =
FALSE;
2554 if(too_high ==
TRUE) {
2557factor_low += (factor_low - 1.0);
2558factor = factor_low;
2570factor = (factor_high + factor_low)/2;
2572 for(
i= 0;
i< internal_pssm->
ncols;
i++) {
2573 for(j = 0; j < internal_pssm->
nrows; j++) {
2588 if(new_lambda > ideal_lambda) {
2589factor_low = factor;
2591factor_high = factor;
2602 doublescaling_factor)
2616scaling_factor,
TRUE, sbp);
2620internal_pssm->
ncols, internal_pssm->
nrows);
2637 for(
i= 0;
i< length;
i++) {
2638 if(seq[
i] != kXResidue) {
2650 const double* std_probs,
2655 Uint4alphabet_size = 0;
2657 Uint4effective_length = 0;
2673 if(alphabet_size <= 0) {
2681 for(p = 0; p < query_length; p++) {
2682 if(
query[p] == kXResidue) {
2685 for(
r= 0;
r< alphabet_size;
r++) {
2686 const int kScore= pssm[p][aa_alphabet[
r]];
2699 if( !score_freqs ) {
2703score_freqs->
obs_min= min_score;
2704score_freqs->
obs_max= max_score;
2705 for(p = 0; p < query_length; p++) {
2706 if(
query[p] == kXResidue) {
2710 for(
r= 0;
r< alphabet_size;
r++) {
2711 const int kScore= pssm[p][aa_alphabet[
r]];
2719(std_probs[aa_alphabet[
r]]/effective_length);
2724 for(s = min_score; s <= max_score; s++) {
2735 const double* std_probs,
2770contains_aligned_regions =
TRUE;
2775 if( !contains_aligned_regions ) {
2782 unsigned intseq_index,
2796sequence_position = msa->
data[seq_index];
2797 for(
i= start;
i< stop;
i++) {
2798sequence_position[
i].
letter= 0;
2819 if( !diagnostics || !msa || !aligned_block || !seq_weights ||
2820!internal_pssm || !internal_pssm->
freq_ratios) {
2876(seq_weights->
sigma[p] / aligned_block->
size[p] - 1);
2884 if(diagnostics->
sigma) {
2886diagnostics->
sigma[p] = seq_weights->
sigma[p];
2920 if( !diagnostics || !cd_msa || !seq_weights ||
2921!internal_pssm || !internal_pssm->
freq_ratios) {
3001probabilities[c] = posSearch->
match_weights[columnNumber][charOrder[c]];
3015 double*probabilitiesToReturn,
3016 doublestandardWeight,
3017 const double*standardProbabilities,
3018 doubleobservations)
3026intermediateSums[c] =
3027(initialProbabilities[c] * observations) +
3028(standardProbabilities[c] * standardWeight);
3029overallSum += intermediateSums[c];
3032probabilitiesToReturn[c] = intermediateSums[c]/overallSum;
3046 const double*backgroundProbabilities)
3054returnValue += (newDistribution[c] *
3055 log(newDistribution[c]/backgroundProbabilities[c]));
3059 return(returnValue);
3069 const double*backgroundProbabilities)
3073 doubleweighted_sum;
3080weighted_sum += exp(j*
log(1.0-backgroundProbabilities[k]));
3088 const double*backgroundProbabilities,
3089 const doubleobservations)
3093 doublerelativeEntropy;
3095 doublepseudoDenominator;
3098 const doublekPseudoMult = 500.0;
3099 const doublekPseudoNumerator = 0.0457;
3100 const doublekPseudoExponent = 0.8;
3101 const doublekPseudoSmallInitial = 5.5;
3106&(columnProbabilitiesAdjusted[0]),
3107kPseudoSmallInitial,
3108backgroundProbabilities, observations);
3110backgroundProbabilities);
3111pseudoDenominator = pow(relativeEntropy, kPseudoExponent);
3112alpha = kPseudoNumerator/pseudoDenominator;
3114returnValue = kPseudoMult * alpha/ (1- alpha);
3118 return(returnValue);
3123 intcolumnNumber,
intqueryLength,
3124 const double*expno)
3130 inttotalDistinctCounts;
3132 doubleaveDistinctAA;
3133 intcolumnsAccountedFor;
3149columnsAccountedFor = 0;
3150totalDistinctCounts = 0;
3151 while(columnsAccountedFor < halfNumColumns) {
3155 if(columnsAccountedFor > halfNumColumns) {
3156totalDistinctCounts -=
3157((columnsAccountedFor - halfNumColumns) * k);
3158columnsAccountedFor = halfNumColumns;
3162aveDistinctAA = ((double) totalDistinctCounts)/
3163((double) columnsAccountedFor);
3172 i-(expno[
i]-aveDistinctAA)/(expno[
i]-expno[
i-1]);
3174indep =
MAX(0,indep - 1);
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Int2 DynamicUint4Array_Append(SDynamicUint4Array *arr, Uint4 element)
Append a Uint4 to the dynamic array structure.
SDynamicUint4Array * DynamicUint4Array_Dup(const SDynamicUint4Array *src)
Make a deep copy of the src dynamic array.
Int4 DynamicUint4Array_Copy(SDynamicUint4Array *dest, const SDynamicUint4Array *src)
Make a shallow copy of the src dynamic array into the dest dynamic array.
SDynamicUint4Array * DynamicUint4ArrayNewEx(Uint4 init_num_elements)
Allocate a dynamic array of Uint4 with an initial size of specified as an argument to the function.
Boolean DynamicUint4Array_AreEqual(const SDynamicUint4Array *a, const SDynamicUint4Array *b)
Compares dynamic arrays a and b for equality of its contents.
SDynamicUint4Array * DynamicUint4ArrayFree(SDynamicUint4Array *arr)
Deallocates a dynamic array structure.
Declarations for various dynamic array types.
int Kappa_impalaScaling(Kappa_posSearchItems *posSearch, Kappa_compactSearchItems *compactSearch, double scalingFactor, Boolean doBinarySearch, BlastScoreBlk *sbp)
Copied from posit2.c.
Kappa_compactSearchItems * Kappa_compactSearchItemsNew(const Uint1 *query, unsigned int queryLength, BlastScoreBlk *sbp)
Creates a new Kappa_compactSearchItems structure.
Kappa_posSearchItems * Kappa_posSearchItemsFree(Kappa_posSearchItems *posSearch)
Deallocates the Kappa_posSearchItems structure.
Kappa_compactSearchItems * Kappa_compactSearchItemsFree(Kappa_compactSearchItems *compactSearch)
Deallocates the Kappa_compactSearchItems structure.
Kappa_posSearchItems * Kappa_posSearchItemsNew(unsigned int queryLength, const char *matrix_name, int **posPrivateMatrix, double **posFreqs)
Allocates a new Kappa_posSearchItems structure.
Port of posit.h structures and impalaScaling for implementing composition based statistics for PSI-BL...
static int s_PSIValidateAlignedColumns(const _PSIMsa *msa)
Validate that there are no unaligned columns or columns which only contain gaps in the multiple seque...
static NCBI_INLINE void _handleEitherAlignedNeitherX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state, Uint4 position)
Handle the event when either position is aligned and neither is X.
static NCBI_INLINE void _handleEitherAlignedEitherX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state)
Handle the event when either position is aligned and either is X.
static NCBI_INLINE void _handleNeitherAligned(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state, _PSIPackedMsa *msa, Uint4 seq_index, double max_percent_identity)
Handles neither is aligned event FIXME: document better.
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
static void _PSIGetLeftExtents(const _PSIMsa *msa, Uint4 seq_index)
Computes the left extents for the sequence identified by seq_index.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
static void _PSICalculateMatchWeights(const _PSIMsa *msa, Uint4 position, const SDynamicUint4Array *aligned_seqs, _PSISequenceWeights *seq_weights)
Calculate the weighted observed sequence weights.
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
static int s_PSIValidateNoGapsInQuery(const _PSIMsa *msa)
Validate that there are no gaps in the query sequence.
void _PSIStructureGroupCustomization(_PSIMsa *msa)
Enable NCBI structure group customization to discard the query sequence, as this really isn't the res...
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
static void _PSISpreadGapWeights(const _PSIMsa *msa, _PSISequenceWeights *seq_weights, Boolean nsg_compatibility_mode)
Uses disperse method of spreading the gap weights.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
static void _PSIGetAlignedSequencesForPosition(const _PSIMsa *msa, Uint4 position, SDynamicUint4Array *aligned_sequences)
Populates the array aligned_sequences with the indices of the sequences which are part of the multipl...
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
unsigned int _PSIPackedMsaGetNumberOfAlignedSeqs(const _PSIPackedMsa *msa)
Retrieve the number of aligned sequences in the compact multiple sequence alignment.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSISaveDiagnostics(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_block, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the PSSM.
static int s_PSIValidateParticipatingSequences(const _PSIMsa *msa)
Verify that after purging biased sequences in multiple sequence alignment there are still sequences p...
static void s_fillColumnProbabilities(double *probabilities, const _PSISequenceWeights *posSearch, Int4 columnNumber)
Reorders in the same manner as returned by Blast_GetMatrixBackgroundFreq this function is a copy of p...
double * _PSICalculateInformationContentFromScoreMatrix(Int4 **score_mat, const double *std_prob, const Uint1 *query, Uint4 query_length, Uint4 alphabet_sz, double lambda)
Calculates the information content from the scoring matrix.
static int _PSICheckSequenceWeights(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, Boolean nsg_compatibility_mode)
Verifies that the sequence weights for each column of the PSSM add up to 1.0.
const double kPosEpsilon
minimum return value of s_computeRelativeEntropy
Blast_ScoreFreq * _PSIComputeScoreProbabilities(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, const BlastScoreBlk *sbp)
Compute the probabilities for each score in the PSSM.
static double s_effectiveObservations(const _PSIAlignedBlock *align_blk, const _PSISequenceWeights *seq_weights, int columnNumber, int queryLength, const double *expno)
A method to estimate the effetive number of observations in the interval for the specified columnNumb...
static double s_columnSpecificPseudocounts(const _PSISequenceWeights *posSearch, int columnNumber, const double *backgroundProbabilities, const double observations)
copy of posit.c:columnSpecificPseudocounts
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
static void s_PSIDiscardIfUnused(_PSIPackedMsa *msa, unsigned int seq_index)
Check if we still need this sequence.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
static int s_PSIValidateNoFlankingGaps(const _PSIMsa *msa)
Validate that there are no flanking gaps in the multiple sequence alignment.
static void s_initializeExpNumObservations(double *expno, const double *backgroundProbabilities)
initialize the expected number of observations use background probabilities for this matrix Calculate...
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
_EPSIPurgeFsmState
Defines the states of the finite state machine used in s_PSIPurgeSimilarAlignments.
static void _PSIComputeAlignedRegionLengths(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Calculates the aligned blocks lengths in the multiple sequence alignment data structure.
const int kPSIScaleFactor
Successor to POSIT_SCALE_FACTOR.
const double kPSINearIdentical
Percent identity threshold for discarding near-identical matches.
const Uint4 kPositScalingNumIterations
Constant used in scaling PSSM routines: Successor to POSIT_NUM_ITERATIONS.
_PSISequenceWeights * _PSISequenceWeightsFree(_PSISequenceWeights *seq_weights)
Deallocates the _PSISequenceWeights structure.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
#define MAX_IND_OBSERVATIONS
max number of independent observation for pseudocount calculation
static void s_PSIComputeFrequenciesFromCDsCleanup(double *sum_weights)
static NCBI_INLINE void _handleBothAlignedSameResidueNoX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state)
Handle event when both positions are aligned, using the same residue, but this residue is not X.
Uint4 _PSISequenceLengthWithoutX(const Uint1 *seq, Uint4 length)
Calculates the length of the sequence without including any 'X' residues.
static void s_PSIPurgeSimilarAlignments(_PSIPackedMsa *msa, Uint4 seq_index1, Uint4 seq_index2, double max_percent_identity)
This function compares the sequences in the msa->cell structure indexed by sequence_index1 and seq_in...
static void _PSIGetRightExtents(const _PSIMsa *msa, Uint4 seq_index)
Computes the right extents for the sequence identified by seq_index.
void _PSIUpdateLambdaK(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, BlastScoreBlk *sbp)
Updates the Karlin-Altschul parameters based on the query sequence and PSSM's score frequencies.
static void _PSIComputePositionExtents(const _PSIMsa *msa, Uint4 seq_index, _PSIAlignedBlock *aligned_blocks)
Computes the aligned blocks extents' for each position for the sequence identified by seq_index.
#define PSEUDO_MAX
effective infinity
static double s_computeRelativeEntropy(const double *newDistribution, const double *backgroundProbabilities)
compute relative entropy of first distribution to second distribution A copy of posit....
static void _PSICalculateNormalizedSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Uint4 position, const SDynamicUint4Array *aligned_seqs, _PSISequenceWeights *seq_weights)
Calculates the position based weights using a modified version of the Henikoff's algorithm presented ...
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
_PSIInternalPssmData * _PSIInternalPssmDataFree(_PSIInternalPssmData *pssm_data)
Deallocates the _PSIInternalPssmData structure.
int _PSIValidateMSA_StructureGroup(const _PSIMsa *msa)
Structure group validation function for multiple sequence alignment structure.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7)
int _PSIPurgeAlignedRegion(_PSIPackedMsa *msa, unsigned int seq_index, unsigned int start, unsigned int stop)
Marks the (start, stop] region corresponding to sequence seq_index in alignment so that it is not fur...
_PSIMsa * _PSIMsaFree(_PSIMsa *msa)
Deallocates the _PSIMsa data structure.
#define DEFINE_COPY_MATRIX_FUNCTION(type)
Implements the generic copy matrix functions.
_PSIAlignedBlock * _PSIAlignedBlockFree(_PSIAlignedBlock *aligned_blocks)
Deallocates the _PSIAlignedBlock structure.
const double kEpsilon
Small constant to test against 0.
double * _PSICalculateInformationContentFromFreqRatios(double **freq_ratios, const double *std_prob, Uint4 query_length, Uint4 alphabet_sz)
Calculates the information content from the residue frequencies calculated in stage 5 of the PSSM cre...
static void s_PSIPurgeSelfHits(_PSIPackedMsa *msa)
Remove those sequences which are identical to the query sequence.
#define EFFECTIVE_ALPHABET
size of alphabet used for pseudocounts calculations
int _PSISaveCDDiagnostics(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the CDD-based PSSM.
static void s_PSIPurgeNearIdenticalAlignments(_PSIPackedMsa *msa)
Keeps only one copy of any aligned sequences which are >kPSINearIdentical% identical to one another.
struct _PSIAlignmentTraits _PSIAlignmentTraits
Auxiliary structure to maintain information about two aligned regions between the query and a subject...
int _PSIValidateCdMSA(const PSICdMsa *cd_msa, Uint4 alphabet_size)
Validation of multiple alignment of conserved domains structure.
void _PSIUpdatePositionCounts(_PSIMsa *msa)
Counts the number of sequences matching the query per query position (columns of the multiple alignme...
const double kPSIIdentical
Percent identity threshold for discarding identical matches.
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
static void s_adjustColumnProbabilities(double *initialProbabilities, double *probabilitiesToReturn, double standardWeight, const double *standardProbabilities, double observations)
adjust the probabilities by assigning observations weight to initialProbabilities and standardWeight ...
const double kPositScalingPercent
Constant used in scaling PSSM routines: Successor to POSIT_PERCENT.
int _IMPALAScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp, double scaling_factor)
Provides a similar function to _PSIScaleMatrix but it performs the scaling as IMPALA did,...
static NCBI_INLINE void _PSIResetAlignmentTraits(_PSIAlignmentTraits *traits, Uint4 position)
Resets the traits structure to restart finite state machine.
_PSIPackedMsa * _PSIPackedMsaFree(_PSIPackedMsa *msa)
Deallocates the _PSIMsa data structure.
int _PSIValidateMSA(const _PSIMsa *msa, Boolean ignore_unaligned_positions)
Main validation function for multiple sequence alignment structure.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSIERR_ENDINGGAP
Found flanking gap at end of alignment.
#define PSIERR_UNKNOWN
Unknown error.
#define PSIERR_COLUMNOFGAPS
Found an entire column full of GAP residues.
#define PSIERR_OUTOFMEM
Out of memory.
#define PSIERR_BADPROFILE
Errors in conserved domain profile.
#define PSIERR_POSITIVEAVGSCORE
Positive average score found when scaling matrix.
#define PSIERR_NOALIGNEDSEQS
After purge stage of PSSM creation, no sequences are left.
#define PSIERR_STARTINGGAP
Found flanking gap at start of alignment.
#define PSIERR_BADSEQWEIGHTS
Sequence weights do not add to 1.
#define PSI_SUCCESS
Successful operation.
#define PSIERR_UNALIGNEDCOLUMN
Found an entire column with no participating sequences.
#define PSIERR_GAPINQUERY
GAP residue found in query sequence.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
Int2 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1 *residues, Uint4 residue_size)
Fills a buffer with the 'standard' alphabet (given by STD_AMINO_ACID_FREQS[index]....
Int2 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk *kbp, Blast_ScoreFreq *sfp)
Computes the parameters lambda, H K for use in calculating the statistical significance of high-scori...
Blast_ScoreFreq * Blast_ScoreFreqFree(Blast_ScoreFreq *sfp)
Deallocates the score frequencies structure.
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
Various auxiliary BLAST utility functions.
#define IS_residue(x)
Does character encode a residue?
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
Constants used in compositional score matrix adjustment.
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
#define BLASTAA_SIZE
Size of aminoacid alphabet.
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
unsigned int
A callback function used to compare two keys in a database.
static const char * GetResidue(TokenStatBlkPtr stoken)
SFreqRatios * _PSIMatrixFrequencyRatiosFree(SFreqRatios *freq_ratios)
Deallocate the frequency ratios structure.
SFreqRatios * _PSIMatrixFrequencyRatiosNew(const char *matrix_name)
Retrive the matrix's frequency ratios.
Definitions used to get joint probabilities for a scoring matrix.
const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name)
Return true if frequency data is available for the given matrix name.
bool is_aligned(T *p) noexcept
Check pointer alignment.
Prototypes for portable math library (ported from C Toolkit)
#define NCBIMATH_LN2
Natural log(2)
long BLAST_Nint(double x)
Nearest integer.
#define MIN(a, b)
returns smaller of a and b.
#define NCBI_INLINE
"inline" seems to work on our remaining in-house compilers (WorkShop, Compaq, ICC,...
Uint1 Boolean
bool replacment for C
#define TRUE
bool replacment for C indicating true.
#define FALSE
bool replacment for C indicating false.
#define ASSERT
macro for assert.
#define MAX(a, b)
returns larger of a and b.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
static const char * kScore
Structure used for scoring calculations.
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
char * name
name of scoring matrix.
Int2 alphabet_size
size of alphabet.
Uint1 alphabet_code
NCBI alphabet code.
SBlastScoreMatrix * matrix
scoring matrix data
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
double K
K value used in statistics.
double Lambda
Lambda value used in statistics.
double logK
natural log of K value used in statistics
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
double score_avg
average score, must be negative for local alignment.
Int4 obs_min
lowest observed (actual) scores
double * sprob
arrays for frequency of given score, shifted down by score_min.
Int4 obs_max
highest observed (actual) scores
Structure used to pass data into the scaling routines.
Structure used to pass data into the scaling routines.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double iobsr
Effective number of independent observations in a CD column.
double * wfreqs
Frequencies for each residue in CD column.
Uint1 is_aligned
Does this cell represent column aligned to a CD.
PSICdMsaCellData * data
Data needed for PSSM computation.
Data structure representing multiple alignemnt of CDs and query sequence along with data needed for P...
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
unsigned char * query
Query sequence as Ncbistdaa.
PSICdMsaCell ** msa
Multiple alignment of CDs.
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
double * information_content
position information content (query_length elements)
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Uint4 alphabet_size
Specifies length of alphabet.
Uint4 query_length
Specifies the number of positions in the PSSM.
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
double * independent_observations
Effective number of observations per column.
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
double * sigma
sigma (query_length elements)
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
Boolean is_aligned
Is this letter part of the alignment?
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Structure representing the dimensions of the multiple sequence alignment data structure.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
PSIMsaDimensions * dimensions
dimensions of the msa
int ** data
actual scoring matrix data, stored in row-major form
Data structure to maintain a dynamically allocated array of Uint4.
Uint4 * data
array of Uint4
Uint4 num_used
number of elements used in this array
Uint4 num_allocated
size of array below
Stores the frequency ratios along with their bit scale factor.
double ** data
The actual frequency ratios.
int bit_scale_factor
Used to multiply the values in the above matrix to obtain scores in bit units.
A structure containing two integers, used e.g.
Int4 left
left endpoint of range (zero based)
Int4 right
right endpoint of range (zero based)
This structure keeps track of the regions aligned between the query sequence and those that were not ...
SSeqRange * pos_extnt
Dynamically allocated array of size query_length to keep track of the extents of each aligned positio...
Uint4 * size
Dynamically allocated array of size query_length that contains the size of the intervals in the array...
Auxiliary structure to maintain information about two aligned regions between the query and a subject...
Uint4 n_x_residues
number of X residues in alignment
Uint4 n_identical
number of identical residues in alignment
Uint4 start
starting offset of alignment w.r.t.
Uint4 effective_length
length of alignment not including Xs
Internal representation of a PSSM in various stages of its creation and its dimensions.
int ** scaled_pssm
scaled PSSM (scores)
Uint4 nrows
number of rows (alphabet_size)
double * pseudocounts
pseudocount constant for each column
int ** pssm
PSSM (scores)
Uint4 ncols
number of columns (query_length)
double ** freq_ratios
frequency ratios
Internal data structure to represent a position in the multiple sequence alignment data structure.
unsigned int letter
Preferred letter at this position.
SSeqRange extents
Extents of this aligned position.
unsigned int is_aligned
Is this letter part of the alignment?
Internal multiple alignment data structure used by the PSSM engine.
Uint4 * num_matching_seqs
number of sequences aligned at a given position in the multiple sequence alignment (length: query_len...
Uint1 * query
query sequence (length: query_length)
Uint4 ** residue_counts
matrix to keep track of the raw residue counts at each position of the multiple sequence alignment (d...
PSIMsaDimensions * dimensions
dimensions of field below
Uint4 alphabet_size
number of elements in alphabet
_PSIMsaCell ** cell
multiple sequence alignment matrix (dimensions: query_length x num_seqs + 1)
Compact version of the PSIMsaCell structure.
unsigned int letter
Preferred letter at this position, in ncbistdaa encoding.
unsigned int is_aligned
Is this letter part of the alignment?
Compact version of PSIMsa structure.
PSIMsaDimensions * dimensions
dimensions of the msa
Boolean * use_sequence
used to indicate whether a sequence should be used for further processing by the engine (length: num_...
_PSIPackedMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Internal data structure to keep computed sequence weights.
double ** match_weights
weighted observed residue frequencies (f_i in 2001 paper).
Uint4 posDistinctDistrib_size
Kept to deallocate field above.
double * std_prob
standard amino acid probabilities
double * norm_seq_weights
Stores the normalized sequence weights (length: num_seqs + 1)
int ** posDistinctDistrib
For position i, how many positions in its block have j distinct letters.
double * independent_observations
Number of independent sequences per column.
int * posNumParticipating
number of sequences at each position.
double * sigma
array of length query_length
double * gapless_column_weights
FIXME.
Uint4 match_weights_size
kept for help deallocate the field above
double * row_sigma
array of length num_seqs + 1
static Uint4 letter(char c)
voidp calloc(uInt items, uInt size)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4