& matrix_name
,
79 const string& query_title
)
80: m_QueryTitle(query_title),
82m_SeqalignSet(seqaligns),
85m_MatrixName(matrix_name),
86m_DiagnosticsRequest(diags),
88m_GapExistence(gap_existence),
89m_GapExtension(gap_extension)
95 if(seqaligns.
Empty()) {
106 for(
unsigned int i=0;
i<
m_Hits.size();
i++) {
118 "Minimum RPS-BLAST e-value is larger than the maximum one");
158 "Evalue not found in Seq-align");
161 if(evalue >= min_evalue && evalue < max_evalue) {
162 m_Hits.push_back(
new CHit((*it)->GetSegs().GetDenseg(), evalue));
171 if(
m_Hits.size() < 2) {
177vector<CHit*> new_hits;
178new_hits.reserve(
m_Hits.size());
180new_hits.push_back(
m_Hits[0]);
182vector<CHit*>::iterator it(
m_Hits.begin());
186 for(;it !=
m_Hits.end();++it) {
190 for(
int i=
static_cast<int>(new_hits.size()) - 1;
i>= 0
191&& (*it)->m_SubjectId->Match(*new_hits[
i]->m_SubjectId);
i--) {
193 const CHit* kept_hit = new_hits[
i];
198 CHitintersection(*kept_hit);
204(*it)->Subtract(intersection);
206 if((*it)->IsEmpty()) {
213new_hits.push_back(*it);
235(*it)->FillData(seqdb, *profile_data);
242 const intkQueryLength =
static_cast<int>(
m_QueryData.size());
243 const intkNumCds =
static_cast<int>(
m_Hits.size());
250 m_MsaData.resize(kQueryLength * (kNumCds), cell);
254 "Multiple alignment data structure");
256 for(
int i=0;
i< kNumCds;
i++) {
261 for(
size_thit_idx=0;hit_idx <
m_Hits.size();hit_idx++) {
265 m_Hits[hit_idx]->GetSegments()) {
267 const intkNumQueryColumns
268= (*it)->m_QueryRange.GetTo() - (*it)->m_QueryRange.GetFrom();
270 intq_from = (*it)->m_QueryRange.GetFrom();
273 for(
int i=0;
i< kNumQueryColumns;
i++) {
276 m_Msa[hit_idx][q_from +
i].
data= &(*it)->m_MsaData[
i];
279 m_Hits[hit_idx]->m_MsaIdx =
static_cast<int>(hit_idx);
289 const intkQueryLength =
static_cast<int>(
m_QueryData.size());
290 const intkNumCds =
static_cast<int>(
m_Hits.size());
292 for(
int i=0;
i< kNumCds;
i++) {
296 for(
int i=0;
i< kNumCds;
i++) {
297 for(
intj=0;j < kQueryLength;j++) {
301 "Query sequence cannot contain gaps");
312 if(
data->iobsr <= 0.0) {
314 "Zero independent observations in domain model");
320 if(
data->wfreqs[k] < 0.0) {
322 "Negative residue frequency in a domain " 325s +=
data->wfreqs[k];
331 if(
fabs(s - 1.0) > 1e-5) {
333 "Domain residue frequencies do not sum to 1");
344: m_Evalue(evalue), m_MsaIdx(-1)
346 const intkNumDims = denseg.
GetDim();
347 const intkNumSegments = denseg.
GetNumseg();
351m_SubjectId.Reset(denseg.
GetIds()[1].GetNonNullPointer());
353 constvector<TSignedSeqPos>& starts = denseg.
GetStarts();
354 constvector<TSeqPos>& lens = denseg.
GetLens();
359 for(
intseg=0;seg < kNumSegments;seg++) {
360 TSeqPosquery_offset = starts[query_index];
361 TSeqPossubject_offset = starts[subject_index];
363query_index += kNumDims;
364subject_index += kNumDims;
370m_SegmentList.push_back(
newCHitSegment(
371 TRange(query_offset, query_offset + lens[seg]),
372 TRange(subject_offset, subject_offset
375query_offset += lens[seg];
376subject_offset += lens[seg];
383: m_SubjectId(hit.m_SubjectId),
384m_Evalue(hit.m_Evalue),
385m_MsaIdx(hit.m_MsaIdx)
396 ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
409 ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
410 result+= (*it)->GetLength();
426(*it)->FillData(db_oid, profile_data);
482 _ASSERT(!m_SubjectId.Empty());
484 ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
495 if(m_SegmentList.empty()) {
499 ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
500 if(!(*it)->IsEmpty()) {
515vector<TRange>::const_iterator r_itr = ranges.begin();
516vector<CHitSegment*>::iterator seg_it = m_SegmentList.begin();
517vector<CHitSegment*> new_segs;
518 while(seg_it != m_SegmentList.end() && r_itr != ranges.end()) {
522= (app == eSubject ? (*seg_it)->m_SubjectRange
523: (*seg_it)->m_QueryRange);
526 while(r_itr != ranges.end() && r_itr->GetTo() < seg_range.
GetFrom()) {
530 if(r_itr == ranges.end()) {
538 if(intersection == seg_range) {
544 if(intersection.
Empty()) {
554 while(r_itr != ranges.end() && r_itr->GetFrom() < seg_range.
GetTo()) {
558r_itr->GetFrom()) - seg_range.
GetFrom();
559 intd_to =
min(seg_range.
GetTo(),
560r_itr->GetTo()) - seg_range.
GetTo();
565new_segs.push_back(new_seg);
580 while(seg_it != m_SegmentList.end()) {
587 ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
589new_segs.push_back(*it);
594m_SegmentList.swap(new_segs);
601vector<TRange> ranges;
604ranges.push_back(app == eQuery ? (*it)->m_QueryRange
605: (*it)->m_SubjectRange);
610IntersectWith(ranges, app);
618 if(IsEmpty() || hit.
IsEmpty()) {
626 intfrom = hit.
GetSegments().front()->m_QueryRange.GetFrom();
627 intto = hit.
GetSegments().back()->m_QueryRange.GetTo();
630 if(m_SegmentList.front()->m_QueryRange.GetFrom() >= to
631|| m_SegmentList.back()->m_QueryRange.GetTo() <= from) {
637vector<CHitSegment*>::iterator it = m_SegmentList.begin();
639vector<CHitSegment*> new_segments;
640new_segments.reserve(m_SegmentList.size());
644 while(it != m_SegmentList.end() && (*it)->m_QueryRange.GetTo() <= from) {
646new_segments.push_back(*it);
653 if(it == m_SegmentList.end() || (*it)->m_QueryRange.GetFrom() > to) {
658 if((*it)->m_QueryRange.GetTo() > to) {
665 if((*it)->m_QueryRange.GetFrom() < from) {
670 intd_to = from - (*it)->m_QueryRange.GetTo();
672(*it)->AdjustRanges(0, d_to);
673 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
674new_segments.push_back(*it);
684 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
685new_segments.push_back(new_seg);
689 for(;it != m_SegmentList.end();++it) {
690new_segments.push_back(*it);
697 if((*it)->m_QueryRange.GetFrom() >= from) {
704 intd_to = from - (*it)->m_QueryRange.GetTo();
707(*it)->AdjustRanges(0, d_to);
708 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
709new_segments.push_back(*it);
714 while(it != m_SegmentList.end()
715&& (*it)->m_QueryRange.GetTo() <= to) {
723 if(it != m_SegmentList.end()) {
725 if((*it)->m_QueryRange.GetFrom() < to) {
726 intd_from = to - (*it)->m_QueryRange.GetFrom();
729(*it)->AdjustRanges(d_from, 0);
730 _ASSERT((*it)->m_QueryRange.GetFrom()
731< (*it)->m_QueryRange.GetTo());
733new_segments.push_back(*it);
742 while(it != m_SegmentList.end()) {
743new_segments.push_back(*it);
749m_SegmentList.swap(new_segments);
759 m_MsaData.resize(m_QueryRange.GetTo() - m_QueryRange.GetFrom(), d);
761x_FillResidueCounts(db_oid, profile_data);
762x_FillObservations(db_oid, profile_data);
768 _ASSERT(m_QueryRange.GetFrom() >= 0 && m_QueryRange.GetTo() >= 0);
769 _ASSERT(m_SubjectRange.GetFrom() >= 0 && m_SubjectRange.GetTo() >= 0);
771 const intkQueryLength = m_QueryRange.GetTo() - m_QueryRange.GetFrom();
772 const intkSubjectLength = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
774 if(kQueryLength != kSubjectLength) {
790m_QueryRange.SetFrom(m_QueryRange.GetFrom() + d_from);
791m_QueryRange.SetTo(m_QueryRange.GetTo() + d_to);
793m_SubjectRange.SetFrom(m_SubjectRange.GetFrom() + d_from);
794m_SubjectRange.SetTo(m_SubjectRange.GetTo() + d_to);
800 returnm_QueryRange.GetFrom() > m_QueryRange.GetTo()
801|| m_SubjectRange.GetFrom() > m_SubjectRange.GetTo();
807 _ASSERT(profile_data()->freq_header);
812 _ASSERT(db_oid < num_profiles);
816 const TFreqs* db_counts =
821 intdb_seq_length = db_seq_offsets[db_oid + 1] - db_seq_offsets[db_oid];
826 _ASSERT(m_SubjectRange.GetTo() <= db_seq_length);
831 for(
int i=0;
i< num_columns;
i++) {
845(double)counts[(m_SubjectRange.GetFrom() +
i) *
kAlphabetSize+ j]
846/ (
double)sum_freqs;
856 _ASSERT(profile_data()->obsr_header);
861 _ASSERT(db_oid < num_profiles);
865 const TObsr* data_start
873 for(
int i=0;
i< data_size;
i+=2) {
878 for(
intj=0;j < num;j++) {
879obsr.push_back(
val);
883 intnum_columns = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
884 for(
int i=0;
i< num_columns;
i++) {
User-defined methods of the data storage class.
Declares the BLAST exception class.
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
Defines BLAST error codes (user errors included)
Wrapper class to manage the BlastRPSInfo structure, as currently there aren't any allocation or deall...
Defines system exceptions occurred while running BLAST.
Represents one alignment segment of a RPS-BLAST hit.
Class used for sorting hits by subject seq-id and e-value.
Class used for sorting hit segments by range.
Class used for sorting ranges.
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
#define GAP_IN_ALIGNMENT
Representation of GAP in Seq-align.
virtual ~CCddInputData()
Virtual destructor.
CRef< objects::CBioseq > m_QueryBioseq
Query as Bioseq.
static const char kGapChar('-')
The representation of a gap in ASCII format.
void AdjustRanges(int d_from, int d_to)
Change ranges on query and subject by given values.
TRange m_QueryRange
Segment range on query.
vector< CHit * > m_Hits
RPS-BLAST hits in internal representation.
bool IsEmpty(void) const
Is hit empty.
string m_DbName
CDD database name.
CHit(const objects::CDense_seg &denseg, double evalue)
Constructor.
vector< CHitSegment * > m_SegmentList
List of hit segments.
void x_CreateMsa(void)
Create multiple alignment of CDs.
void x_ProcessAlignments(double min_evalue, double max_evalue)
Process RPS-BLAST hits.
void x_RemoveMultipleCdHits(void)
Remove multiple hits to the same CD.
vector< CHitSegment * > & GetSegments(void)
Get hit segments.
void Subtract(const CHit &hit)
Subtract from another hit from this hit using query ranges.
PSIBlastOptions m_Opts
Delta BLAST options for PSSM Engine.
int GetLength(void) const
Get hit length in residues, counts number of matching residues, gaps are not counted.
void Process(void)
Pre-process CD matches to query.
double m_MinEvalue
Min e-value threshold for all hits to be included in PSSM computation.
PSICdMsaCell ** m_Msa
Pointer to MSA.
CConstRef< objects::CSeq_align_set > m_SeqalignSet
RPS-BLAST hits for the query.
CCddInputData(const Uint1 *query, unsigned int query_length, CConstRef< objects::CSeq_align_set > seqaligns, const PSIBlastOptions &opts, const string &dbname, const string &matrix_name="BLOSUM62", int gap_existence=0, int gap_extension=0, PSIDiagnosticsRequest *diags=NULL, const string &query_title="")
Constructor.
bool x_ValidateHits(void) const
Validate internal representation of RPS-BLAST hits.
PSICdMsa m_CddData
MSA of CDs and CD data.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
vector< Uint1 > m_QueryData
Query sequence.
PSIMsaDimensions m_MsaDimensions
MSA dimensions, used by PSSM engine.
void x_FillResidueCounts(int db_oid, const CBlastRPSInfo &profile_data)
Populate arrays of weighted residue counts.
void x_FillObservations(int db_oid, const CBlastRPSInfo &profile_data)
Populate arrays of effective numbers of observations.
void FillData(int db_oid, const CBlastRPSInfo &profile_data)
Allocate and populate arrays for MSA data (weighted residue counts and effective observations used fo...
static const int kAlphabetSize
bool Validate(void) const
Validate hit.
vector< PSICdMsaCell > m_MsaData
MSA data.
Uint4 TFreqs
Type used for residue frequencies stored in CDD.
void IntersectWith(const vector< TRange > &segments, EApplyTo app)
Intersect hit segments with list of ranges and store result in hit segments.
bool Validate(void) const
Validate hit segment.
unsigned int GetQueryLength(void)
Get query length.
string m_QueryTitle
Query title (for PSSM)
static const int kRpsScaleFactor
Scale of residue frequencies and number of independent observations stored in CDD.
bool x_ValidateMsa(void) const
Validate multiple alignment of CDs.
void x_ExtractQueryForPssm(void)
Create query as Bioseq.
bool IsEmpty(void) const
Does the hit segment represent an empty range.
void FillData(const CSeqDB &seqdb, const CBlastRPSInfo &profile_data)
Allocate and populate arrays of data for PSSM computation.
Uint4 TObsr
Type used for number of independent observations stored in CDD.
void x_FillHitsData(void)
Read data needed for PSSM computation from CDD and populate arrays.
EApplyTo
Master selection for operations involving ranges.
@ fDeltaBlast
Flags set for DELTA-BLAST.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty â not pointing to any object which means having a null value.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty â pointing to an object and has a non-null value.
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty â pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty â not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
TThisType IntersectionWith(const TThisType &r) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define USING_SCOPE(ns)
Use the specified namespace.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
const TStarts & GetStarts(void) const
Get the Starts member data.
const TLens & GetLens(void) const
Get the Lens member data.
TDim GetDim(void) const
Get the Dim member data.
const TIds & GetIds(void) const
Get the Ids member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
list< CRef< CSeq_align > > Tdata
TTitle & SetTitle(void)
Select the variant.
@ eRepr_raw
continuous sequence
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
bool is_aligned(T *p) noexcept
Check pointer alignment.
constexpr auto sort(_Init &&init)
static PCRE2_SIZE * offsets
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double inclusion_ethresh
Minimum evalue for inclusion in PSSM calculation.
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
double iobsr
Effective number of independent observations in a CD column.
double * wfreqs
Frequencies for each residue in CD column.
Alignment cell that represents one column of CD aligned to a position in the query.
Uint1 is_aligned
Does this cell represent column aligned to a CD.
PSICdMsaCellData * data
Data needed for PSSM computation.
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
unsigned char * query
Query sequence as Ncbistdaa.
PSICdMsaCell ** msa
Multiple alignment of CDs.
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4