(it->seq1_start, it->seq1_stop),
93 TRange(it->seq2_start, it->seq2_stop),
117 const unsigned char* sequence = it->GetSequence();
118 for(
int i=0;
i< it->GetLength();
i++) {
121 "input sequences are not allowed");
134 "Empty input alignment");
140 if(*it >= (
int)
m_InMSA1.size() || *it < 0) {
143 " for MSA 1 out of bounds");
149 if(*it >= (
int)
m_InMSA2.size() || *it < 0) {
152 " for MSA 2 out of bounds");
167 "Sequence specified by constraint is out of range");
176 if(from1 > to1 || from2 > to2) {
178 "Range specified by constraint is invalid");
185 "Constraint is out of range");
197 if(queries.size() < 2) {
199 "Aligner requires at least two input sequences");
221 if(queries.size() < 2) {
223 "Aligner requires at least two input sequences");
227= objects::CObjectManager::GetInstance();
232vector<objects::CBioseq_Handle> bioseq_handles;
234bioseq_handles.push_back(
m_Scope->AddBioseq(**it));
238 ITERATE(vector<objects::CBioseq_Handle>, it, bioseq_handles) {
240seq_loc(
newobjects::CSeq_loc(objects::CSeq_loc::e_Whole));
243seq_loc->SetId(*it->GetSeqId());
245 catch(objects::CObjMgrException e) {
247(
string)
"Missing seq-id in bioseq. "+ e.
GetMsg());
265 if(queries.size() < 2) {
267 "Aligner requires at least two input sequences");
273 for(
size_t i=0;
i< queries.size();
i++) {
282 m_Scope->AddScope(*queries[
i].scope);
298 constobjects::CSeq_align& msa2,
314 ITERATE(objects::CDense_seg::TIds, it,
315msa1.GetSegs().GetDenseg().GetIds()) {
318 newobjects::CSeq_loc(objects::CSeq_loc::e_Whole)));
321 ITERATE(objects::CDense_seg::TIds, it,
322msa2.GetSegs().GetDenseg().GetIds()) {
325 newobjects::CSeq_loc(objects::CSeq_loc::e_Whole)));
330 if(!repr1.
empty()) {
341 if(!repr2.
empty()) {
361 "No tree to return");
383 if(
strcmp(matrix_name,
"BLOSUM62") == 0)
385 else if(
strcmp(matrix_name,
"BLOSUM45") == 0)
387 else if(
strcmp(matrix_name,
"BLOSUM80") == 0)
389 else if(
strcmp(matrix_name,
"PAM30") == 0)
391 else if(
strcmp(matrix_name,
"PAM70") == 0)
393 else if(
strcmp(matrix_name,
"PAM250") == 0)
397 "Unsupported score matrix. Valid matrix names: BLOSUM45, "\
398 "BLOSUM62, BLOSUM80, PAM30, PAM70 and PAM250");
430 const Int4kGapOpen = 11;
431 const Int4kGapExtend = 1;
435 NCBI_THROW(blast::CBlastException, eInvalidArgument,
436 "Cannot generate Karlin block");
450dmat.
Resize(clusters.size(), clusters.size(), 0.0);
451 for(
size_t i=0;
i< clusters.size() - 1;
i++) {
452 for(
size_tj=
i+1;j < clusters.size();j++) {
453dmat(
i, j) = bigmat(clusters[
i].GetPrototype(),
454clusters[j].GetPrototype());
455dmat(j,
i) = dmat(
i, j);
467printf(
"distance matrix:\n");
469 for(
int i= matrix.
GetCols() - 1;
i> 0;
i--)
470printf(
"%5d ",
i);
474printf(
"%2d: ",
i);
475 for(
intj = matrix.
GetCols() - 1; j >
i; j--) {
476printf(
"%5.3f ", matrix(
i, j));
508 "Alignment interrupted");
524vector<CTree::STreeLeaf> node_list1;
525vector<CTree::STreeLeaf> node_list2;
538vector<int> compress_inds;
540compress_inds.push_back(
i);
543compress_inds.clear();
545compress_inds.push_back(
i);
559vector<const CSequence*> pattern_queries;
580 for(
unsigned int i= 0;
i< pair_info.
GetRows();
i++) {
581 for(
unsigned intj = 0; j < pair_info.
GetCols(); j++) {
582pair_info(
i, j).ResetList();
602(
string)
"Number of queries exceeds maximum of " 606 boolis_cluster_found =
false;
607vector<TPhyTreeNode*> cluster_trees;
635 "Invalid clustering option");
644vector<const CSequence*> pattern_queries;
658 if(is_cluster_found) {
678 "Invalid clustering option");
718 catch(blast::CBlastException e) {
719blast::CBlastException::EErrCode err_code
720= (blast::CBlastException::EErrCode)e.
GetErrCode();
722status = (err_code == blast::CBlastException::eInvalidArgument
731 catch(std::exception e) {
749vector<TKmerCounts> kmer_counts;
758shared_ptr<CClusterer::TDistMatrix> dmat
769 for(
size_t i=0;
i< dmat->GetRows();
i++) {
773(*dmat)(center,
i) = 0.0;
774(*dmat)(
i, center) = 0.0;
788 for(
int i=0;
i< (
int)dmat->GetCols() - 1;
i++) {
792 if(!constr_q.
empty() && constr_q.
find(
i) != constr_q.
end()) {
796 for(
intj=
i+1;j < (
int)dmat->GetCols();j++) {
798 if(!constr_q.
empty() && constr_q.
find(j) != constr_q.
end()) {
803links->
AddLink(
i, j, (*dmat)(
i, j));
811 const doublekMaxDistance = 1.5;
815 for(
int i=0;
i< (
int)dmat->GetRows();
i++) {
817(*dmat)(
i, *it) = kMaxDistance;
818(*dmat)(*it,
i) = kMaxDistance;
825printf(
"K-mer counts distance matrix:\n");
827 for(
size_t i=dmat->GetCols() - 1;
i> 0;
i--) {
828printf(
"%6d", (
int)
i);
831 for(
size_t i=0;
i< dmat->GetRows() - 1;
i++) {
832printf(
"%3d:", (
int)
i);
833 for(
size_tj=dmat->GetCols() - 1;j >
i;j--) {
834printf(
"%6.3f", (*dmat)(
i, j));
862printf(
"\nNumber of queries in clusters: 0 (0%%)\n");
863printf(
"Number of domain searches reduced by: 0 (0%%)\n\n");
864printf(
"Only single-element clusters were found." 865 " No clustering information will be used.\n");
879 if(it->size() == 1) {
880it->SetPrototype(*it->begin());
882}
else if(it->size() == 2) {
887 int prot= (len1 > len2) ? (*it)[0] : (*it)[1];
888it->SetPrototype(
prot);
898vector< CRef<objects::CSeq_loc> > cluster_prototypes;
901cluster_prototypes.push_back(
m_tQueries[cluster_it->GetPrototype()]);
912 constvector<CSequence>& q =
915printf(
"Query clusters:\n");
917 size_tnum_in_clusters = 0;
919printf(
"Cluster %3d: ", cluster_idx++);
920printf(
"(prototype: %3d) ", it_cl->GetPrototype());
923printf(
"%d (%d), ", *it_el, q[*it_el].
GetLength());
926 if(it_cl->size() > 1) {
927num_in_clusters += it_cl->size();
931 size_tgain =
m_QueryData.size() - clusters.size();
932printf(
"\nNumber of queries in clusters: %lu (%.0f%%)\n",
934(
double)num_in_clusters /
m_QueryData.size() * 100.0);
935printf(
"Number of domain searches reduced by: %lu (%.0f%%)\n\n", gain,
939printf(
"Distances in clusters:\n");
940 for(
size_tcluster_idx=0;cluster_idx < clusters.size();
944 if(cluster.
size() == 1) {
948printf(
"Cluster %d:\n", (
int)cluster_idx);
949 if(cluster.
size() == 2) {
950printf(
" %6.3f\n\n", d(cluster[0], cluster[1]));
955 for(
size_t i= cluster.
size() - 1;
i> 0;
i--) {
956printf(
"%6d", (
int)cluster[
i]);
959 for(
size_t i=0;
i< cluster.
size() - 1;
i++) {
960printf(
"%3d:", (
int)cluster[
i]);
961 for(
size_tj=cluster.
size() - 1;j >
i;j--) {
962printf(
"%6.3f", d(cluster[
i], cluster[j]));
969printf(
"Sequences that belong to different clusters with distance" 970 " smaller than threshold (exludes prototypes):\n");
972 if(it->size() == 1) {
983 if(*el == cl->GetPrototype()) {
988printf(
"%3d, %3d: %f\n", *elem, *el, d(*elem, *el));
1023 for(
size_tcluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1030 if(clusters[cluster_idx].
size() > 1) {
1036 boolis_gap_in_prototype =
false;
1056 if(len1 > 1.2 * len2 || len2 > 1.2 * len1) {
1076 for(
size_tj=0;j <
t.size();j++) {
1078is_gap_in_prototype =
true;
1083 if(!is_gap_in_prototype) {
1092 for(it=clusters[cluster_idx].begin();it != seq_idx;++it) {
1114printf(
"Aligning in cluster %d:\n", (
int)cluster_idx);
1117printf(
"%3d: ", *elem);
1131 "Alignement Interrupted");
1141printf(
"Gaps in cluster %d: ", (
int)
i);
1153 if(clusters.size() == 1) {
1163it->insert =
false;
1164it->letters.resize(
len);
1165 for(
size_t i=0;
i<
len;
i++) {
1166it->letters[
i] = -1;
1176vector<CMultiAligner::SColumn>::iterator& it,
1177 size_t len,
intnum,
intcluster)
1180it->letters.resize(
len);
1181 for(
size_t i=0;
i<
len;
i++) {
1182it->letters[
i] = -1;
1185it->cluster = cluster;
1192 intseq_length =
m_Results[0].GetLength();
1195vector<int> letter_inds(clusters.size());
1196vector<SColumn>
columns(seq_length);
1203 for(
size_tj=0;j < clusters.size();j++) {
1205it->letters[clusters[j].GetPrototype()] = letter_inds[j]++;
1212 intnew_length = seq_length;
1215 for(
size_tcluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1233vector<SColumn>::iterator it =
columns.begin();
1234 size_tprototype_idx = clusters[cluster_idx].GetPrototype();
1236&& (it->insert || it->letters[prototype_idx] < (
int)
letter)) {
1256 "Alignment interrupted");
1267it->Reset(new_length);
1272vector<int> gap_offsets(clusters.size());
1281 for(
size_t i=0;
i< clusters.size();
i++) {
1284 size_tprototype_idx = clusters[
i].GetPrototype();
1285 int letter= it->letters[prototype_idx];
1303 results[*elem].SetLetter(col,
1313 for(
int i=0;
i< it->number;
i++) {
1314 results[*elem].SetLetter(col +
i,
1325 "Alignment interrupted");
1332printf(
"Cluster prototypes:\n");
1342printf(
"Individual clusters:\n");
1343 for(
int i=0;
i< (
int)clusters.size();
i++) {
1344 if(clusters[
i].
size() > 1) {
1345printf(
"Cluster %d:\n",
i);
1348 for(
intj=0;j < seq.
GetLength();j++) {
1359printf(
"All queries:\n");
1381 for(
size_tcluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1387 if(cluster.
size() == 1) {
1415freqs(
i, k) += matrix(
i+
offset, k);
1426 "Alignment interrupted");
1434vector<int>& indices)
1445queries.push_back(sl);
1446indices.push_back(*it);
1453queries.push_back(sl);
1454indices.push_back((
int)
m_InMSA1.size() + *it);
1467blast::SSeqLoc sl(**it, *
m_Scope);
1468queries.push_back(sl);
1478 intindex = it->GetPrototype();
1480queries.push_back(sl);
1481indices.push_back(index);
1487 "Invalid in-cluster alignment method");
1494vector<int>& indices)
1505indices[
i] = (
int)
i;
1512queries.resize(clusters.size());
1513indices.resize(clusters.size());
1514 for(
size_t i=0;
i< clusters.size();
i++) {
1515 intindex = clusters[
i].GetPrototype();
1517indices[
i] = index;
1524 "Invalid in-cluster alignment method");
1541 doublenode_dist = distance / 2.0;
1544 if(node_dist <= 0.0) {
1552node->
GetValue().SetDist(node_dist);
1558node->
GetValue().SetDist(node_dist);
1575 int id= ids[node->
GetValue().GetId()];
1599 _ASSERT(trees.size() == clusters.size());
1603 for(
size_t i=0;
i< trees.size();
i++) {
1606 if(clusters[
i].
size() == 1) {
1614trees.resize(clusters.size());
1615 for(
intclust_idx=0;clust_idx < (
int)clusters.size();clust_idx++) {
1619 if(cluster.
size() == 1) {
1620trees[clust_idx] =
NULL;
1624 if(cluster.
size() == 2) {
1633 CTreesingle_tree(mat,
1640trees[clust_idx] = root;
1646 for(
size_t i=0;
i< trees.size();
i++) {
1648printf(
"Tree for cluster %d:\n", (
int)
i);
1667vector<double>& leaf_dists,
1668vector<TPhyTreeNode*>& leaf_nodes,
1669 boollast_edge_only =
false)
1673 if(
tree->IsLeaf()) {
1675 int id=
tree->GetValue().GetId();
1676 doubledist =
tree->GetValue().GetDist();
1677 if(!last_edge_only) {
1678dist += dist_from_root;
1681 _ASSERT(
id< (
int)leaf_dists.size());
1682leaf_dists[id] = dist;
1685 _ASSERT(
id< (
int)leaf_nodes.size() && !leaf_nodes[
id]);
1686leaf_nodes[id] =
tree;
1692 if(
tree->GetParent() &&
tree->GetValue().IsSetDist() && !last_edge_only) {
1693dist =
tree->GetValue().GetDist();
1700 while(it !=
tree->SubNodeEnd()) {
1714 doubledist_from_root)
1718 if(node->
GetValue().GetId() ==
id) {
1719 returndist_from_root + node->
GetValue().GetDist();
1732dist = node->
GetValue().GetDist();
1779 if(curr_dist > 0.0) {
1780scale = dist / curr_dist;
1791 constvector<TPhyTreeNode*>& cluster_trees,
1792 constvector<TPhyTreeNode*>& cluster_leaves)
1794 ITERATE(vector<TPhyTreeNode*>, it, cluster_leaves) {
1802 intcluster_id = node->
GetValue().GetId();
1811 intseq_id = cluster[0];
1825vector<TPhyTreeNode*> children;
1828children.push_back(*child);
1831 ITERATE(vector<TPhyTreeNode*>, it, children) {
1849 _ASSERT(cluster_trees.size() == clusters.size());
1853vector<double> cluster_dists(clusters.size(), 0.0);
1854vector<TPhyTreeNode*> cluster_leaves(clusters.size(),
NULL);
1859vector<TPhyTreeNode*> dummy_vect(clusters.size(),
NULL);
1860vector<double>d(cluster_dists.size());
1862 for(
size_t i=0;
i< d.size();
i++) {
1863printf(
"%d:%f ", (
int)
i, d[
i]);
1870 for(
size_t i=0;
i< cluster_trees.size();
i++) {
1873 if(!cluster_trees[
i]) {
1880 if(cluster_dists[
i] <= 0.0) {
1881cluster_dists[
i] = 1e-5;
1888cluster_dists[
i]);
1900 for(
size_t i=0;
i< cluster_dists.size();
i++) {
1901printf(
"%d:%f ", (
int)
i, cluster_dists[
i]);
1907printf(
"Full tree:\n");
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Declares the BLAST exception class.
Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Boolean standard_only)
Attempts to fill KarlinBlk for given gap opening, extensions etc.
int GetPrototype(void) const
Get cluster prototype.
size_t size(void) const
Get cluster size.
vector< int >::const_iterator const_iterator
Interface for CClusterer class used for clustering any type of data based on distance matrix.
void ReleaseTrees(vector< TPhyTreeNode * > &trees)
Get list of trees for clusters and release ownership to caller.
@ eClique
Clusters can be joined if there is a link between all pairs of their elements.
@ eCompleteLinkage
Maximum distance between elements.
TPhyTreeNode * ReleaseTree(int index=0)
Get cluster tree and release ownership to caller.
void Reset(void)
Clear clusters and distance matrix.
void SetMakeTrees(bool trees)
Set make cluster tree/dendrogram option.
void SetDistMatrix(const TDistMatrix &dmat)
Set new distance matrix.
const TDistMatrix & GetDistMatrix(void) const
Get distance matrix.
void SetLinks(CRef< CLinks > links)
Set distance links.
void Run(void)
Cluster elements.
const TClusters & GetClusters(void) const
Get clusters.
void ComputeClusters(double max_diam, EDistMethod dist_method=eCompleteLinkage, bool do_trees=true, double infinity=-1.0)
Compute clusters.
void SetClustMethod(EClustMethod method)
Set clustering method for links.
vector< TSingleCluster > TClusters
TClusters & SetClusters(void)
Set clusters.
void PurgeDistMatrix(void)
Delete distance matrix.
void GetClusterDistMatrix(int index, TDistMatrix &mat) const
Get distance matrix for elements of a selected cluster.
Representation of pairwise distances, intended for use in multiple sequence alignment applications.
const CDistMethods::TMatrix & GetMatrix() const
Access the current distance matrix.
Interface for the traceback from blast hits.
An ordered collection of CHit objects.
int Size() const
Retrieve number of hits in list.
void PurgeAllHits()
Delete all hits unconditionally.
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
A generalized representation of a pairwise alignment.
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
TRange m_SeqRange1
The range of offsets on the first sequence.
TRange m_SeqRange2
The range of offsets on the second sequence.
Set of edges with weights between nodes represented by zero-based positive integers.
void AddLink(int first, int second, double weight)
Add link.
void Sort(void)
Sort links according to weights in ascending order.
Options and parameters for multiple alignment.
@ eFastME
Fast Minimum Evolution.
@ eClusters
Clustering dendrogram.
TScore GetEndGapExtendPenalty(void) const
Get gap extension penalty for end gaps in pairwise global alignment of profiles.
double GetMaxInClusterDist(void) const
Get maximum allowed distance between sequences in a cluster.
string GetScoreMatrixName(void) const
Get alignment score matrix name.
TKMethods::EDistMeasures GetKmerDistMeasure(void) const
Get method for computing distance between word count vectors.
int GetCentralSeq(void) const
Get central sequence.
EInClustAlnMethod GetInClustAlnMethod(void) const
TScore GetGapExtendPenalty(void) const
Get gap extension penalty for middle gaps in pairwise global alignment of profiles.
const TConstraints & GetUserConstraints(void) const
Get user constraints.
TScore GetGapOpenPenalty(void) const
Get gap opening penalty for middle gaps in pairwise global alignment of profiles.
TScore GetEndGapOpenPenalty(void) const
Get gap opening penalty for end gaps in pairwise global alignment of profiles.
TKMethods::ECompressedAlphabet GetKmerAlphabet(void) const
Get alphabet used for creating word count vectors.
ETreeMethod GetTreeMethod(void) const
Get method for creating tree that guides progressive alignment.
vector< SConstraint > TConstraints
bool GetUseQueryClusters(void) const
Check if query clustering option is on.
@ eToPrototype
All cluster elements are aligned to the cluster prototype.
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
int GetKmerLength(void) const
Get word size for creating word count vectors.
int GetUserConstraintsScore(void) const
Get score for user alignment constraints.
bool GetVerbose(void) const
Get verbose mode.
Simultaneously align multiple protein sequences.
vector< CSequence > m_AllQueryData
vector< vector< Uint4 > > m_ClusterGapPositions
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod
void x_SetScoreMatrix(const char *matrix_name)
Set the score matrix the aligner will use.
SProgress m_ProgressMonitor
CRef< objects::CScope > m_Scope
void x_ComputeClusterTrees(vector< TPhyTreeNode * > &trees)
Compute independent phylogenetic trees for each cluster.
void x_FindLocalInClusterHits(const vector< TPhyTreeNode * > &cluster_trees)
Run blast on sequences from each cluster subtree.
static void x_InitInsertColumn(vector< SColumn >::iterator &it, size_t len, int num, int cluster)
vector< int > m_Msa2Repr
Indices of sequence representatives in input alignment 2.
vector< CRef< objects::CSeq_loc > > m_tQueries
void x_MakeClusterResidueFrequencies()
Compute profile residue frequencies for clusters.
vector< CRef< objects::CSeq_loc > > m_AllQueries
TStatus Run(void)
Align the current set of input sequences (reset any existing alignment information).
struct CMultiAligner::SColumn SColumn
Column in an alignment used for combining result from multiple alignment and pair-wise in-cluster ali...
void x_ComputeTree()
Given the current list of domain and local hits, generate a phylogenetic tree that clusters the curre...
void x_FindLocalHits(const blast::TSeqLocVector &queries, const vector< int > &indices)
Run blast on selected input sequences and postprocess the results.
@ eOutOfMemory
Out of memory error.
@ eInternalError
Unexpected error occurred.
@ eSuccess
Alignment successfully completed.
@ eInterrupt
Alignment interrupted through callback function.
@ eOptionsError
Error related to options occurred.
@ eDatabaseError
Error related to RPS database occurred.
@ eQueriesError
Error related to query sequences occurred.
void x_BuildFullTree(const vector< TPhyTreeNode * > &cluster_trees)
Combine alignment guide tree computed for clusters with guide trees computed for each cluster.
vector< CSequence > m_QueryData
bool x_ValidateInputMSAs(void) const
Validate input alignments.
bool x_ValidateQueries(void) const
Validate query sequences.
void x_CreatePatternQueries(vector< const CSequence * > &queries, vector< int > &indices)
Create query set for PROSITE pattern search along with indices in multiple alignment queries array.
void SetInputMSAs(const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
Set input alignments.
void x_AlignMSAs(void)
Align multiple sequence alignments.
void x_MultiAlignClusters()
Combine pair-wise in-cluster alignments with multiple alignments of cluster prototypes.
vector< CSequence > m_InMSA1
Input alignment.
vector< CSequence > m_Results
void x_FindConsistentHitSubset(void)
Find consistent subset of pair-wise hits that can be used as alignment constraints.
CConstRef< CMultiAlignerOptions > m_Options
void x_InitParams(void)
Initiate parameters using m_Options.
void x_AlignProfileProfile(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two collections of sequences.
void SetQueries(const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
Set query sequences.
void Reset(void)
Clear out the state left by the previous alignment operation.
void x_AlignInClusters()
Pair-wise align each cluster sequence to cluster representative.
vector< string > m_Messages
static void x_InitColumn(vector< SColumn >::iterator &it, size_t len)
virtual void x_Run(void)
Align the current set of input sequences (reset any existing alignment information).
bool(* FInterruptFn)(SProgress *progress)
Prototype for function pointer to determine whether alignment should proceed or be interrupted.
void x_FindDomainHits(blast::TSeqLocVector &queries, const vector< int > &indices)
Run RPS blast on selected input sequences and postprocess the results.
CMultiAligner(void)
Create multi aligner with default options.
void x_BuildAlignment()
Given the current domain, local, pattern and user hits, along with the current tree,...
void x_Init(void)
Initiate class attributes that are not alignment parameters.
void x_FindPatternHits(const vector< const CSequence * > &queries, const vector< int > &indices)
Find PROSITE pattern hits on selected input sequences.
CRef< objects::CBioTreeContainer > GetTreeContainer(void) const
Get serializable tree used as guide in progressive alignment.
vector< int > m_Msa1Repr
Indices of sequence representatives in input alignment 1.
void x_AttachClusterTrees(const vector< TPhyTreeNode * > &cluster_trees, const vector< TPhyTreeNode * > &cluster_leaves)
Replace leaves in the alignment guide tree of clusters with cluster trees.
bool x_FindQueryClusters()
Find clusters of similar queries, select cluster representative sequences, and prepare input to multi...
void x_CreateBlastQueries(blast::TSeqLocVector &queries, vector< int > &indices)
Create query set for RPS Blast and Blastp searches along with indices in multiple alignment queries a...
bool x_ValidateUserHits(void)
Validate user constraints with queries.
void x_InitAligner(void)
Initiate PSSM aligner parameters.
static const int kClusterNodeId
FInterruptFn SetInterruptCallback(FInterruptFn fnptr, void *user_data=NULL)
Set a function callback to be invoked by multi aligner to allow interrupting alignment in progress.
vector< CSequence > m_InMSA2
Input alignment.
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
size_t GetRows() const
get the number of rows in this matrix
size_t GetCols() const
get the number of columns in this matrix
Class for representing protein sequences.
int GetLength() const
Get the length of the current sequence.
unsigned char GetLetter(int pos) const
Access the sequence letter at a specified position.
TFreqMatrix & GetFreqs()
Access the list of position frequencies associated with a sequence.
static void CompressSequences(vector< CSequence > &seq, vector< int > index_list)
Given a collection of sequences, remove all sequence positions where a subset of the sequences all co...
void PropagateGaps(const CNWAligner::TTranscript &transcript, CNWAligner::ETranscriptSymbol gap_choice)
Given an edit script, insert gaps into a sequence.
static const unsigned char kGapChar
The ncbistdaa code for a gap.
unsigned char * GetSequence()
Access the raw sequence data, in ncbistdaa format.
static void CreateMsa(const objects::CSeq_align &seq_align, objects::CScope &scope, vector< CSequence > &msa)
Create a vector of CSequence objects that represents the alignment in given Seq_align.
unsigned char GetPrintableLetter(int pos) const
Access the sequence letter at a specified position, and return an ASCII representation of that letter...
definition of a Culling tree
A wrapper for controlling access to the phylogenetic tree generated by CDistMethods.
void SetTree(TPhyTreeNode *tree)
Set tree.
static void PrintTree(const TPhyTreeNode *node, int level=0)
Debug routine to recursively print out a tree.
const TPhyTreeNode * GetTree() const
Access the current tree.
TPhyTreeNode * ReleaseTree()
Get the current tree and release ownership.
void ComputeTree(const CDistMethods::TMatrix &distances, bool use_fastme=false)
Compute a new tree.
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator find(const key_type &key) const
const_iterator end() const
static TPhyTreeNode * s_MakeTwoLeafTree(const CClusterer::CSingleCluster &ids, double distance)
Create phylogenetic tree for two sequences.
static void s_ScaleTreeEdges(TPhyTreeNode *node, double scale)
Scale all tree edges by given factor (recursive).
static void s_SetLeafIds(TPhyTreeNode *node, const CClusterer::CSingleCluster &ids)
Change ids of leaf nodes in a given tree to desired values (recursive).
static double s_FindNodeDistance(const TPhyTreeNode *node, int id, double dist_from_root)
Find distance from root for selected node (recursive).
static void s_RescaleTree(TPhyTreeNode *tree, int id, double dist)
Rescale tree so that node with given id has desired distance from root.
static void s_FindLeafDistances(TPhyTreeNode *tree, double dist_from_root, vector< double > &leaf_dists, vector< TPhyTreeNode * > &leaf_nodes, bool last_edge_only=false)
Compute length of the edge or distance from root for each leaf (recursive).
Interface for CMultiAligner.
CRef< objects::CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
static const column_t columns[]
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
SNCBIFullScoreMatrix & GetMatrix()
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetStartWs(TScore value)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty -- not pointing to any object, which means having a null value.
void Reset(void)
Reset reference object.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
TNodeList::iterator TNodeList_I
TTreeType * DetachNode(TTreeType *subnode)
Remove the subtree from the tree without destroying it.
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
TNodeList::const_iterator TNodeList_CI
void AddNode(TTreeType *subnode)
Add new subnode.
bool IsLeaf() const
Report whether this is a leaf node.
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
const TValue & GetValue(void) const
Return node's value.
const TTreeType * GetParent(void) const
Get node's parent.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
unsigned int
A callback function used to compare two keys in a database.
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
CSequnceHelper< CObject > CSequence
int strcmp(const char *str1, const char *str2)
#define ASSERT
macro for assert.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
CTreeNode< CPhyNodeData > TPhyTreeNode
const SNCBIPackedScoreMatrix NCBISM_Pam30
const SNCBIPackedScoreMatrix NCBISM_Blosum62
const SNCBIPackedScoreMatrix NCBISM_Pam250
const SNCBIPackedScoreMatrix NCBISM_Blosum80
const SNCBIPackedScoreMatrix NCBISM_Pam70
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Structure to hold the Karlin-Altschul parameters.
Structure for listing tree leaves.
static Uint4 letter(char c)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4