(
void);
56 virtual int Run(
void);
57 virtual void Exit(
void);
69arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
70 "Distances and clustering of multiple alignment" 73arg_desc->AddKey(
"i",
"infile",
"file containing names of alignment files",
75arg_desc->AddOptionalKey(
"d",
"defline_file",
76 "file containing deflines corrsponding to alignment files",
78arg_desc->AddDefaultKey(
"first",
"first",
79 "produce alignments from alignment number 'first' " 80 "to all succeeding alignments in the list",
82arg_desc->AddDefaultKey(
"last",
"index",
83 "Compare alignments from first to last against the rest " 84 "in the list. Indices start from 1 " 85 "(default indicates an all vs. all comparison).",
87arg_desc->AddDefaultKey(
"g0",
"penalty",
88 "gap open penalty for initial/terminal gaps",
90arg_desc->AddDefaultKey(
"e0",
"penalty",
91 "gap extend penalty for initial/terminal gaps",
93arg_desc->AddDefaultKey(
"g1",
"penalty",
94 "gap open penalty for middle gaps",
96arg_desc->AddDefaultKey(
"e1",
"penalty",
97 "gap extend penalty for middle gaps",
99arg_desc->AddDefaultKey(
"matrix",
"matrix",
100 "score matrix to use",
102arg_desc->AddDefaultKey(
"v",
"verbose",
103 "turn on verbose output",
105arg_desc->AddDefaultKey(
"local",
"local",
106 "reduce end gap penalties in profile-profile alignment",
108arg_desc->AddFlag(
"pairs",
"Report pairwise distances up to a cutoff " 109 "value instead of the distance matrix");
110arg_desc->AddDefaultKey(
"cutoff",
"distance",
111 "Maximum pairwise distance to report",
113arg_desc->AddDefaultKey(
"out",
"outfile",
"Output file name",
135 while(!line_reader.
AtEOF()) {
145 "Could not retrieve seq entry");
151seqloc->
SetWhole().Assign(*itr->GetId().front());
152blast::SSeqLoc sl(seqloc, scope);
153retval.push_back(sl);
162 if(
strcmp(matrix_name,
"BLOSUM62") == 0)
164 else if(
strcmp(matrix_name,
"BLOSUM45") == 0)
166 else if(
strcmp(matrix_name,
"BLOSUM80") == 0)
168 else if(
strcmp(matrix_name,
"PAM30") == 0)
170 else if(
strcmp(matrix_name,
"PAM70") == 0)
172 else if(
strcmp(matrix_name,
"PAM250") == 0)
180vector<CSequence>& query_data)
182 intalign_length = query_data[0].GetLength();
183 intnum_seqs = (
int)query_data.size();
184 for(
int i= 0;
i< align_length;
i++) {
185 for(
intj = 0; j < num_seqs; j++)
186freq_data[
i][query_data[j].GetLetter(
i)]++;
194 for(
int i= 0;
i< freq_size;
i++) {
200sum += freq_data[
i][j];
205freq_data[
i][j] *= sum;
212 double**freq1,
intlen1,
213 double**freq2,
intlen2,
224 const size_tdim = transcript.size();
227 for(
size_t i= 0;
i< dim; ++
i) {
229 intwg1 = end_gap_open, ws1 = end_gap_extend;
230 intwg2 = end_gap_open, ws2 = end_gap_extend;
232 if(offset1 >= 0 && offset1 < len1 - 1) {
233wg1 = aligner.
GetWg();
234ws1 = aligner.
GetWs();
237 if(offset2 >= 0 && offset2 < len2 - 1) {
238wg2 = aligner.
GetWg();
239ws2 = aligner.
GetWs();
249++offset1; ++offset2;
250 doubleaccum = 0.0, sum = 0.0;
255 if(freq1[offset1][m] < freq2[offset2][m]) {
256accum += freq1[offset1][m] * (double)sm[m][m];
258diff_freq2[m] = freq2[offset2][m] - freq1[offset1][m];
261accum += freq2[offset2][m] * (double)sm[m][m];
262diff_freq1[m] = freq1[offset1][m] -
268 if(freq1[offset1][0] <= freq2[offset2][0]) {
270sum += diff_freq1[m];
273sum += diff_freq2[m];
277 if(freq1[offset1][0] <= freq2[offset2][0]) {
279diff_freq1[m] /= sum;
282diff_freq2[m] /= sum;
287accum += diff_freq1[m] *
294freq1[offset1][0] * aligner.
GetWs() * (1-freq2[offset2][0]) +
295freq2[offset2][0] * aligner.
GetWs() * (1-freq1[offset1][0]);
301 if(state1 != 1) dscore += wg1 * (1.0 - freq2[offset2][0]);
302state1 = 1; state2 = 0;
309 if(state2 != 1) dscore += wg2 * (1.0 - freq1[offset1][0]);
310state1 = 0; state2 = 1;
321 return(
int)(dscore + 0.5);
328 intfreq_size = alignment[0].GetLength();
333freq_data =
new double* [freq_size];
336 for(
int i= 1;
i< freq_size;
i++)
339memset(freq_data[0], 0,
kAlphabetSize* freq_size *
sizeof(
double));
356 double**freq2_data,
intfreq2_size,
358 boollocal_alignment)
364aligner.
SetSequences((
const double**)freq1_data, freq1_size,
365(
const double**)freq2_data, freq2_size,
369freq1_data, freq1_size,
370freq2_data, freq2_size,
371local_alignment ? 0 : end_gap_open,
372local_alignment ? 0 : end_gap_extend);
381 for(
int i= 0;
i< freq_size;
i++) {
383score += freq_data[
i][j] * matrix.
s[j][j];
398vector<SAlignEntry>& aligns)
402 for(
i= 0;
i< level;
i++)
406 if(
tree->GetValue().GetId() >= 0)
407printf(
"cluster %d (%s) ",
tree->GetValue().GetId(),
408aligns[
tree->GetValue().GetId()].name);
409 if(
tree->GetValue().IsSetDist())
410printf(
"distance %lf",
tree->GetValue().GetDist());
413 if(
tree->IsLeaf())
419 while(child !=
tree->SubNodeEnd()) {
420 for(
i= 0;
i< level;
i++)
423printf(
"%d:\n", j);
438vector<CTree::STreeLeaf> left_leaves;
439vector<CTree::STreeLeaf> right_leaves;
443(*child)->GetValue().GetDist());
446(*child)->GetValue().GetDist());
448 for(
size_t i= 0;
i< left_leaves.size();
i++) {
449 for(
size_tj = 0; j < right_leaves.size(); j++) {
450 intidx1 = left_leaves[
i].query_idx;
451 doubledist1 = left_leaves[
i].distance;
452 intidx2 = right_leaves[j].query_idx;
453 doubledist2 = right_leaves[j].distance;
459matrix(idx1, idx2) = matrix(idx2, idx1) = dist1 + dist2;
463right_leaves.clear();
479 bool verbose= args[
"v"].AsBoolean();
500vector<SAlignEntry> align_list;
514 for(
size_t i= 0;
i< align_read.size();
i++) {
515e.
align.push_back(
CSequence(*align_read[
i].seqloc, *align_read[
i].scope));
522 char*name_ptr =
buf+ strlen(
buf) - 1;
523 while(name_ptr >
buf&&
524name_ptr[-1] !=
'/'&&
525name_ptr[-1] !=
'\\') {
528strcpy(e.
name, name_ptr);
531 intfirst_index = args[
"first"].AsInteger();
532 intlast_index = args[
"last"].AsInteger();
533 if(last_index == 0) {
534last_index = (
int)align_list.size() - 1;
541 if(first_index != 0) {
542 NcbiCerr<<
"error: all-against-all alignment required"<< endl;
549 for(
int i= first_index;
i<= last_index;
i++) {
550 for(
size_tj =
i+ 1; j < align_list.size(); j++) {
552 if(last_index < (
int)align_list.size() - 1 && (
int)j <= last_index) {
556 intlen1 = align_list[
i].align[0].GetLength();
557 intlen2 = align_list[j].align[0].GetLength();
560 if(args[
"local"].AsBoolean() ==
true) {
561 if(len1 > 1.5 * len2 || len2 > 1.5 * len1) {
564 else if(len1 > 1.2 * len2 || len2 > 1.2 * len1) {
571align_list[
i].length,
572align_list[j].profile,
573align_list[j].length,
575args[
"local"].AsBoolean());
578printf(
"%s %s %.2f\n", align_list[
i].name,
579align_list[j].name, (
double)score / 100);
585distances(
i, j) = distances(j,
i) = 1.0 - (double)score *
586(1.0 / align_list[
i].self_score +
5871.0 / align_list[j].self_score) / 2;
594 intnum_clusters = (
int)align_list.size();
599 while(!dfile.fail() && !dfile.eof()) {
608dfile.getline(defline,
sizeof(defline),
'\n');
612 for(
int i= 0;
i< num_clusters;
i++) {
613 if(strstr(align_list[
i].name, name)) {
614snprintf(align_list[
i].name,
sizeof(align_list[
i].name),
615 "%s %s", name, defline);
616 char*
tmp= align_list[
i].name;
627 if(args[
"pairs"]) {
628 for(
int i=0;
i<= last_index;
i++) {
629 for(
intj=(
i+1);j < num_clusters; j++) {
631 if(last_index < (
int)align_list.size() - 1 && j <= last_index) {
635 if(distances(
i, j) < args[
"cutoff"].AsDouble()) {
636args[
"out"].AsOutputFile() << align_list[
i].name <<
"\t" 637<< align_list[j].name <<
"\t" 644printf(
"%d\n", num_clusters);
645 for(
int i= 0;
i< num_clusters;
i++) {
646printf(
"%s\n", align_list[
i].name);
647 for(
intj = 0; j < num_clusters; j++) {
648printf(
"%5.4f ", distances(
i, j));
665printf(
"\n\nNew Distance matrix:\n ");
666 for(
int i= (
int)align_list.size() - 1;
i> 0;
i--)
667printf(
"%5d ",
i);
670 for(
int i= 0;
i< (
int)align_list.size() - 1;
i++) {
671printf(
"%2d: ",
i);
672 for(
intj = (
int)align_list.size() - 1; j >
i; j--) {
673printf(
"%5.3f ", new_distances(
i, j));
679printf(
"\n\nPercent relative error:\n ");
680 for(
int i= (
int)align_list.size() - 1;
i> 0;
i--)
681printf(
"%5d ",
i);
684 for(
int i= 0;
i< (
int)align_list.size() - 1;
i++) {
685printf(
"%2d: ",
i);
686 for(
intj = (
int)align_list.size() - 1; j >
i; j--) {
687printf(
"%5.2f ", 100*
fabs(new_distances(
i, j) -
688distances(
i, j)) / distances(
i, j));
704 int main(
intargc,
const char* argv[])
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Data loader implementation that uses the blast databases.
Base class for reading FASTA sequences.
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
CRef< CObjectManager > m_ObjMgr
virtual int Run(void)
Run the application.
Simple implementation of ILineReader for i(o)streams.
definition of a Culling tree
A wrapper for controlling access to the phylogenetic tree generated by CDistMethods.
static void ListTreeLeaves(const TPhyTreeNode *node, vector< STreeLeaf > &node_list, double curr_dist=0)
Traverse a tree below a given starting point, listing all leaves encountered along the way.
Template class for iteration on objects of class C (non-medifiable version)
Interface for CMultiAligner.
Operators to edit gaps in sequences.
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
TScore GetStartWs() const
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
SNCBIFullScoreMatrix & GetMatrix()
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
TScore GetStartWg() const
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetStartWs(TScore value)
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
@ eDiag_Warning
Warning message.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
@ fNoParseID
Generate an ID (whole defline -> title)
@ fForceType
Force specified type regardless of accession.
@ fParseRawID
Try to identify raw accessions.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
TNodeList::const_iterator TNodeList_CI
bool IsLeaf() const
Report whether this is a leaf node.
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
unsigned int
A callback function used to compare two keys in a database.
static void x_NormalizeResidueFrequencies(double **freq_data, int freq_size)
static void x_FillResidueFrequencies(double **freq_data, vector< CSequence > &query_data)
static double x_GetSelfScore(double **freq_data, int freq_size, SNCBIFullScoreMatrix &matrix)
static void x_PrintTree(const TPhyTreeNode *tree, int level, vector< SAlignEntry > &aligns)
static int x_ScoreFromTranscriptCore(CPSSMAligner &aligner, double **freq1, int len1, double **freq2, int len2, int end_gap_open, int end_gap_extend)
int x_AlignProfileProfile(double **freq1_data, int freq1_size, double **freq2_data, int freq2_size, CPSSMAligner &aligner, bool local_alignment)
static blast::TSeqLocVector x_GetSeqLocFromStream(CNcbiIstream &instream, CObjectManager &objmgr)
void x_SetScoreMatrix(const char *matrix_name, CPSSMAligner &aligner)
static double ** x_GetProfile(vector< CSequence > &alignment)
int main(int argc, const char *argv[])
static void x_FillNewDistanceMatrix(const TPhyTreeNode *node, CDistMethods::TMatrix &matrix)
static const int kScaleFactor
CSequnceHelper< CObject > CSequence
int strcmp(const char *str1, const char *str2)
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
const SNCBIPackedScoreMatrix NCBISM_Pam30
const SNCBIPackedScoreMatrix NCBISM_Blosum62
const SNCBIPackedScoreMatrix NCBISM_Pam250
#define NCBI_FSM_DIM
Recommended approach: unpack and index directly.
const SNCBIPackedScoreMatrix NCBISM_Blosum80
const SNCBIPackedScoreMatrix NCBISM_Pam70
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
int TNCBIScore
data types
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
vector< CSequence > align
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4