(
string)
"cobalt");
84 virtual void Init(
void);
85 virtual int Run(
void);
86 virtual void Exit(
void);
101 default:
return "";
114default :
return "";
125arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
126 "COBALT multiple sequence alignment utility");
129arg_desc->SetCurrentGroup(
"Input");
130arg_desc->AddOptionalKey(
"i",
"infile",
"File containing input sequences " 133arg_desc->AddOptionalKey(
"in_msa1",
"infile",
"File containing input " 134 "alignment in FASTA format",
137arg_desc->AddOptionalKey(
"in_msa2",
"infile",
"File containing input " 138 "alignment in FASTA format",
141arg_desc->AddOptionalKey(
"ind1",
"numbers",
"Coma separated list of " 142 "sequence indices in MSA1 to be used for " 143 "constraints generation",
146arg_desc->AddOptionalKey(
"ind2",
"numbers",
"Coma separated list of " 147 "sequence indices in MSA2 to be used for " 148 "constraints generation",
162arg_desc->AddFlag(
"parse_deflines",
"Should the sequence deflines be " 167arg_desc->SetCurrentGroup(
"Conserved domain options");
168arg_desc->AddOptionalKey(
"rpsdb",
"database",
"Conserved domain database " 169 "name\nEither database or -norps option must be " 171arg_desc->AddDefaultKey(
"norps",
"norps",
"Do not perform initial " 174arg_desc->AddDefaultKey(
"rps_evalue",
"evalue",
175 "E-value threshold for selecting conserved domains" 176 " from results of RPS-BLAST search",
179arg_desc->AddDefaultKey(
"num_domain_hits",
"number",
"Maximum number of " 180 "of domain hits for each sequence",
183arg_desc->AddOptionalKey(
"p",
"patternfile",
184 "Filename containing regular expression patterns " 185 "for conserved domains",
187arg_desc->AddDefaultKey(
"dfb",
"domain_res_boost",
188 "When assigning domain residue frequencies, the amount of " 189 "extra weight (0..1) to give to the actual sequence letter " 194arg_desc->AddOptionalKey(
"domain_hits",
"infile",
"Results of pre-computed" 195 " domain search in BLAST archive format",
204arg_desc->SetCurrentGroup(
"Constraints options");
205arg_desc->AddOptionalKey(
"c",
"constraintfile",
206 "Filename containing pairwise alignment constraints, " 207 "one per line, each represented by 6 integers:\n" 208 " -zero-based index of sequence 1 in the input file\n" 209 " -zero-based start position in sequence 1\n" 210 " -zero-based stop position in sequence 1\n" 211 " -zero-based index of sequence 2 in the input file\n" 212 " -zero-based start position in sequence 2\n" 213 " -zero-based stop position in sequence 2\n",
218arg_desc->SetCurrentGroup(
"Multiple alignment options");
219arg_desc->AddDefaultKey(
"treemethod",
"method",
220 "Method for generating progressive alignment guide tree",
224 "clust",
"nj",
"fastme"));
225arg_desc->AddDefaultKey(
"iter",
"iterate",
226 "After the first iteration search for conserved columns " 227 "and realign if any are found",
229arg_desc->AddDefaultKey(
"ccc",
"conserved_cutoff",
230 "Minimum average score needed for a multiple alignment " 231 "column to be considered as conserved",
234arg_desc->AddDefaultKey(
"pseudo",
"pseudocount",
235 "Pseudocount constant",
238arg_desc->AddDefaultKey(
"ffb",
"filler_res_boost",
239 "When assigning filler residue frequencies, the amount of " 240 "extra weight (0..1) to give to the actual sequence letter " 247arg_desc->SetCurrentGroup(
"Pairwise alignment options");
248arg_desc->AddDefaultKey(
"matrix",
"matrix",
249 "Score matrix to use",
251arg_desc->AddDefaultKey(
"end_gapopen",
"penalty",
252 "Gap open penalty for terminal gaps",
255arg_desc->AddDefaultKey(
"end_gapextend",
"penalty",
256 "Gap extend penalty for terminal gaps",
259arg_desc->AddDefaultKey(
"gapopen",
"penalty",
260 "Gap open penalty for internal gaps",
263arg_desc->AddDefaultKey(
"gapextend",
"penalty",
264 "Gap extend penalty for internal gaps",
267arg_desc->AddDefaultKey(
"blast_evalue",
"evalue",
268 "E-value threshold for selecting segments matched " 275arg_desc->SetCurrentGroup(
"Query clustering options");
276arg_desc->AddDefaultKey(
"clusters",
"clusters",
277 "Use query clustering for faster alignment",
279arg_desc->AddDefaultKey(
"k",
"length",
280 "K-mer length for query clustering",
283arg_desc->AddDefaultKey(
"max_dist",
"distance",
284 "Maximum allowed distance between sequences in a cluster" 288arg_desc->AddDefaultKey(
"alph",
"name",
289 "Alphabet for used k-mer counting",
293 "se-v10",
"se-b15"));
297arg_desc->SetCurrentGroup(
"Output options");
298arg_desc->AddOptionalKey(
"seqalign",
"file",
299 "Output text seqalign to specified file",
301arg_desc->AddOptionalKey(
"outfmt",
"format",
"Output format for multiple " 304 "clustalw",
"phylip",
"nexus"));
305arg_desc->AddFlag(
"v",
"Verbose output");
314vector<CMultiAlignerOptions::SConstraint>& constr)
317 if(
f.bad() ||
f.fail())
319 "Cannot open file with pairwise constraints");
321 intseq1, seq1_start, seq1_end;
322 intseq2, seq2_start, seq2_end;
326 f>> seq1 >> seq1_start >> seq1_end;
327 f>> seq2 >> seq2_start >> seq2_end;
329c(seq1, seq1_start, seq1_end, seq2, seq2_start, seq2_end);
335 f>> seq1 >> seq1_start >> seq1_end;
336 f>> seq2 >> seq2_start >> seq2_end;
339seq1_start, seq1_end, seq2, seq2_start, seq2_end));
347vector<CMultiAlignerOptions::CPattern>&
patterns)
350 if(
f.bad() ||
f.fail())
352 "Cannot open patterns file");
357 stringsingle_pattern;
361 if(!single_pattern.empty()) {
362 patterns.push_back(single_pattern);
378 if(args[
"rpsdb"] && args[
"norps"].AsBoolean()) {
380 "The options -rpsdb and -norps T are mutually exclusive");
383 if(!args[
"rpsdb"] && !args[
"norps"].AsBoolean()) {
385 "RPS dababase not specified");
404 if(args[
"rpsdb"]) {
405opts->
SetRpsDb(args[
"rpsdb"].AsString());
408 const string dbname= args[
"rpsdb"].AsString();
450 if(args[
"treemethod"].AsString() ==
"clust") {
453 else if(args[
"treemethod"].AsString() ==
"nj") {
456 else if(args[
"treemethod"].AsString() ==
"fastme") {
460 NcbiCerr<<
"Error: Incorrect tree method";
466opts->
SetIterate(args[
"iter"].AsBoolean());
477 if(args[
"alph"]) {
478 if(args[
"alph"].AsString() ==
"regular") {
481 else if(args[
"alph"].AsString() ==
"se-v10") {
484 else if(args[
"alph"].AsString() ==
"se-b15") {
497 if(args[
"domain_hits"]) {
515vector< CRef<objects::CSeq_loc> > queries;
517scope->AddDefaults();
521 if(!args[
"parse_deflines"]) {
532 m_UsageReport.AddParam(blast::CBlastUsageReport::eNumQueries,
533(
int)queries.size());
539objects::CSeqIdGenerator id_generator;
544args[
"in_msa1"].AsInputFile(),
550args[
"in_msa2"].AsInputFile(),
558 size_tnum1 = 0, num2 = 0;
560 if(args[
"ind1"]) {
564 ITERATE(list<string>, it, tokens) {
569 if(args[
"ind2"]) {
573 ITERATE(list<string>, it, tokens) {
580 if(num1 != repr1.
size() || num2 != repr2.
size()) {
581 NcbiCerr<<
"Error: Non-unique indeces of input sequence " 588aligner.
SetInputMSAs(*msa1, *msa2, repr1, repr2, scope);
603sequence::CDeflineGenerator defline_gen;
605 if(args[
"outfmt"]) {
611 if(args[
"outfmt"].AsString() ==
"mfasta") {
614 else if(args[
"outfmt"].AsString() ==
"clustalw") {
617 else if(args[
"outfmt"].AsString() ==
"phylip") {
620 else if(args[
"outfmt"].AsString() ==
"nexus") {
637 if(args[
"parse_deflines"]) {
640 if(
id.IsLocal()) {
643printf(
">%s",
label.c_str());
647 constvector<CSeq_id_Handle>& ids = bhandle.
GetId();
649 ITERATE(vector<CSeq_id_Handle>, it, ids) {
650 const stringid_str = it->GetSeqId()->AsFastaString();
651printf(
"%s", id_str.c_str());
652 if(it + 1 != ids.end()) {
659 stringtitle = defline_gen.GenerateDefline(bhandle);
660 if(title !=
"unnamed protein product") {
661printf(
" %s", title.c_str());
666printf(
">%s\n", defline_gen.GenerateDefline(bhandle).c_str());
669 for(
intj = 0; j <
results[
i].GetLength(); j++) {
670printf(
"%c",
results[
i].GetPrintableLetter(j));
676 if(args[
"seqalign"]) {
682 m_UsageReport.AddParam(blast::CBlastUsageReport::eExitStatus, 0);
691 int main(
intargc,
const char* argv[])
User-defined methods of the data storage class.
Data loader implementation that uses the blast databases.
Options and parameters for multiple alignment.
ETreeMethod
Method for construction of guide tree for progressive alignment.
@ eFastME
Fast Minimum Evolution.
@ eClusters
Clustering dendrogram.
void SetDomainHitlistSize(int size)
Set hitlist size (per sequence) for domain search.
void SetKmerAlphabet(TKMethods::ECompressedAlphabet alph)
Set alphabet for creating word count vectors.
void SetKmerLength(int len)
Set word size for creating word count vectors in query clustering.
void SetEndGapOpenPenalty(TScore penalty)
Set gap opening penalty for end gaps in pairwise global alignment of profiles.
void SetScoreMatrixName(const string &matrix)
Set alignment score matrix name.
void SetLocalResFreqBoost(double boost)
Set frequency boost for a letter that appears in query sequence in given position.
bool Validate(void)
Validate parameter values.
void SetVerbose(bool verbose)
Set verbose mode.
void SetMaxInClusterDist(double dist)
Set maximum allowed distance between sequences in a cluster.
void SetBlastpEvalue(double evalue)
Set e-value for accepting Blastp hits.
void SetUseQueryClusters(bool use)
Set use of query clustering option.
void SetDomainHits(CConstRef< objects::CBlast4_archive > archive)
Set pre-computed domain hits.
void SetTreeMethod(ETreeMethod method)
Set method for creating tree that guides progressive alignment.
const vector< string > & GetMessages(void)
Get warning messages.
void SetGapOpenPenalty(TScore penalty)
Set gap opening penalty for middle gaps in pairwise global alignment of profiles.
void SetRpsDb(const string &dbname)
Use RPS Blast with given database.
void SetConservedCutoffScore(double score)
Set cutoff score for conserved aligned columns.
TConstraints & SetUserConstraints(void)
Set user constraints.
vector< CPattern > & SetCddPatterns(void)
Set regular expression patterns for identification of conserved domains.
void SetInClustAlnMethod(EInClustAlnMethod method)
void SetIterate(bool use)
Set use of iterative alignment option.
@ fNoPatterns
Do not use conserved domain patterns.
@ fNoRpsBlast
Do not use RPS Blast.
void SetPseudocount(double pseudocount)
Set pseudocount for calculating column entropy.
void SetRpsEvalue(double evalue)
Set e-value threshold for accepting RPS Blast hits.
void SetDomainResFreqBoost(double boost)
Set boost for residue frequencies in conserved domains from RPS database.
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
void SetGapExtendPenalty(TScore penalty)
Set gap extension penalty for middle gaps in pairwise global alignment of profiles.
void SetEndGapExtendPenalty(TScore penalty)
Set gap extension penalty for end gaps in pairwise global alignment of profiles.
Keeps track of CMultiAligner version.
Simultaneously align multiple protein sequences.
const vector< CSequence > & GetSeqResults(void) const
Retrieve the current aligned results in CSequence format.
CRef< objects::CSeq_align > GetResults(void) const
Retrieve the current aligned results in Seq-align format.
TStatus Run(void)
Align the current set of input sequences (reset any existing alignment information).
const vector< string > & GetMessages(void) const
Get Error/Warning messages.
@ eSuccess
Alignment successfully completed.
void SetInputMSAs(const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
Set input alignments.
void SetQueries(const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
Set query sequences.
CRef< objects::CScope > GetScope(void)
Get scope.
Printer for popular multiple alignment formats.
void Print(CNcbiOstream &ostr)
Print alignment.
void SetEndGapChar(unsigned char gap)
Set end gap character.
void SetFormat(EFormat format)
Set format for printing alignment.
void SetGapChar(unsigned char gap)
Set gap character.
void SetWidth(int width)
Set text width (number of columns) for alignment output.
blast::CBlastUsageReport m_UsageReport
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
CRef< CObjectManager > m_ObjMgr
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
iterator_bool insert(const value_type &val)
Interface for CMultiAligner.
static void x_LoadConstraints(string constraintfile, vector< CMultiAlignerOptions::SConstraint > &constr)
string s_GetTreeMethodAsString(CMultiAlignerOptions::ETreeMethod method)
string s_GetKmerAlphabetAsString(CMultiAlignerOptions::TKMethods::ECompressedAlphabet alph)
int main(int argc, const char *argv[])
static void x_LoadPatterns(string patternsfile, vector< CMultiAlignerOptions::CPattern > &patterns)
void GetSeqLocFromStream(CNcbiIstream &instream, vector< CRef< objects::CSeq_loc > > &seqs, CRef< objects::CScope > &scope, objects::CFastaReader::TFlags flags)
Reads fasta sequences from stream, adds them to scope, and returns them as the list of Seq_locs.
CRef< objects::CSeq_align > GetAlignmentFromStream(CNcbiIstream &instream, CRef< objects::CScope > &scope, objects::CFastaReader::TFlags flags, objects::CSeqIdGenerator &id_generator)
Reads fasta sequences as multiple sequence alignment.
void Print(const CCompactSAMApplication::AlignInfo &ai)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
std::ofstream out("events_result.xml")
main entry point for tests
Operators to edit gaps in sequences.
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eRequires
One argument requires another.
@ eExcludes
One argument excludes another.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
@ eDiag_Warning
Warning message.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual bool Exists(void) const
Check existence of file.
#define MSerial_AsnText
I/O stream manipulators.
long TFlags
binary OR of EFlags
@ fNoParseID
Generate an ID (whole defline -> title)
@ fForceType
Force specified type regardless of accession.
@ fParseRawID
Try to identify raw accessions.
@ fValidate
Check (alphabetic) residue validity.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
@ eContent
Untagged human-readable accession or the like.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
const TId & GetId(void) const
bool Empty(void) const THROWS_NONE
Check if CRef is empty - not pointing to any object, which means having a null value.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
void Start(void)
Start the timer.
static const char label[]
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
const string version
version string
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
#define COBALT_GAP_EXTNT
Gap extension score.
#define COBALT_KMER_LEN
K-mer length for sequence clustering.
#define COBALT_END_GAP_OPEN
End gap opening score.
#define COBALT_PSEUDO_COUNT
Pseudocount constant used in multiple alignment.
#define COBALT_CONSERVED_CUTOFF
Conservation score cutoff used for selecting conserved columns in initial MSA.
#define COBALT_LOCAL_BOOST
Weight for sequence residues when creating MSA profiles.
#define COBALT_BLAST_EVALUE
Blastp e-value cutoff for creating constraints.
#define COBALT_RPS_EVALUE
Default values for cobalt parameters. Rps-Blast e-value cutoff for creating constraints.
#define COBALT_GAP_OPEN
Gap opening score.
#define COBALT_DOMAIN_HITLIST_SIZE
Hitlist size for Rps-Blast searches.
#define COBALT_TREE_METHOD
Default method for computing progressive alignment tree.
#define COBALT_END_GAP_EXTNT
End gap extension score.
#define COBALT_DOMAIN_BOOST
Weight for domain residue frequencies when creating MSA profiles.
#define COBALT_KMER_ALPH
K-mer alphabet for sequence clustering.
#define COBALT_DEFAULT_MATRIX
Default substitution matrix used in multiple alignment.
#define COBALT_MAX_CLUSTER_DIAM
Maximum cluster diameter for pre-alignment sequence clustering.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure for representing single user constraint for pair-wise alignment.
static DP_BlockInfo * blocks
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4