(
void);
76 virtual int Run(
void);
77 virtual void Exit(
void);
114m_comp2len, m_comp2range_coll)
130version_str+=
", AGP Specification v2.1";
132 str=
"Validate data in the AGP format:\n" 133 "https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/\n" 135 "Version: "+ version_str +
"\n" 137 "USAGE: agp_validate [-options] [FASTA files...] [AGP files...]\n" 139 "There are 3 validations modes:\n" 140 "no mode option: (default mode) report component, gap, scaffold and object statistics, perform checks\n" 141 " that do not require component sequences to be available in GenBank (see: -list).\n" 142 "-alt, -species: Check component Accessions, Lengths and Taxonomy ID using GenBank data;\n" 143 " -species allows components from different subspecies during Taxid checks.\n" 145 "-comp Check that the supplied object sequences (in FASTA files) match what can be\n" 146 " constructed from the AGP and the component sequences (in FASTA files or in GenBank).\n" 147 " Run \"agp_validate -comp\" to see the options for this mode.\n" 149 "OPTIONS (default and -alt modes):\n" 150 " -g Check that component names look like Nucleotide accessions\n" 151 " (this does not require components to be in GenBank).\n" 152 " -out FILE Save the AGP file, adding missing version 1 to the component accessions (need -alt),\n" 153 " or adding gaps where runs of Ns longer than 10 bp are found in components (need FASTA files).\n" 154 " -obj Use FASTA files to read names and lengths of objects (the default is components).\n" 155 " -v VER AGP version (1 or 2). The default is to choose automatically. 
Version 2 is chosen\n" 156 " when the linkage evidence (column 9) is not empty in the first gap line encountered.\n" 157 " -xml Report results in XML format.\n" 158 " -sub Treat serious warnings as errors, put summary and stats at the top.\n" 160 " Extra checks specific to an object type:\n" 161 " -un Unplaced/unlocalized scaffolds:\n" 162 " any single-component scaffold must use the whole component in orientation '+'\n" 163 " -scaf Scaffold from component AGP: no scaffold-breaking gaps allowed\n" 164 " -chr Chromosome from scaffold AGP: ONLY scaffold-breaking gaps allowed\n" 165 " Use both of the last 2 options in this order: -scaf Scaf_AGP_file(s) -chr Chr_AGP_file(s)\n" 166 " to check that all scaffolds in Scaf_AGP_file(s) are wholly included in Chr_AGP_file(s)\n" 169 " -list List error and warning messages.\n" 170 " -limit COUNT Print only the first COUNT messages of each type.\n" 171 " Default=100. To print all, use: -limit 0\n" 172 " -skip, -only WHAT Skip, or report only a particular error or warning.\n" 173 " -show WHAT Show the warning hidden by default (w40, w45, w46, w52).\n" 174 " 'WHAT' could be a part of the message text, an error code (e11, w22, etc; see -list),\n" 175 " or a keyword: all, warn, err, alt.\n" 177 "If component FASTA files are given in front of AGP files, also check that:\n" 178 "- component_id from AGP is present in FASTA;\n" 179 "- component_end does not exceed sequence length.\n" 180 "If FASTA files for objects are given (after -obj), check that:\n" 181 "- object_id from AGP is present in FASTA;\n" 182 "- object lengths in FASTA and in AGP match.\n" 194 autoarg_desc = make_unique<CArgDesc_agp_validate>(
GetVersion());
196arg_desc->SetUsageContext(
198 "Validate AGP data",
false);
201arg_desc->AddFlag(
"alt",
"");
203arg_desc->AddFlag(
"g",
"");
204arg_desc->AddFlag(
"obj",
"");
205arg_desc->AddFlag(
"un",
"");
206arg_desc->AddFlag(
"scaf",
"");
207arg_desc->AddFlag(
"chr",
"");
208arg_desc->AddFlag(
"comp",
"");
209arg_desc->AddFlag(
"xml",
"");
210arg_desc->AddFlag(
"sub",
"");
213arg_desc->AddOptionalKey(
"loadlog",
"FILE",
214 "specifies where we write our loading log for -comp",
216arg_desc->AddFlag(
"ignoreagponly",
"");
217arg_desc->AddFlag(
"ignoreobjfileonly",
"");
218arg_desc->AddDefaultKey(
"diffstofind",
"",
"",
221arg_desc->AddFlag(
"species",
"allow components from different subspecies");
223arg_desc->AddOptionalKey(
"out",
"FILE",
224 "add missing version 1 to component accessions",
227arg_desc->AddOptionalKey(
"v",
"ver",
231arg_desc->AddOptionalKey(
"skip",
"error_or_warning",
232 "Message or message code to skip",
236arg_desc->AddOptionalKey(
"only",
"error_or_warning",
237 "Message or message code to print (hide other)",
241arg_desc->AddOptionalKey(
"show",
"error_or_warning",
242 "Message or message code to print (if not printed by default)",
246arg_desc->AddDefaultKey(
"limit",
"ErrorCount",
247 "Print at most ErrorCount lines with a particular error",
251arg_desc->AddFlag(
"list",
"all possible errors and warnings");
254arg_desc->AddExtra(0, 10000,
"files to be processed",
272 if( args[
"list"].
HasValue() ) {
273 pAgpErr->PrintAllMessages(cout);
288 pAgpErr->m_out = error_details_out;
301cerr <<
"Error -- cannot specify -un with -chr/-scaf.\n";
305cerr <<
"Error -- cannot specify -chr/-scaf with -alt/-species.\n";
310 if( args[
"scaf"].
HasValue() ) {
311cerr <<
"Error -- -scaf and -chr must precede different files.\n";
317 else if( args[
"scaf"].
HasValue() ) {
323cerr <<
"Error -- cannot specify -obj with -alt/-species.\n";
330 boolcheckCompNames=args[
"g"].HasValue();
332 if(checkCompNames) {
343 if( args[
"species"].
HasValue() ) {
352 boolonlyNotSkip = args[
"only"].HasValue();
354 if( args[
"skip"].
HasValue() ) {
356cerr <<
"Error -- cannot specify both -only and -skip.\n";
359err_warn = &( args[
"skip"].GetStringList() );
360action=
"Skipping messages:\n";
362 else if(onlyNotSkip) {
363 if( args[
"show"].
HasValue() ) {
364cerr <<
"Error -- cannot specify both -only and -show; please use multiple -only instead.\n";
368err_warn = &( args[
"only"].GetStringList() );
370action=
"Allowed messages:\n";
374 boolneedHeading=
true;
375 for( CArgValue::TStringArray::const_iterator it =
376err_warn->begin(); it != err_warn->end(); ++it
378 stringres =
pAgpErr->SkipMsg(*it, onlyNotSkip);
380cerr <<
"WARNING: no matches for "<< *it <<
"\n";
384 if( res[0] ==
' '&& needHeading) {
385 if(needHeading) cerr << action;
390cerr << res <<
"\n";
397 if( args[
"show"].
HasValue() ) {
398err_warn = &( args[
"show"].GetStringList() );
399 for( CArgValue::TStringArray::const_iterator it =
400err_warn->begin(); it != err_warn->end(); ++it
407args[
"limit"].HasValue() ? args[
"limit"].AsInteger() : 100;
410 if( args[
"v"].AsString()[0]==
'1') {
413 else if( args[
"v"].AsString()[0]==
'2') {
417cerr <<
"Error -- invalid AGP version after -v (must start with 1 or 2).\n";
425 if( ! args[
"comp"] ) {
428 if( args[
"loadlog"] || args[
"ignoreagponly"] ||
429args[
"ignoreobjfileonly"] ||
430args[
"diffstofind"].AsInteger() > 0 )
432cerr <<
"Error -- -comp mode options without -comp"<< endl;
437 booltaxid_check_failed=
false;
439cout <<
"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n<page>\n";
451cout <<
"</page>\n";
454 else if(error_details_out) {
455cout <<
"\n\n===== Details ====="<< endl;
457 deleteerror_details_out;
466 for(
unsigned int i= 1;
i<= args.
GetNExtra();
i++) {
468 if( ! filename.empty() && filename[0] !=
'-') {
474 if( args[
"loadlog"] ) {
475comploadlog = args[
"loadlog"].AsString();
478 stringagp_as_fasta_file;
479 if( args[
"out"] ) {
480agp_as_fasta_file = args[
"out"].AsString();
484 if( args[
"ignoreagponly"] ) {
487 if( args[
"ignoreobjfileonly"] ) {
491 intdiffsToFind = args[
"diffstofind"].AsInteger();
496agp_as_fasta_file, diffsToHide,
499cerr <<
"AGP/FASTA comparison failed."<< endl;
512<< s <<
" and length"<< s <<
" loaded from FASTA."<< endl;
516runs_of_Ns += it->second.size();
521 if(!
m_use_xml) cout <<
"No runs of Ns longer than 10 bp found in FASTA sequences."<< endl;
531cout <<
"===== Reading Chromosome from scaffold AGP ====="<< endl;
533 if(
out) *
out<<
"===== Chromosome from scaffold AGP ====="<< endl;
540cout <<
"===== Reading Scaffold from component AGP ====="<< endl;
541 if(
out) *
out<<
"===== Scaffold from component AGP ====="<< endl;
556 for(
unsigned int i= 1;
i<= args.
GetNExtra();
i++) {
561cerr <<
"Error -- second -chr is not supported.\n";
565cerr <<
"Error -- -chr after a file, but no preceding -scaf. Expecting:\n" 566<<
" -scaf Scaffold_AGP_file(s) -chr Chromosome_AGP_file(s)\n";
575cout <<
"\n===== Reading Chromosome from scaffold AGP ====="<< endl;
576 if(
out) *
out<<
"\n===== Chromosome from scaffold AGP ====="<< endl;
590istr.get(ch); istr.putback(ch);
604 if(num_fasta_files==args.
GetNExtra()) {
632 if(
code==-1)
continue;
634 boolcomp2len_check_failed=
false;
637 if( !agp_row->
IsGap() ) {
664 if(
code!=0 || comp2len_check_failed ||
673 pAgpErr->m_messages = tmp_messages;
691 stringacc, acc_long;
696 intheader_line_num=0;
703 boolmfa_bMasked=
false;
704 boolmfa_prevMasked=
false;
710 if(line[0]==
'>') {
719 if(prev_len)
gotoLengthRedefinedFa;
722 if(mfa_pos-mfa_firstMasked > 10)
723range_coll +=
TSeqRange(mfa_firstMasked, mfa_pos-1);
725 if(!range_coll.
empty()) {
729range_coll.
clear();
730mfa_firstMasked=mfa_pos=0;
732mfa_prevMasked=
false;
738 if(pos2<pos1) pos1 = pos2;
741 if(pos1>0 && line[pos1]==
'|') pos1--;
744acc_long=line.substr(1, pos1);
752cerr<<
"ERROR - expecting >fasta_header at start of file "<< filename <<
", got:\n" 753<< line.substr(0, 100) <<
"\n\n";
759cerr<<
"ERROR - non-alphabetic character in the FASTA:\n" 760 " file "<< filename <<
"\n line "<<
line_num<<
"\n column "<<
i+1 <<
"\n\n";
765mfa_bMasked =
toupper(line[
i]) ==
'N';
766 if(mfa_bMasked!=mfa_prevMasked) {
768mfa_firstMasked=mfa_pos;
771 if(mfa_pos-mfa_firstMasked > 10)
772range_coll +=
TSeqRange(mfa_firstMasked, mfa_pos-1);
775mfa_prevMasked=mfa_bMasked;
798 if(prev_len)
gotoLengthRedefinedFa;
801 if(mfa_pos-mfa_firstMasked > 10)
802range_coll +=
TSeqRange(mfa_firstMasked, mfa_pos-1);
804 if(!range_coll.
empty()) {
809cerr<<
"WARNING - empty file "<< filename <<
"\n";
814cerr<<
"ERROR - sequence length redefined from "<< prev_len <<
" to "<<
len<<
"\n" 815<<
" sequence id: "<< acc_long <<
"\n" 816<<
" File: "<< filename <<
"\n" 817<<
" Lines: "<< header_line_num <<
".."<<
line_num<<
"\n\n";
831 if(runs_of_Ns && runs_of_Ns->
size()) {
834cerr <<
"FATAL: need AGP version (for adding gap lines). Please use -v 1 or -v 2\n";
848 "\t1\t100\t1\tN\t100\t"+
849 string(
row->GetVersion() ==
eAgpVersion_1_1?
"fragment\tyes\t":
"scaffold\tyes\tunspecified")
852 intcomp2obj_ofs =
row->object_beg -
row->component_beg;
861(*m_out) << tmp_row->
ToString() << endl;
866tmp_gap_row->
object_beg= comp2obj_ofs + it->GetFrom();
867tmp_gap_row->
object_end= comp2obj_ofs + it->GetTo();
868tmp_gap_row->
gap_length= it->GetTo() - it->GetFrom() + 1;
871(*m_out) << tmp_gap_row->
ToString(
true) << endl;
887(*m_out) << tmp_row->
ToString() << endl;
892(*m_out) << s << endl;
899 int main(
intargc,
const char* argv[])
901 if(argc==1+1 &&
string(
"-comp")==argv[1]) {
902cout <<
"agp_validate -comp (formerly agp_fasta_compare):\n" 904 "check that the object sequences FASTA matches the AGP.\n" 907 "USAGE: agp_validate -comp [-options] FASTA file(s)... AGP file(s)...\n" 909 " -loadlog OUTPUT_FILE Save the list of all loaded sequences.\n" 910 " -ignoreagponly Do not report objects present in AGP file(s) only.\n" 911 " -ignoreobjfileonly Do not report objects present in FASTA file(s) only.\n" 912 " -diffstofind NUM (EXPERIMENTAL) If specified, list the first NUM lines of each difference.\n" 913 " -out OUTPUT_FILE Save the assembled AGP sequences as FASTA.\n" 915 "FASTA files for components can be provided (along with object FASTA files) if components are not yet in GenBank.\n"void OverrideLenIfAccession(const string &acc, int &in_out_len)
string ExtractAccession(const string &long_acc)
@ eAgpVersion_auto
auto-detect using the first gap line
@ eAgpVersion_1_1
AGP spec 1.1.
@ eAgpVersion_2_0
AGP spec 2.0 or later.
CRef< CAgpErrEx > pAgpErr
int main(int argc, const char *argv[])
virtual ~CAgpCompSpanSplitter()
CAgpCompSpanSplitter(CNcbiOstream *out=NULL)
virtual void SaveRow(const string &s, CRef< CAgpRow > row, TRangeColl *runs_of_Ns)
Correctly print multiple errors and warnings on consecutive lines; suppress undesired or highly repet...
EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)
@ fDiffsToHide_ObjfileOnly
virtual void SetVersion(EAgpVersion ver)
Change what AGP version to use for the next input that's read.
virtual int ReadStream(CNcbiIstream &is, EFinalize eFinalize=eFinalize_Yes)
Read an AGP file from the given input stream.
string & GetComponentId()
static bool CheckComponentEnd(const string &comp_id, TAgpPos comp_end, TAgpLen comp_len, CAgpErr &agp_err)
static CRef< CAgpRow > New(CAgpErr *arg, EAgpVersion agp_version=eAgpVersion_auto, CAgpReader *reader=nullptr)
string ToString(bool reorder_linkage_evidences=false)
static bool IsGap(char c)
int FromString(const string &line)
CAgpValidateReader m_reader
void x_LoadLenFa(CNcbiIstream &istr, const string &filename)
enum CAgpValidateApplication::EValidationType m_ValidationType
EAgpVersion m_agp_version
unique_ptr< CAltValidator > m_AltValidator
TMapStrRangeColl m_comp2range_coll
CAgpValidateApplication()
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
void x_ReportFastaSeqCount()
void x_ValidateUsingFiles(const CArgs &args, CNcbiOstream *out=NULL)
void x_ValidateFile(CNcbiIstream &istr)
virtual void Exit(void)
Cleanup on application exit.
void PrintTotals(CNcbiOstream &out=cout, bool use_xml=false)
void Reset(bool for_chr_from_scaf=false)
void SetRowOutput(IAgpRowOutput *row_output)
CVersionInfo m_VersionInfo
string & PrintUsage(string &str, bool) const
Print usage message to end of specified string.
CArgDesc_agp_validate(CVersionInfo &&versionInfo)
TAgpLen AddCompLen(const string &acc, TAgpLen len, bool increment_count=true)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
TRangeVector::const_iterator const_iterator
const_iterator end() const
const_iterator begin() const
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
const_iterator find(const key_type &key) const
std::ofstream out("events_result.xml")
main entry point for tests
static unsigned int line_num
static const char * str(char *buf, int n)
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
unsigned int TSeqPos
Type for sequence locations and lengths.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
void SetVersion(const CVersionInfo &version)
Set the version number for the program.
vector< string > TStringArray
Some values types can contain several value lists.
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
@ fAllowMultiple
Repeated key arguments are legal (use with AddKey)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
CNcbistrstream_Base< IO_PREFIX::ostrstream, IOS_BASE::out > CNcbiOstrstream
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
NCBI_NS_STD::string::size_type SIZE_TYPE
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
void CONNECT_Init(const IRWRegistry *reg=0, CRWLock *lock=0, TConnectInitFlags flag=eConnectInit_OwnNothing, FSSLSetup ssl=0)
Init [X]CONNECT library with the specified "reg" and "lock" (ownership for either or both can be deta...
virtual string Print(void) const
Print version information.
#define NCBI_SC_VERSION_PROXY
#define NCBI_TEAMCITY_BUILD_NUMBER_PROXY
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
#define row(bind, expected)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4