g_m8(
"m8"), g_AsnTxt(
"asntxt"), g_AsnBin(
"asnbin");
60 const stringkMode_Pairwise (
"pairwise");
61 const stringkMode_Multiple (
"multiple");
66 const stringkBoth(
"strict");
67 const string kQuery(
"query");
76 "HitFilter v.2.0.2");
78argdescr->AddDefaultKey(
"mode",
"mode",
79 "Specify whether the hits should be resolved in pairs " 80 "or as a single set.",
84argdescr->AddDefaultKey(
"min_idty",
"min_idty",
85 "Minimal input hit identity",
88argdescr->AddDefaultKey(
"min_len",
"min_len",
89 "Minimal input hit length",
92argdescr->AddDefaultKey(
"retain_overlap",
"retain_overlap",
93 "Min overlap to retain in kilobases (0=OFF)",
96argdescr->AddDefaultKey(
"fmt_in",
"fmt_in",
"Input format",
99argdescr->AddOptionalKey(
"file_in",
"file_in",
"Input file (stdin otherwise)",
103argdescr->AddFlag(
"sas",
"Assume seq-align-set as the top-level structure " 104 "for the input ASN hits",
true);
106argdescr->AddDefaultKey(
"merge",
"merge",
107 "Merge abutting alignments unless the merged " 108 "alignment overlap length ratio is greater " 109 "than this parameter. Any negative value will " 110 "turn merging off.",
114argdescr->AddOptionalKey(
"constraints",
"constraints",
115 "Binary ASN file with constraining alignments",
119argdescr->AddOptionalKey(
"file_out",
"file_out",
"Output file (stdout otherwise)",
123argdescr->AddOptionalKey(
"m",
"m",
124 "Text description/comment to add to the output",
127argdescr->AddDefaultKey(
"fmt_out",
"fmt_out",
"Output format",
130argdescr->AddDefaultKey(
"hits_per_chunk",
"hits_per_chunk",
131 "Input is split into chunks with the number of hits " 132 "per chunk limited by this parameter.",
136argdescr->AddDefaultKey(
"coord_margin",
"coord_margin",
137 "Larger values of this argument will result in less " 138 "RAM used but longer running times.",
142argdescr->AddOptionalKey(
"ids",
"ids",
"Table to rename sequence IDs.",
145argdescr->AddDefaultKey(
"ut",
"uniqueness_type",
146 "uniqueness type (strict, query, or subject)",
150unique_type->
Allow(
"strict")->
Allow(
"query")->
Allow(
"subject");
151argdescr->SetConstraint(
"ut", unique_type);
153argdescr->AddFlag(
"keep_strands",
154 "Keep plus-plus strands" 157argdescr->AddFlag(
"no_output_constraint",
158 "Do not output constraints" 162argdescr->SetConstraint(
"fmt_in", constrain_format);
163argdescr->SetConstraint(
"fmt_out", constrain_format);
166argdescr->SetConstraint(
"min_len", constrain_minlen);
169argdescr->SetConstraint(
"min_idty", constrain_minidty);
172argdescr->SetConstraint(
"merge", constrain_merge);
175constrain_mode->
Allow(kMode_Pairwise)->
Allow(kMode_Multiple);
176argdescr->SetConstraint(
"mode", constrain_mode);
187 stringctgid, accver;
189 if(ctgid.size() == 0) {
193 if(accver.size() == 0) {
196 m_IDs[ctgid] = accver;
201build_ids.
m_id[0] = build_ids.
m_id[1] = ctgid;
206bi.
m_id[1] = ctgid;
216 const stringfmt_in = args[
"fmt_in"].AsString();
217 const stringfmt_out = args[
"fmt_out"].AsString();
218 const THit::TCoordmin_len = args[
"min_len"].AsInteger();
219 const doublemin_idty = args[
"min_idty"].AsDouble();
221 CNcbiIstream& istr = args[
"file_in"]? args[
"file_in"].AsInputFile(): cin;
225 if(fmt_in == g_m8) {
227 static stringfirstline;
230 if(one_pair && firstline.size()) {
233 if(hit->GetIdentity() >= min_idty && hit->GetLength() >= min_len) {
234phitrefs->push_back(hit);
235id_query = hit->GetQueryId();
236id_subj = hit->GetSubjId();
253 if(id_query.
IsNull()) {
254id_query = hit->GetQueryId();
255id_subj = hit->GetSubjId();
257 else if(
false== id_query -> Match(*(hit->GetQueryId()))
258||
false== id_subj -> Match(*(hit->GetSubjId())) )
260 if(phitrefs->size()) {
265id_query = hit->GetQueryId();
266id_subj = hit->GetSubjId();
271 if(hit->GetIdentity() >= min_idty && hit->GetLength() >= min_len) {
272phitrefs->push_back(hit);
279 const boolparse_aln = fmt_out != g_m8;
283unique_ptr<CObjectIStream>
in(in_ptr);
285 const boolassume_sas (args[
"sas"]);
287 while(!
in->EndOfData()) {
297phitrefs, parse_aln, min_len, min_idty);
310 if(one_pair && phitrefs->size()) {
315 staticTStringSet idtags;
317 const stringstrid_query (phitrefs->front()->GetId(0)->GetSeqIdString(
true));
318 const stringstrid_subj (phitrefs->front()->GetId(1)->GetSeqIdString(
true));
319 const string tag(strid_subj +
"$_#_&"+ strid_query);
320 if(idtags.end() != idtags.find(
tag)) {
322 "In pairwise mode input hits must be collated " 323 "by query and subject.");
326idtags.insert(
tag);
336 const double& min_idty)
const 342 if(
r.GetTo() -
r.GetFrom() >= min_len) {
345 if(hit->GetIdentity() >= min_idty) {
346 if(hit->GetQueryStrand() ==
false) {
349phitrefs->push_back(hit);
358 const stringfmt = args[
"fmt_out"].AsString();
360 CNcbiOstream& ostr = args[
"file_out"]? args[
"file_out"].AsOutputFile(): cout;
362 stringcomment (args[
"m"]? args[
"m"].AsString():
"");
366 if(comment.size() > 0) {
367ostr <<
"# "<< comment << endl;
375<<
"\tNumGapOpenings" 385 const THit& hit = **ii;
394 const boolfmt_txt (fmt == g_AsnTxt);
396 const THit& h = **ii;
398 boolno_output_constraint = args[
"no_output_constraint"].HasValue();
399 if(no_output_constraint && h.
GetScore() >= kBigDbl) {
414 boolis_gap =
false;
416 for(
int i= 0;
i< ds->
GetDim();
i++) {
431 boolkeep_strands = args[
"keep_strands"].HasValue();
436vector< CRef< CSeq_id > > &ids = ds->
SetIds();
437 for(
Uint1where = 0; where < 2; ++where) {
440 id->Assign(*h.
GetId(where));
449score->SetId().SetStr(
"reciprocity");
451 if(h.
GetScore() > kBigDbl || args[
"ut"].AsString() == kBoth)
456}
else if(args[
"ut"].AsString() ==
kQuery) {
463cerr <<
"Error adding reciprocity"<< endl;
466seq_align->
SetScore().push_back(score);
469seq_align->
SetSegs().SetDenseg(*ds);
471align_list.push_back(seq_align);
474 if(comment.size() > 0) {
487cerr <<
"Error writing output file"<< endl;
496 returnlhs->GetScore() > rhs->GetScore();
504 const boolmode_multiple ( args[
"mode"].AsString() == kMode_Multiple );
505 const stringfmt_in ( args[
"fmt_in"].AsString() );
506 const stringfmt_out ( args[
"fmt_out"].AsString() );
507 const doublemaxlenfr (args[
"merge"].AsDouble());
509 if((fmt_out == g_AsnTxt || fmt_out == g_AsnBin) &&
510(fmt_in != g_AsnTxt && fmt_in != g_AsnBin))
514 "For ASN output, input must also be in ASN");
517 if( mode_multiple ==
false&& (args[
"ids"] || args[
"constraints"]
518|| fmt_in == g_AsnTxt || fmt_in == g_AsnBin ))
522 "Invalid parameter combination - " 523 "some options are not yet supported in pairwise mode.");
553 const THit::TCoordmin_len (args[
"min_len"].AsInteger());
554 const doublemin_idty (args[
"min_idty"].AsDouble());
555 const size_tmargin (args[
"coord_margin"].AsInteger());
556 const THit::TCoordretain_overlap (1024 * args[
"retain_overlap"].AsInteger());
559 if(args[
"ut"].AsString() ==
"query") {
561}
else if(args[
"ut"].AsString() ==
"subject") {
571hits.begin(), hits.end(),
580 copy(hits.begin(), hits.end(), back_inserter(
all));
581 copy(hits_new.begin(), hits_new.end(), back_inserter(
all));
586cerr <<
"Error running x_DoPairwise"<< endl;
598 const stringfmt_in = args[
"fmt_in"].AsString();
599 const stringfmt_out = args[
"fmt_out"].AsString();
600 const THit::TCoordmin_len = args[
"min_len"].AsInteger();
601 const doublemin_idty = args[
"min_idty"].AsDouble();
602 const THit::TCoordretain_overlap = 1024 * args[
"retain_overlap"].AsInteger();
603 const size_tmargin (args[
"coord_margin"].AsInteger());
606 if(args[
"ut"].AsString() ==
"query") {
608}
else if(args[
"ut"].AsString() ==
"subject") {
617 if(args[
"constraints"]) {
622 copy(restraint.begin(), restraint.end(), back_inserter(
all));
626 const size_t M= args[
"hits_per_chunk"].AsInteger();
627 const size_tdim =
all.size();
628 size_tm =
min(dim,
M);
630 constTHitRefs::iterator ii_beg =
all.begin(), ii_end =
all.end();
631THitRefs::iterator ii_hi = ii_beg, ii = ii_beg;
634 while(ii < ii_end) {
636THitRefs::iterator ii_dst = ii + m;
637 if(ii_dst > ii_end) {
642 copy(ii, ii_dst, ii_hi);
643ii_hi += ii_dst - ii;
658THitRefs::iterator ii_hi0 = ii_hi;
660THitRefs::iterator jj = hits_new.begin(), jje = hits_new.end();
661 for(;jj != jje && ii_hi != ii_hi0; *ii_hi++ = *jj++);
663 LOG_POST(
"Warning: space from eliminated alignments " 664 "not enough for all splits.");
669cerr <<
"Error in x_DoMultiple"<< endl;
672 all.erase(ii_hi, ii_end);
696 for(
Uint1where = 0; where < 2; ++where) {
716 id->Assign(*(hit->GetId(where)));
719 const stringctgid =
string(
"lcl|") + im->second.m_id[where];
720 id.Reset(
new CSeq_id(ctgid));
722hit->SetId(where,
id);
725 if(hit->GetQueryStrand() ==
false) {
729hit->SetScore(kBigDbl);
733 if(hit->GetLength() > maxlen) {
734maxlen = hit->GetLength();
737 floatscore_factor = 0.25 / maxlen;
741h->SetScore(h->GetScore() * (1 + score_factor * h->GetLength()));
743 if(args[
"no_output_constraint"].
HasValue()) {
761 int main(
intargc,
const char* argv[])
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
bool GetQueryStrand(void) const
TCoord GetQueryStart(void) const
static string s_RunLengthDecode(const string &in)
TCoord GetSubjStart(void) const
bool GetSubjStrand(void) const
const TId & GetId(Uint1 where) const
const TTranscript & GetTranscript(void) const
list< CRef< objects::CSeq_align > > TSeqAlignList
void x_LoadIDs(CNcbiIstream &istr)
void x_LoadConstraints(CNcbiIstream &istr, THitRefs &all)
void x_DoPairwise(THitRefs *pall)
void x_ReadInputHits(THitRefs *phitrefs, bool one_pair=false)
virtual void Exit()
Cleanup on application exit.
vector< THitRef > THitRefs
void x_IterateSeqAlignList(const TSeqAlignList &sa_list, THitRefs *phitrefs, bool parse_aln, const THit::TCoord &min_len, const double &min_idty) const
void x_DumpOutput(const THitRefs &hitrefs)
void x_DoMultiple(THitRefs *pall)
virtual int Run()
Run the application.
virtual void Init()
Initialize the application.
float GetScore(void) const
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
void AddComment(const string &comment)
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator end() const
const_iterator find(const key_type &key) const
static void s_RunGreedy(typename THitRefs::iterator hri_beg, typename THitRefs::iterator hri_end, THitRefs *phits_new, TCoord min_hit_len=100, double min_hit_idty=.9, TCoord margin=1, TCoord retain_overlap=0, EUnique_type unique_type=e_Strict)
EUnique_type
Multiple-sequences greedy alignment uniquification algorithm.
static void s_MergeAbutting(typename THitRefs::iterator hri_beg, typename THitRefs::iterator hri_end, const double &maxlenfr, THitRefs *pout)
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
@ fBinary
Open as binary file; for eInputFile, eOutputFile, eIOFile.
@ eInputFile
Name of file (must exist and be readable)
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
#define MSerial_AsnBinary
#define MSerial_AsnText
I/O stream manipulators --.
@ eSerial_AsnText
ASN.1 text.
@ eSerial_AsnBinary
ASN.1 binary.
const string AsFastaString(void) const
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
TGi GetGiForAccession(const string &acc, CScope &scope, EGetIdType flags=0)
Given an accession string retrieve the GI id.
string GetAccessionForGi(TGi gi, CScope &scope, EAccessionVersion use_version=eWithAccessionVersion, EGetIdType flags=0)
Retrieve the accession for a given GI.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
bool IsNull(void) const THROWS_NONE
Check if pointer is null -- same effect as Empty().
uint8_t Uint1
1-byte (8-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
TScore & SetScore(void)
Assign a value to Score data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
void ResetStrands(void)
Reset Strands data member.
void SetType(TType value)
Assign a value to Type data member.
TDim GetDim(void) const
Get the Dim member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
TIds & SetIds(void)
Assign a value to Ids data member.
const TDisc & GetDisc(void) const
Get the variant data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
@ eType_partial
mapping pieces together
ENa_strand
strand of nucleic acid
TGi GetGi(void) const
Get the variant data.
bool IsGi(void) const
Check if variant Gi is selected.
void SetData(TData &value)
Assign a value to Data data member.
list< CRef< CSeq_align > > TAlign
const TAlign & GetAlign(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool s_PHitRefScore(const CAppHitFilter::THitRef &lhs, const CAppHitFilter::THitRef &rhs)
int main(int argc, const char *argv[])
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
constexpr auto sort(_Init &&init)
#define GetProgramName
Avoid name clash with the NCBI C Toolkit.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
CRef< objects::CObjectManager > om
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4