it->RemoveShortHolesAndRescore(*
m_gnomon);
79test_align.push_back(chain);
82cerr <<
"Testing alignment "<< chain.
ID() <<
" in fragment "<<
l<<
' '<<
r<< endl;
89 boolleftwall,
boolrightwall,
boolleftanchor,
boolrightanchor,
96 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
97 if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
98suspect_aligns.push_back(*it);
103 boolfound_bad_cluster =
false;
104 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
105 if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
112found_bad_cluster =
true;
113cerr <<
"Deleting alignment "<< it->ID() << endl;
115it->AddComment(
"Bad score prediction alone");
116bad_aligns.push_back(*it);
118it = aligns.erase(it);
121suspect_aligns.push_back(*it++);
125 if(found_bad_cluster) {
126cerr <<
"Testing w/o bad alignments in fragment "<< left <<
' '<< right << endl;
134 boolleftwall,
boolrightwall,
boolleftanchor,
boolrightanchor)
137 for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
143it = suspect_aligns.erase(it);
145cerr <<
"Testing w/o "<< algn.
ID();
148cerr <<
"- Good. Deleting alignment "<< algn.
ID() << endl;
150algn.
AddComment(
"Good score prediction without");
151bad_aligns.push_back(algn);
154cerr <<
" - Still bad."<< endl;
156suspect_aligns.insert(it,algn);
162 boolleftwall,
boolrightwall,
boolleftanchor,
boolrightanchor)
165 for(TGeneModelList::iterator it = suspect_aligns.begin(); score ==
BadScore() && it != suspect_aligns.end(); ) {
170cerr <<
"Deleting alignment "<< it->ID() << endl;
172it->AddComment(
"Bad score prediction in combination");
173bad_aligns.push_back(*it);
174it = suspect_aligns.erase(it);
176cerr <<
"Testing fragment "<< left <<
' '<< right << endl;
183 boolleftmostwall,
boolrightmostwall,
boolleftmostanchor,
boolrightmostanchor,
TGeneModelList& bad_aligns)
189 boolleftwall = leftmostwall;
190 boolleftanchor = leftmostanchor;
194 boolrightwall =
false;
195 boolrightanchor =
false;
197 Int8prev_bad_right = rlimit+1;
198 booldo_it_again =
false;
207 TIVecbusy_spots(rlimit+1,0);
209 int a=
max(0,it_c->Limits().GetFrom()-
margin);
210 int b=
min(rlimit,it_c->Limits().GetTo()+
margin);
211 for(
int i=
a;
i<=
b; ++
i)
216 for( ; right < rlimit && busy_spots[right] != 0; ++right);
218 if(right + (right-left)/2 >= rlimit) {
220rightwall = rightmostwall;
221rightanchor = rightmostanchor;
224rightanchor =
false;
232 if(right < prev_bad_right) {
233suspect_aligns.clear();
237cerr << left <<
' '<< right <<
' '<<
m_gnomon->GetGCcontent() << endl;
242cerr <<
"Inconsistent alignments in fragment "<< left <<
' '<< right <<
'\n';
245leftwall, rightwall, leftanchor, rightanchor,
251prev_bad_right = right;
252right = (left+right)/2;
260leftwall, rightwall, leftanchor, rightanchor);
263leftwall, rightwall, leftanchor, rightanchor);
265cerr <<
"!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! "<< endl;
268models.push_back(*it);
272prev_bad_right = rlimit+1;
274list<CGeneModel> genes =
m_gnomon->GetGenes();
278 if(right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
279partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() :
static_cast<TSignedSeqPos>(left);
280 _ASSERT( partial_start < right );
284do_it_again =
false;
286 if(!genes.empty()) {
287left = genes.back().ReadingFrame().GetTo()+1;
289}
else if(partial_start < left+1000) {
291}
else if(partial_start < right) {
292 intnew_left = partial_start-100;
293 for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
294 if(new_left > left+1000) {
301left = (left+right)/2+1;
305models.splice(models.end(), genes);
315}
while(left <= rlimit);
333 returnmodel_lim_for_nested;
335pair<TSignedSeqRange, bool>
GetGeneWallLimits(
constlist<TGeneModelList::iterator>& models,
boolexternal =
false)
337 boolcoding_gene =
false;
338 for(
autoim : models) {
339 if(im->ReadingFrame().NotEmpty()) {
346 for(
autoim : models) {
347 if(coding_gene && im->ReadingFrame().Empty())
352 returnmake_pair(gene_lim, coding_gene);
360 return(
a.GetFrom() !=
b.GetFrom() ?
361 a.GetFrom() <
b.GetFrom() :
362 a.GetTo() >
b.GetTo()
368 for(TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
369TGeneModelList::iterator ir = loop_it;
372 if(ir->Strand() != strand)
380aligns.push_back(wall_model);
381}
else if(ir->GoodEnoughToBeAnnotation()) {
386aligns.push_back(wall_model);
387}
else if(ir->RankInGene() == 1) {
389aligns.splice(aligns.end(), models, ir);
400 typedeflist<TGeneModelList::iterator> TIterList;
402 typedefTGIDIterlist::iterator TGIter;
403 structgeneid_order {
404 booloperator()(TGIter
a, TGIter
b)
const{
return a->second.front()->GeneID() <
b->second.front()->GeneID(); }
406 typedeftuple<TSignedSeqRange, bool, TGIter> TGenomeRange;
407 structgrange_order {
408 booloperator()(
constTGenomeRange&
a, TGenomeRange&
b)
const{
409 if(get<0>(
a) != get<0>(
b))
410 returnget<0>(
a) < get<0>(
b);
411 else if(get<1>(
a) != get<1>(
b))
412 returnget<1>(
a) < get<1>(
b);
414 returngeneid_order()(get<2>(
a), get<2>(
b));
417 structinterval_order {
420 structGenomeRangeMap :
public map<TSignedSeqRange, list<TGenomeRange>, interval_order> {
421 voidInsert(
constTGenomeRange& intron) {
422list<TGenomeRange> clust(1, intron);
425 for(
autoit = lower_bound(intron_left); it != end() && it->first.IntersectingWith(range); ) {
427clust.splice(clust.end(), it->second);
430emplace(range, clust);
433 for(
auto& range_intronlist : *
this) {
434 auto& lst = range_intronlist.second;
435lst.sort(grange_order());
445genes[im->GeneID()].push_back(im);
448GenomeRangeMap introns;
449 for(
autoig = genes.begin(); ig != genes.end(); ++ig) {
452 boolcoding = rslt.second;
453 for(
autoim : ig->second) {
457 for(
int i= 1;
i< (
int)m.
Exons().size(); ++
i) {
458 if(m.
Exons()[
i-1].m_ssplice_sig ==
"XX"|| m.
Exons()[
i].m_fsplice_sig ==
"XX")
461 if(
Include(lim_for_nested, range)) {
462 boolis_hole = !m.
Exons()[
i-1].m_ssplice || !m.
Exons()[
i].m_fsplice;
463TGenomeRange intron(range, is_hole, ig);
464introns.Insert(intron);
471list<TGIter> genes_hosting_partial;
472list<TGIter> nested_partial;
473GenomeRangeMap finished_intervals;
474 if(!introns.empty()) {
475list<TGIter> genes_to_remove;
476 for(
autoig = genes.begin(); ig != genes.end(); ++ig) {
477TIterList& modelsi = ig->second;
478 autogfront = modelsi.front();
481 if(iclust != introns.end() &&
Include(iclust->first, lim_for_nested)) {
482 for(TGenomeRange& intron : iclust->second) {
484 if(
Include(range, lim_for_nested)) {
485 boolis_hole = get<1>(intron);
486 autohost_it = get<2>(intron);
487 if(is_hole && !gfront->GoodEnoughToBeAnnotation()) {
488 if(host_it->second.front()->Score() > gfront->Score())
489genes_to_remove.push_back(ig);
491genes_to_remove.push_back(host_it);
493 if(gfront->GoodEnoughToBeAnnotation()) {
494 for(
autoim : modelsi)
497genes_hosting_partial.push_back(host_it);
498nested_partial.push_back(ig);
505 if(gfront->GoodEnoughToBeAnnotation()) {
507 for( ; !found && iclust != introns.end() && iclust->first.IntersectingWith(lim_for_nested); ++iclust) {
508 for(TGenomeRange& intron : iclust->second) {
509 if(get<2>(intron) == ig)
514TGenomeRange finished_interval(lim_for_nested,
false, ig);
515finished_intervals.Insert(finished_interval);
522genes_to_remove.sort(geneid_order());
523genes_to_remove.unique();
524genes_hosting_partial.sort(geneid_order());
525genes_hosting_partial.unique();
526nested_partial.sort(geneid_order());
527nested_partial.unique();
528 for(
autoit : genes_to_remove) {
529genes_hosting_partial.remove(it);
530nested_partial.remove(it);
531 for(
autoim : it->second) {
533im->AddComment(
"Partial gene in a hole");
534bad_aligns.push_back(*im);
542GenomeRangeMap hosting_intervals;
543 for(
autoit : genes_hosting_partial) {
544TIterList& lst = it->second;
546 boolcoding_gene = find_if(lst.begin(), lst.end(), [](TGeneModelList::iterator im){ return im->ReadingFrame().NotEmpty(); }) != lst.end();
550 for(
autoim : lst) {
557gene_lim_for_nested += model_lim_for_nested;
560vector<int> grange(gene_lim_for_nested.
GetLength(),1);
561 for(
autoim : lst) {
570 for(
int i= 0;
i< (
int)ai.
Exons().size(); ++
i) {
572 for(
intj = overlap.
GetFrom(); j <= overlap.
GetTo(); ++j)
573grange[j-gene_lim_for_nested.
GetFrom()] = 0;
576 for(
int i= 1;
i< (
int)ai.
Exons().size(); ++
i) {
577 if(!ai.
Exons()[
i-1].m_ssplice || !ai.
Exons()[
i].m_fsplice) {
580 for(
intj = hole.
GetFrom(); j <= hole.
GetTo(); ++j)
581grange[j-gene_lim_for_nested.
GetFrom()] = 0;
585 _ASSERT(grange.front() == 0 && grange.back() == 0);
589 for(
intj = 0; j < (
int)grange.size(); ++j) {
591 if(grange[j] == 1) {
595}
else if(grange[j] == 1) {
599TGenomeRange hosting_interval(interval,
false, it);
600hosting_intervals.Insert(hosting_interval);
607TRangeModels nested_models;
608 for(
autoig : nested_partial) {
609TGeneModelList::iterator nested_modeli = ig->second.front();
610 _ASSERT(ig->second.size() == 1);
615 if(rslt != hosting_intervals.end() &&
Include(rslt->first, lim_for_nested)) {
616 for(
auto& grange : rslt->second) {
618 if(
Include(interval,lim_for_nested)) {
619 if(hosting_interval.
Empty())
620hosting_interval = interval;
622hosting_interval = (hosting_interval&interval);
628 if(hosting_interval.
NotEmpty()) {
629TIterList nested(1,nested_modeli);
631 for(
autoit = finished_intervals.lower_bound(left); it != finished_intervals.end() && it->first.IntersectingWith(hosting_interval); ++it) {
632 for(
auto& grange : it->second) {
637 if(
Precede(finished_interval,lim_for_nested)) {
638hosting_interval.
SetFrom(finished_interval.
GetTo());
639}
else if(
Precede(lim_for_nested,finished_interval)) {
640hosting_interval.
SetTo(finished_interval.
GetFrom());
642 for(
autoim : get<2>(grange)->second)
643nested.push_back(im);
648nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(), nested);
653 boolscaffold_wall =
wall;
655 ITERATE(TRangeModels,
i, nested_models) {
660 ITERATE(TIterList, im,
i->second) {
661nested.push_back(**im);
663 if(!(*im)->GoodEnoughToBeAnnotation()) {
664 if(nested.back().HasStart() && !
Include(hosting_interval,nested.back().MaxCdsLimits())) {
665 CCDSInfocds = nested.back().GetCdsInfo();
666 if(nested.back().Strand() ==
ePlus)
670nested.back().SetCdsInfo(cds);
672nested.back().AddComment(
"partialnested");
675included_complete_models.
insert((*im)->ID());
679cerr <<
"Interval "<< hosting_interval <<
'\t'<< nested.size() << endl;
684 if(!im->Support().empty()) {
686 if(im->ID() == 0 || included_complete_models.
find(im->ID()) == included_complete_models.
end())
687models.push_back(*im);
691 wall= scaffold_wall;
703 boolgapfilled =
false;
706 if(ie->m_fsplice_sig ==
"XX"|| ie->m_ssplice_sig ==
"XX")
709genome_cds += (cds&ie->Limits()).
GetLength();
712 if(gapfilled && genome_cds < 45) {
714model.
AddComment(
"Most CDS in genomic gap");
715bad_aligns.push_back(model);
727 if(!models.empty()) {
728 for(
autoit_loop =
next(models.begin()); it_loop != models.end(); ) {
730 if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&
CGeneModel::eNested)
732 autoit_prev =
prev(it);
733 if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&
CGeneModel::eNested)
736 if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
737cerr <<
"Intersecting alignments "<< it->ID() <<
" "<< it_prev->ID() <<
" "<< it->Score() <<
" "<< it_prev->Score() << endl;
738 autoit_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
740it_erase->AddComment(
"Intersects with other partial");
741bad_aligns.push_back(*it_erase);
742models.erase(it_erase);
755 Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 ||
wall),
wall, left!=0,
false, bad_aligns);
757 if(!it->Support().empty() || it->RealCdsLen() >=
minCdsLen)
758models.push_back(*it);
763 CCDSInfocds_info = it->GetCdsInfo();
769 if(((
i->IsInsertion() ||
i->IsMismatch()) &&
Include(fullcds,
i->Loc())) ||
770(
i->IsDeletion() &&
i->Loc() > fullcds.
GetFrom() &&
i->Loc() <= fullcds.
GetTo())) {
774it->FrameShifts() = fs;
783it->SetCdsInfo(cds_info);
785 if(it->PStop(
false) || !it->FrameShifts().empty()) {
789 CCDSInfocds_info = it->GetCdsInfo();
791it->SetCdsInfo(cds_info);
808 for(five_p=0; five_p < (
int)vec.size() && vec[five_p] ==
'N'; ++five_p);
809 for(three_p=0; three_p < (
int)vec.size() && vec[(
int)vec.size()-1-three_p] ==
'N'; ++three_p);
811 if(five_p > 0 || three_p > 0) {
819 _ASSERT(m.
Exons().front().Limits().GetLength() > left);
823 _ASSERT(m.
Exons().back().Limits().GetLength() > right);
827 doublescore = m.
Score();
844arg_desc->
AddKey(
"param",
"param",
845 "Organism specific parameters",
848arg_desc->
AddFlag(
"nognomon",
"Skips ab initio prediction and ab initio extension of partial chains.");
851arg_desc->
AddFlag(
"open",
"Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
853arg_desc->
AddFlag(
"nonconsens",
"Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments " 854 "may be rejected if there is no way to complete them.");
857arg_desc->
AddFlag(
"norep",
"DO NOT mask lower case letters");
861arg_desc->
AddFlag(
"singlest",
"Allow single exon EST chains as evidence");
867 CNcbiIfstreamparam_file(args[
"param"].AsString().c_str());
871annot->
window= args[
"window"].AsInteger();
872annot->
margin= args[
"margin"].AsInteger();
873annot->
wall= !args[
"open"];
874annot->
mpp= args[
"mpp"].AsDouble();
875 boolnonconsens = args[
"nonconsens"];
879annot->
mincontig= args[
"mincont"].AsInteger();
881annot->
minCdsLen= args[
"minlen"].AsInteger();
883 if(!args[
"norep"])
pair< TSignedSeqRange, bool > GetGeneWallLimits(const list< TGeneModelList::iterator > &models, bool external=false)
TSignedSeqRange WalledCdsLimits(const CGeneModel &a)
bool s_AlignScoreOrder(const CGeneModel &ap, const CGeneModel &bp)
bool s_AlignSeqOrder(const CGeneModel &ap, const CGeneModel &bp)
TSignedSeqRange GetWallLimits(const CGeneModel &m, bool external=false)
void FindPartials(TGeneModelList &models, TGeneModelList &aligns, EStrand strand)
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
void Set5PrimeCdsLimit(TSignedSeqPos p)
void SetScore(double score, bool open=false)
TSignedSeqRange Start() const
void AddPStop(SPStop stp)
TSignedSeqRange Cds() const
const TPStops & PStops() const
void AddExon(TSignedSeqRange exon, const string &fs="", const string &ss="", double ident=0, const string &seq="", const CInDelInfo::SSource &src=CInDelInfo::SSource())
const TExons & Exons() const
TSignedSeqRange ReadingFrame() const
virtual CAlignMap GetAlignMap() const
TSignedSeqRange RealCdsLimits() const
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
void SetCdsInfo(const CCDSInfo &cds_info)
TSignedSeqRange Limits() const
void AddComment(const string &comment)
const CCDSInfo & GetCdsInfo() const
vector< CModelExon > TExons
TSignedSeqRange MaxCdsLimits() const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)
void SetHMMParameters(CHMMParameters *params)
unique_ptr< CGnomonEngine > m_gnomon
TGgapInfo m_inserted_seqs
unique_ptr< SPhyloCSFSlice > m_pcsf_slice
TIntMap m_notbridgeable_gaps_len
void Predict(TGeneModelList &models, TGeneModelList &bad_aligns)
void RemoveShortHolesAndRescore(TGeneModelList chains)
bool GnomonNeeded() const
double TryToEliminateOneAlignment(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
double ExtendJustThisChain(CGeneModel &chain, TSignedSeqPos left, TSignedSeqPos right)
double TryWithoutObviouslyBadAlignments(TGeneModelList &aligns, TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor, TSignedSeqPos left, TSignedSeqPos right, TSignedSeqRange &tested_range)
double TryToEliminateAlignmentsFromTail(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
HMM model parameters just create it and pass to a Gnomon engine.
static bool RangeNestedInIntron(TSignedSeqRange r, const CGeneModel &algn, bool check_in_holes=true)
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
const_iterator end() const
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static const TDS_WORD limits[]
vector< TResidue > CResidueVec
bool Precede(TSignedSeqRange l, TSignedSeqRange r)
bool Include(TSignedSeqRange big, TSignedSeqRange small)
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
void AddKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for mandatory key.
void SetCurrentGroup(const string &group)
Set current arguments group name.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eInputFile
Name of file (must exist and be readable)
@ eDouble
Convertible into a floating point number (double)
@ eInteger
Convertible into an integer number (int or Int8)
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int64_t Int8
8-byte (64-bit) signed integer
position_type GetLength(void) const
bool NotEmpty(void) const
bool IntersectingWith(const TThisType &r) const
TThisType & CombineWith(const TThisType &r)
CRange< TSignedSeqPos > TSignedSeqRange
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
unsigned int
A callback function used to compare two keys in a database.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
virtual void transform_model(CGeneModel &a)
RemoveTrailingNs(const CResidueVec &seq)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4