,
79arg_desc.
AddDefaultKey(
"min-rna-total-coverage",
"MinRnaTotalCoverage",
80 "Minimum total query percent coverage for a Splign " 81 "compartment to be identified as RNA",
85arg_desc.
AddFlag(
"allow-consistent-intersection",
86 "Allow intersecting alignments in genomic compartments, " 87 "but enforce consistency");
88arg_desc.
AddFlag(
"allow-inconsistent-intersection",
89 "Allow intersecting alignments in genomic compartments, " 90 "and don't enforce consistency");
93 "allow-consistent-intersection");
95arg_desc.
AddFlag(
"allow-large-compart-gaps",
96 "Permit genomic compartments to contain large gaps between " 97 "alignments; default limits gaps to 3 x alignment size");
99arg_desc.
AddDefaultKey(
"max-compartment-failures",
"MaxCompartmentFailures",
100 "Fail if we have failure on more than this number of compartments",
107arg_desc.
AddFlag(
"no-prosplign-introns",
108 "Generate ProSplign alignment without introns");
110prosplign::CCompartOptions::SetupArgDescriptions(&arg_desc);
111arg_desc.
AddOptionalKey(
"prosplign-size-threshold",
"ProsplignSizeThreshold",
112 "Maximum compartment size - protein length x " 113 "genomic range length - on which to run prosplign",
116arg_desc.
AddFlag(
"prosplign-gaps",
"Precalculate un-bridgeable gaps. Prohibit compartment to go over un-bridgeable gaps.");
117arg_desc.
AddFlag(
"prosplign-unk-gaps",
"Prohibit compartment to go over gaps of unknown length.");
121 "GenColl ASN.1 to process");
128, m_MinSingletonIdty(0.7)
129, m_MinSingletonIdtyBps(9999999)
133, m_MaxCompartmentFailures(UINT_MAX)
134, m_CompartmentFailureCount(0)
135, m_ProsplignGaps(
false)
136, m_ProsplignUnknownGaps(
false)
137, m_ProsplignRunThreshold(UINT_MAX)
163 m_Penalty= args[
"compartment_penalty"].AsDouble();
164 m_MinIdty= args[
"min_compartment_idty"].AsDouble();
168 if(args[
"min_singleton_idty"]) {
174 if(args[
"allow-consistent-intersection"] ||
175args[
"allow-inconsistent-intersection"])
179 if(args[
"allow-inconsistent-intersection"]) {
182 if(!args[
"allow-large-compart-gaps"]) {
190assm_source; ++assm_source)
194 "Can have only one gc assembly input");
196 _TRACE(
"Reading assembly...");
220 for(
unsignedarg = 0; arg < raw_args.
Size(); ++arg) {
221command_line_args.
insert(raw_args[arg]);
223 if(!command_line_args.count(
"-min_hole_len")) {
226 if(!command_line_args.count(
"-compartment_penalty")) {
227 m_CompartOptions->m_CompartmentPenalty = prosplign::CCompartOptions::default_CompartmentPenalty;
229 if(!command_line_args.count(
"-min_compartment_idty")) {
230 m_CompartOptions->m_MinCompartmentIdty = prosplign::CCompartOptions::default_MinCompartmentIdty;
233args[
"no-prosplign-introns"]));
236 if(args[
"prosplign-size-threshold"]) {
248EQueryType query_type,
249 boolwith_best_placement,
251ESplignDirRun splign_direction)
253cleaned_aligns.clear();
259 Cleanup(query_it->second, this_query_output, query_type,
260with_best_placement, splign_direction);
261cleaned_aligns.splice(cleaned_aligns.end(), this_query_output);
263}
else if(!input_aligns.empty()) {
266input_aligns.front()->GetSeq_id(1))] = input_aligns;
267 Cleanup(subject_aligns, cleaned_aligns, query_type,
268with_best_placement, splign_direction);
274EQueryType query_type,
275 boolwith_best_placement,
276ESplignDirRun splign_direction)
278cleaned_aligns.clear();
286list<CSplignCompartment> splign_compartments;
287list< CRef<CSeq_align_set> > genomic_compartments;
288 if(query_type ==
eInfer) {
291query_aligns.begin()->second.front()->GetSeq_id(0));
295 "Type information not available for " 314splign_compartments,
true);
320 if(compart_it->HasRnaCharacteristics()) {
329 switch(query_type) {
334TCompartmentsBySubject;
335TAlignsBySubjectAndPos indexed_aligns;
336TCompartmentsBySubject failed_compartments;
337 boolfound_prosplign_alignments =
false;
340 TAlignsByPos&aligns_by_pos = indexed_aligns[subject_it->first];
342 true, &aligns_by_pos);
343 sort(aligns_by_pos.begin(), aligns_by_pos.end());
349compart_aligns, genomic_range);
351 if(compart_aligns.empty()) {
352failed_compartments[subject_it->first]
353. push_back(genomic_range);
355found_prosplign_alignments =
true;
358cleaned_aligns.splice(cleaned_aligns.end(), compart_aligns);
361 if(!found_prosplign_alignments) {
364 ITERATE(TCompartmentsBySubject, subject_it, failed_compartments) {
365 ITERATE(vector<TSeqRange>, compart_it, subject_it->second)
368indexed_aligns[subject_it->first], *compart_it,
379 if(splign_compartments.empty()) {
382splign_compartments,
true);
386 ITERATE(list<CSplignCompartment>, it, splign_compartments) {
390cleaned_aligns.push_back(align);
396cleaned_aligns.push_back(align);
400 if(!cleaned_aligns.empty()) {
407genomic_compartments);
415genomic_compartments,
true);
426 if(with_best_placement) {
438aligns_by_pair[
query][
subject].push_back(*align_it);
444list<CSplignCompartment> &compartments,
455}
else if(!input_aligns.empty()) {
462 if(hitref->GetQueryStrand() ==
false) {
463hitref->FlipStrands();
465hit_refs.push_back(hitref);
480 const TCoordmin_singleton_matches (
min(msm1, msm2));
482 TAccessorca(penalty_bps, min_matches, min_singleton_matches, !
m_NoXF);
487 TCoordsmin (0), smax (kCoordMax);
488 boolsame_strand (
false);
494same_strand =
false;
499same_strand = strand_this == strand_next;
500smax = same_strand? (box + 4)[2]: kCoordMax;
509 if(smax < box[3]) smax = box[3];
510 if(smin > box[2]) smin = box[2];
513compartments.push_back(comp);
516smin = same_strand? box[3]: 0;
551 true, aligns_by_pos);
554}
else if(!input_aligns.empty()) {
560hit_refs.push_back(hitref);
562aligns_by_pos->push_back(make_pair(hitref->GetSubjMin(), *iter));
565vector<pair<TSeqPos, TSeqPos> > *pgap =
NULL;
576 for(;smit; ++smit) {
580(unk_gaps && slit->CanGetFuzz() && slit->GetFuzz().IsLim() &&
597THitComparator sorter (THitComparator::eSubjMinSubjMax);
598stable_sort(hit_refs.begin(), hit_refs.end(), sorter);
602compartments.splice(compartments.end(), compartments_for_aligns);
612 "Function RunSplignOnCompartment() supported only " 613 "for one direction");
646.
Print(
"splign_error", comp_align.
m_Msg)
647.
Print(
"compart_query", compart.
query.AsString())
648.
Print(
"compart_subject", compart.
subject.AsString())
651.
Print(
"compart_strand", dir ==
eDirSense?
"plus":
"minus");
652 if(comp_align.
m_Msg.find(
"Space limit exceeded") == string::npos &&
656 "Failures on too many compartments");
670 cleanup.FillUnaligned(
true);
671 cleanup.Cleanup(compart, compart_cleaned_aligns);
677cleaned_aligns.splice(cleaned_aligns.end(), compart_cleaned_aligns);
691 switch(desc.
Which()) {
698 "unexpected number of IDs in align-ref");
718 "failed to find protein");
721 if( !genomic_loc ) {
723 "failed to find genomic location");
736cleaned_aligns.push_back(prosplign_output);
754 for(TAlignsByPos::const_iterator it =
755lower_bound(aligns_by_pos.begin(), aligns_by_pos.end(),
757it != lower_bound(aligns_by_pos.begin(), aligns_by_pos.end(),
761compart_members.push_back(it->second);
769 constlist<CSplignCompartment> &splign_compartments,
772 ITERATE(list<CSplignCompartment>, compart_it, splign_compartments) {
775genomic_compart->
Set().push_back(
779genomic_compartments.push_back(genomic_compart);
838align_set.
Set().splice(align_set.
Set().end(), aligns);
842aligns.splice(aligns.end(), align_set.
Set());
849{
returnhit1->GetQueryMin() < hit2->GetQueryMin(); }
854 if(
hits.size() < 2) {
862 if((*hit_it)->GetSubjStrand() != prev_hit->GetSubjStrand()) {
866 if((*hit_it)->GetSubjStrand()
867? ((*hit_it)->GetSubjMin() < prev_hit->GetSubjMin())
868: ((*hit_it)->GetSubjMax() > prev_hit->GetSubjMax()))
874 TSeqRangesubject_hit1(prev_hit->GetSubjMin(), prev_hit->GetSubjMax()),
875subject_hit2((*hit_it)->GetSubjMin(), (*hit_it)->GetSubjMax());
877intron -= subject_hit1;
878intron -= subject_hit2;
884query_coverage +=
TSeqRange((*hit_it)->GetQueryMin(),
885(*hit_it)->GetQueryMax());
897, m_AlignRef(const_cast<
CSeq_align*>(&align))
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CSplignAlignmentHit(const objects::CSeq_align &align)
void x_CleanupProsplignAsGenomic(const TAlignsByPos &aligns_by_pos, const TSeqRange &genomic_range, objects::CSeq_align_set::Tdata &cleaned_aligns)
void SetParams(const CArgs &args)
int m_GenomicCompartOptions
unique_ptr< prosplign::CCompartOptions > m_CompartOptions
CRef< objects::CScope > m_Scope
map< objects::CSeq_id_Handle, CSeq_align_set::Tdata > TAlignsBySubject
CRef< objects::CSeq_align > RunSplignOnCompartment(const CSplignCompartment &compart, ESplignDirRun dir)
void x_SplignCompartmentsToGenomicFormat(const list< CSplignCompartment > &splign_compartments, list< CRef< objects::CSeq_align_set > > &genomic_compartments)
bool m_ProsplignUnknownGaps
void GetGenomicCompartments(const objects::CSeq_align_set::Tdata &input_aligns, list< CRef< objects::CSeq_align_set > > &compartments, bool one_pair=false)
Divide list of genomic alignments into compartments.
static void SetupArgDescriptions(CArgDescriptions &arg_desc)
map< objects::CSeq_id_Handle, vector< pair< TSeqPos, TSeqPos > > > m_seq_gaps
void SetScope(const CRef< objects::CScope > &scope)
vector< pair< TSeqPos, CRef< objects::CSeq_align > > > TAlignsByPos
void GetProsplignCompartments(const objects::CSeq_align_set::Tdata &input_aligns, prosplign::TCompartments &compartments, bool one_pair=false, TAlignsByPos *aligns_by_pos=NULL)
void CleanupGenomicCompartment(const objects::CSeq_align_set::Tdata &compart, objects::CSeq_align_set::Tdata &cleaned_aligns, bool add_scores=true)
TSeqPos m_MinSingletonIdtyBps
map< objects::CSeq_id_Handle, TAlignsBySubject > TAlignsBySeqIds
void GetSplignCompartments(const objects::CSeq_align_set::Tdata &input_aligns, list< CSplignCompartment > &compartments, bool one_pair=false)
Divide list of RNA alignments into Splign compartments.
void Cleanup(const objects::CSeq_align_set::Tdata &input_aligns, objects::CSeq_align_set::Tdata &cleaned_aligns, EQueryType query_type=eInfer, bool with_best_placement=true, bool one_pair=false, ESplignDirRun splign_direction=eDirBoth)
Divide list of RNA alignments into Splign compartments.
unsigned m_CompartmentFailureCount
bool x_CleanupProsplignCompartment(const objects::CSeq_annot &compartment, const TAlignsByPos &aligns_by_pos, objects::CSeq_align_set::Tdata &cleaned_aligns, TSeqRange &genomic_range)
CRef< CProSplignScoring > m_ProSplignScoring
void x_AddStandardAlignmentScores(objects::CSeq_align &align)
void DivideByQuerySubjectPairs(const objects::CSeq_align_set::Tdata &input_aligns, TAlignsBySeqIds &aligns_by_pair)
double m_MinSingletonIdty
unsigned m_MaxCompartmentFailures
CRef< CProSplign > m_ProSplign
CRef< CProSplignOutputOptions > m_ProSplignOutputOptions
TSeqPos m_ProsplignRunThreshold
void BestPlacement(objects::CSeq_align_set::Tdata &aligns)
pair< TCoord, TCoord > TCoordRange
class CAlignCleanup implements an alignment cleanup utility based on the C++ alignment manager.
void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)
Execute: identify compartments.
bool GetStrand(size_t i) const
void SetMaxIntron(TCoord mi)
Assign the maximum intron length, in base pairs.
void Get(size_t idx, THitRefs &compartment) const
Retrieve a compartment by index.
const TCoord * GetBox(size_t i) const
bool GetStatus(size_t i) const
pair< size_t, size_t > GetCounts(void) const
Retrieve the compartment counts.
void CreateHierarchy(CGC_Assembly *target_set=NULL)
Generate the internal up-pointers.
class CInputStreamSource encapsulates details of how we supply applications with input data through s...
static void SetStandardInputArgs(CArgDescriptions &arg_desc, const string &prefix="input", const string &description="data to process", bool is_mandatory=false)
Supply a standard set of arguments via argument descriptions to an application.
static CNcbiApplication * Instance(void)
Singleton method.
bool operator()(const CSplign::THitRef &hit1, const CSplign::THitRef &hit2) const
Output filtering parameters.
CProSplignOutputOptions & SetMinHoleLen(int)
fill back small holes between good pieces holes with both unaligned protein and nucleotide portions l...
static const int default_min_hole_len
static void SetupArgDescriptions(CArgDescriptions *argdescr)
static void SetupArgDescriptions(CArgDescriptions *argdescr)
spliced protein to genomic alignment
CRef< objects::CSeq_align > FindAlignment(objects::CScope &scope, const objects::CSeq_id &protein, const objects::CSeq_loc &genomic, CProSplignOutputOptions output_options=CProSplignOutputOptions())
Aligns protein to a region on genomic sequence.
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
void AddScore(CScope &scope, CSeq_align &align, CSeq_align::EScoreType score)
int GetGapCount(const CSeq_align &align)
Compute the number of gaps in the alignment.
@ eScore_PercentIdentity_Gapped
@ eScore_PercentIdentity_Ungapped
@ eScore_HighQualityPercentCoverage
TSeqPos GetSeqStop(TDim row) const
void SetNamedScore(const string &id, int score)
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
TSeqPos GetSeqStart(TDim row) const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup core splign argument descriptions for the application.
static void ArgsToSplign(CSplign *splign, const CArgs &args)
Translate command line arguments into splign algorithm core settings.
CRef< objects::CSeq_align_set > AsSeqAlignSet(const CSplign::TResults *results=0, int flags=eAF_SplicedSegWithParts) const
Format alignment as a seq-align-set.
void SetSeqIds(CConstRef< objects::CSeq_id > id1, CConstRef< objects::CSeq_id > id2)
@ eAF_SplicedSegWithParts
CSplign is the central library object for computing spliced cDNA-to-genomic alignments.
CRef< objects::CScope > & SetScope(void)
void SetMaxIntron(size_t max_intron)
void SetStrand(bool strand)
void PreserveScope(bool preserve=true)
Controls whether to clean the scope object's cache on a new sequence.
bool AlignSingleCompartment(THitRefs *hitrefs, THit::TCoord range_left, THit::TCoord range_right, SAlignedCompartment *result)
void SetStartModelId(size_t model_id)
vector< SAlignedCompartment > TResults
vector< THitRef > THitRefs
static void Rank(objects::CSeq_align_set &sas, score_fn_t score_fn=&NBestPlacement::GetScore)
Adds the following scores: `best_placement_score` as computed by score_fn `rank` - all top scoring al...
iterator_bool insert(const value_type &val)
TCompartments SelectCompartmentsHits(const THitRefs &hitrefs, CCompartOptions compart_options, const vector< pair< THit::TCoord, THit::TCoord > > *gaps=NULL)
Composition of first two functions.
list< CRef< CSeq_annot > > TCompartments
static void cleanup(void)
void FindCompartments(const list< CRef< CSeq_align > > &aligns, list< CRef< CSeq_align_set > > &align_sets, TCompartOptions options=fCompart_Defaults, float diff_len_filter=3.0f)
@ fCompart_FilterByDiffLen
@ fCompart_AllowIntersections
@ fCompart_AllowInconsistentIntersection
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
void SetDependency(const string &arg1, EDependency dep, const string &arg2)
Define a dependency.
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
void SetCurrentGroup(const string &group)
Set current arguments group name.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eRequires
One argument requires another.
@ eExcludes
One argument excludes another.
@ eInteger
Convertible into an integer number (int or Int8)
CDiagContext_Extra & Print(const string &name, const string &value)
The method does not print the argument, but adds it to the string.
CDiagContext & GetDiagContext(void)
Get diag context instance.
CDiagContext_Extra Extra(void) const
Create a temporary CDiagContext_Extra object.
SIZE_TYPE Size(void) const
Get size (number) of arguments.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
#define MSerial_AsnText
I/O stream manipulators â.
CConstRef< CSeq_id > GetSeqId(void) const
int CompareOrdered(const CSeq_id &sid2) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
TRange GetTotalRange(void) const
@ fThrowOnMissing
throw exception if sequence or requested data aren't found
TInst_Mol GetInst_Mol(void) const
bool CanGetInst_Mol(void) const
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
SSeqMapSelector & SetResolveCount(size_t res_cnt)
Set max depth of resolving seq-map.
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
CSeqMap::ESegmentType GetType(void) const
TSeqPos GetPosition(void) const
return position of current segment in sequence
CConstRef< CSeq_literal > GetRefGapLiteral(void) const
return CSeq_literal with gap data, or null if either the segment is not a gap, or an unspecified gap
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
void Reset(void)
Reset reference object.
void Reset(void)
Reset reference object.
position_type GetLength(void) const
position_type GetToOpen(void) const
TThisType & Set(position_type from, position_type to)
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
Tdata & Set(void)
Assign a value to data member.
E_Choice Which(void) const
Which variant is currently selected.
const TSpliced & GetSpliced(void) const
Get the variant data.
const TExons & GetExons(void) const
Get the Exons member data.
list< CRef< CSeq_align > > Tdata
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
const Tdata & Get(void) const
Get the member data.
E_Choice Which(void) const
Which variant is currently selected.
const TDesc & GetDesc(void) const
Get the Desc member data.
const TRegion & GetRegion(void) const
Get the variant data.
const TIds & GetIds(void) const
Get the Ids member data.
const TAlign & GetAlign(void) const
Get the variant data.
list< CRef< CAnnotdesc > > Tdata
@ e_Align
definition of the SeqAligns
@ e_Region
all contents cover this region
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
static unsigned s_MaxRnaIntronSize
objects::CSeq_id_Handle query
objects::CSeq_id_Handle subject
bool HasRnaCharacteristics()
static unsigned s_MinRnaTotalCoverage
ECompartmentStatus m_Status
bool operator()(const CRef< CSeq_align > &al1, const CRef< CSeq_align > &al2) const
Selector used in CSeqMap methods returning iterators.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4