in_quotes =
false;
66 boolin_space =
true;
71 for(
unsigned int i= 0;
i< s.size(); ++
i) {
73 if((c ==
' '|| c ==
'\t') && !in_quotes) {
85 if(c ==
'\''|| c ==
'"') {
87 if(c == quote_char) {
104 throwruntime_error(
"Unbalanced quotes (')");
110vector<CContigAssembly::SAlignStats::STails>& tails)
113 for(
intj = 0; j < vec.
GetNumRows(); ++j) {
121tls.
right= seq_len - stop - 1;
124tls.
left= seq_len - stop - 1;
126tails.push_back(tls);
133 const string& param_string,
137query_loc.
SetWhole().Assign(query_id);
139subject_loc.
SetWhole().Assign(subject_id);
140 return Blastn(query_loc, subject_loc, param_string, scope);
146 const string& param_string,
149 SSeqLocquery_sl(query_loc, scope);
150 SSeqLocsubject_sl(subject_loc, scope);
158 for(
unsigned int i= 0;
i< args.size();
i+= 2) {
159 const string& name = args[
i];
160 if(
i+ 1 >= args.size()) {
161 throwruntime_error(
"no value given for "+ name);
163 const string&
value= args[
i+ 1];
166}
else if(name ==
"-r") {
168}
else if(name ==
"-q") {
170}
else if(name ==
"-e") {
172}
else if(name ==
"-Z") {
174}
else if(name ==
"-F") {
176}
else if(name ==
"-G") {
178}
else if(name ==
"-E") {
181 throwruntime_error(
"invalid option: "+ name);
191vector<unsigned int>& plus_vec,
192vector<unsigned int>& minus_vec)
195*align_set.
Get().front()->GetSegs().GetDenseg().GetIds()[0];
197*align_set.
Get().front()->GetSegs().GetDenseg().GetIds()[1];
203plus_vec.resize(len0 + len1);
206minus_vec.resize(len0 + len1);
210 const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
214 if(start0 == -1 || start1 == -1) {
219 TSeqPosdiag = (start0 + seg_len - 1) + start1;
220minus_vec[diag] += seg_len;
223 const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
227 if(start0 == -1 || start1 == -1) {
232 TSeqPosdiag = start1 - start0 + len0 - 1 ;
233plus_vec[diag] += seg_len;
243vector<TRange>& max_range)
245 unsigned intrunning_sum = 0;
247 for(
i= 0;
i< window; ++
i) {
248running_sum += vec[
i];
252max_range.push_back(
TRange(window - 1, window - 1));
254 for(
i= window;
i< vec.size(); ++
i) {
255running_sum -= vec[
i- window];
256running_sum += vec[
i];
257 if(running_sum >=
max) {
258 if(running_sum >
max) {
262 if(max_range.size() && max_range.back().GetFrom() ==
i- 1) {
263max_range.back().SetFrom(
i);
265max_range.push_back(
TRange(
i,
i));
279vector<unsigned int> plus_vec, minus_vec;
280 DiagCounts(align_set, scope, plus_vec, minus_vec);
282 unsigned intplus_count;
283vector<TRange> plus_range;
286 unsigned intminus_count;
287vector<TRange> minus_range;
291vector<TRange>*
r=
NULL;
292 if(plus_count > minus_count) {
304(
r->front().GetFrom() +
r->front().GetTo() + 1) / 2 -
window_size/ 2;
314 unsigned inthalf_width,
CScope& scope)
345 if(diag <= seq0.size()) {
347shift = seq0.size() - 1 - diag;
350shift = diag - (seq0.size() - 1);
363ds->
SetIds().push_back(cr_id0);
364ds->
SetIds().push_back(cr_id1);
381 for(
unsigned int i= 0;
i< ds->
GetLens().
size() - 1; ++
i) {
398 unsigned intslop,
CScope& scope)
404 if( (
stats.tails[0].right <= slop &&
stats.tails[1].left <= slop) ||
405(
stats.tails[0].left <= slop &&
stats.tails[1].right <= slop) ) {
446 unsigned intslop,
CScope& scope)
462 unsigned intslop,
CScope& scope)
478 boolFirstContainsSecond;
479 boolSecondContainsFirst;
481FirstContainsSecond = ((((long)
stats.tails[0].left -
stats.tails[1].left) >= -
int(slop)) &&
482(((
long)
stats.tails[0].right -
stats.tails[1].right) >= -
int(slop)));
483SecondContainsFirst = ((((long)
stats.tails[1].left -
stats.tails[0].left) >= -
int(slop)) &&
484(((
long)
stats.tails[1].right -
stats.tails[0].right) >= -
int(slop)));
486 return(FirstContainsSecond | SecondContainsFirst);
501Ident =
stats.pct_identity/100.0;
511 intWg = -5, Wm = 1, Wms = -2, Ws = -2;
518 for(
unsigned int i= 0;
i< ds_in.
GetLens().
size(); ++
i) {
521vector<int> scores(sz);
523 for(
unsigned int i= 0;
i< scores.size(); ++
i) {
527previous_score = scores[
i- 1];
533scores[
i] = previous_score + Ws;
535scores[
i] = previous_score + Wg + Ws;
537}
else if(res1 ==
'-') {
539scores[
i] = previous_score + Ws;
541scores[
i] = previous_score + Wg + Ws;
543}
else if(res0 == res1) {
545scores[
i] = previous_score + Wm;
548scores[
i] = previous_score + Wms;
552 if(scores[
i] < 0) {
559 unsigned intright_end = 0;
560 unsigned intleft_end;
561 for(
unsigned int i= 0;
i< scores.size(); ++
i) {
562 if(scores[
i] > max_score) {
563max_score = scores[
i];
570 if(right_end == 0) {
577 for(
i= right_end - 1;
i> 0; --
i) {
578 if(scores[
i] == 0) {
585 if(scores[0] == 0) {
601vector<CRef<CSeq_align> >
603 const string& blast_params,
doublemin_ident,
604 unsigned intmax_end_slop,
CScope& scope,
606 constvector<unsigned int>& band_halfwidths,
607 unsigned intdiag_finding_window,
608 unsigned intmin_align_length,
611 if(min_ident > 1 || min_ident < 0) {
612 throwruntime_error(
"min_ident must be between zero and one (got " 623*ostr <<
"Filtering on "<< min_ident <<
"%, slop "<< max_end_slop
624<<
"bp, min length "<< min_align_length <<
"bp" 625<<
" and strands "<< strandmap[strand0] <<
", "<< strandmap[strand1] << endl;
629alns =
Blastn(id0, id1, blast_params, scope);
631 catch(exception& e) {
633*ostr <<
"blast failed:\n"<< e.what() << endl;
635 returnvector<CRef<CSeq_align> >();
638vector<CRef<CSeq_align> > good_alns;
644 if(
IsDovetail((*aln)->GetSegs().GetDenseg(), max_end_slop, scope)
645&&
FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
647&&
x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length ) {
649good_alns.push_back(*aln);
652 if(!good_alns.empty()) {
654*ostr <<
"Found "<< good_alns.size() <<
655 " acceptable dovetail alignment(s) by blast " 656<< blast_params << endl;
661*ostr <<
"Found no acceptable dovetail alignments by blast " 662<< blast_params << endl;
664 if(alns->
Get().empty()) {
666*ostr <<
"No alignments found by blast; " 667 "can't do banded alignment"<< endl;
669 returnvector<CRef<CSeq_align> >();
676 ITERATE(vector<unsigned int>, band_halfwidth, band_halfwidths) {
678*ostr <<
"Trying banded global alignment with bandwidth = " 679<< 2 * *band_halfwidth + 1 << endl;
685diag, *band_halfwidth, scope);
689*ostr <<
"banded alignment failed:\n"<< e.
what() << endl;
696*ostr <<
"banded alignment failed: num segs == 0\n"<< endl;
702 doublefrac_ident =
FracIdent(*local_ds, scope);
704*ostr <<
"Fraction identity: "<< frac_ident << endl;
706 if(
IsDovetail(*local_ds, max_end_slop, scope)
707&&
FracIdent(*local_ds, scope) >= min_ident
711*ostr <<
"Alignment acceptable (full dovetail)"<< endl;
715aln->
SetSegs().SetDenseg(*local_ds);
717 returnvector<CRef<CSeq_align> >(1, aln);
722*ostr <<
"No acceptable alignments from banded alignment algorithm" 731&&
FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
733&&
x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length
736good_alns.push_back(*aln);
740*ostr <<
"Found "<< good_alns.size() <<
741 " acceptable half-dovetail " 742 "or contained alignment(s) by blast"<< endl;
744 if(!good_alns.empty()) {
751&&
FracIdent(*local_ds, scope) >= min_ident
754 stringdovetail_string;
755 if(
IsContained(*local_ds, max_end_slop, scope)) {
756dovetail_string =
"contained";
758dovetail_string =
"half-dovetail";
761*ostr <<
"Banded alignment acceptable (" 762<< dovetail_string <<
")"<< endl;
766aln->
SetSegs().SetDenseg(*local_ds);
768 returnvector<CRef<CSeq_align> >(1, aln);
772*ostr <<
"Banded alignment not an acceptable " 773 "half-dovetail or contained"<< endl;
775 returnvector<CRef<CSeq_align> >();
783objects::CScope& scope)
793 _ASSERT(row1.size() == row2.size());
796 for(
unsigned int i= 0;
i< row1.size(); ++
i) {
797 if(row1[
i] !=
'N'&& row2[
i] !=
'N') {
800 if(row1[
i] != row2[
i]) {
801 if(row1[
i] ==
'-') {
803 while(
i+1 < row1.size() && row1[
i+1] ==
'-') ++
i;
804}
else if(row2[
i] ==
'-') {
806 while(
i+1 < row1.size() && row2[
i+1] ==
'-') ++
i;
829align_stats.
gaps.clear();
832vector<CRef<CSeq_loc> > dust_locs;
843loc->SetPacked_int(*res);
844dust_locs.push_back(loc);
850 boolsimple =
false;
853 boolis_gap =
false;
854 for(
intj = 0; j < vec.
GetNumRows(); ++j) {
857 unsigned intother_row = (j + 1) % 2;
867gap_loc.
SetInt().SetId().Set(
"lcl|dummy");
879}
else if(gap_simple == -1) {
892align_stats.
is_simple.push_back(simple);
911SAlignStats& align_stats)
920SAlignStats& align_stats)
933 if(
stats.tails[0].left <
stats.tails[1].left) {
944 boolmatches[2] = {
false,
false};
957 if(!(matches[0] & matches[1])) {
964 return(matches[0] & matches[1]);
973 intDim = ds.GetDim();
975 for(
unsigned intSeg = 0; Seg < Lens.size(); Seg++) {
977 if(Starts[(Seg*Dim)] == -1 || Starts[(Seg*Dim)+1] == -1)
991 for(
int i= 0;
i< vec.GetNumSegs(); ++
i) {
992 boolis_gap =
false;
993 for(
intj = 0; j < vec.GetNumRows(); ++j) {
994 if(vec.GetStart(j,
i) == -1) {
1000AlignedLength += vec.GetLen(
i);
1006 unsigned intidentities = 0;
1007 for(
int i= 0;
i< vec.GetNumSegs(); ++
i) {
1009vec.GetSegSeqString(s1, 0,
i);
1010 for(
intj = 1; j < vec.GetNumRows(); ++j) {
1012vec.GetSegSeqString(s2, j,
i);
1014 for(
unsigned intk = 0; k <
min(s1.size(), s2.size()); ++k) {
1015identities += (s1[k] == s2[k]);
1020align_stats.
mismatches= AlignedLength - identities;
1022100.0 * double(identities) / double(AlignedLength);
Declares the CBl2Seq (BLAST 2 Sequences) class.
Declares the CBlastNucleotideOptionsHandle class.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Definitions of special type used in BLAST.
vector< CRef< objects::CSeq_align_set > > TSeqAlignVector
Vector of Seq-align-sets.
@ eBlastn
Nucl-Nucl (traditional blastn)
TSignedSeqPos GetStop(TNumrow row, TNumseg seg, int offset=0) const
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
bool IsPositiveStrand(TNumrow row) const
TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
TDim GetNumRows(void) const
TSeqPos GetAlnStop(TNumseg seg) const
TSeqPos GetLen(TNumseg seg, int offset=0) const
TSeqPos GetSeqStop(TNumrow row) const
TNumseg GetNumSegs(void) const
TSeqPos GetSeqStart(TNumrow row) const
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
void SetEndChar(TResidue gap_char)
void SetGapChar(TResidue gap_char)
CScope & GetScope(void) const
string & GetAlnSeqString(string &buffer, TNumrow row, const CAlnMap::TSignedRange &aln_rng) const
TResidue GetResidue(TNumrow row, TSeqPos aln_pos) const
Runs the BLAST algorithm between 2 sequences.
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
unsigned int m_AdjustedLen
CAlnStats(unsigned int adjusted_len, unsigned int mm, unsigned int gaps)
static void FindMaxRange(const vector< unsigned int > &vec, unsigned int window, unsigned int &max, vector< TRange > &max_range)
static void x_GatherIdentStats(const objects::CAlnVec &vec, SAlignStats &align_stats)
static CRef< objects::CDense_seg > BestLocalSubAlignment(const objects::CDense_seg &ds_in, objects::CScope &scope)
Find the highest-scoring local subalignment.
static bool IsAtLeastHalfDovetail(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static CRef< objects::CSeq_align_set > Blastn(const objects::CSeq_id &query_id, const objects::CSeq_id &subject_id, const string ¶m_string, objects::CScope &scope)
Utility for running blastn.
static void GatherAlignStats(const objects::CAlnVec &vec, SAlignStats &align_stats)
static bool x_IsAllowedStrands(const objects::CDense_seg &ds, objects::ENa_strand strand0, objects::ENa_strand strand1)
static void FindDiagFromAlignSet(const objects::CSeq_align_set &align_set, objects::CScope &scope, unsigned int window_size, objects::ENa_strand &strand, unsigned int &diag)
Given a set of alignments, pick out a diagonal to use as the center of a band in a banded alignment.
CRange< unsigned int > TRange
Find the range (or more than one tied range) containing the maximal diagonal count,...
static void DiagCounts(const objects::CSeq_align_set &align_set, objects::CScope &scope, vector< unsigned int > &plus_vec, vector< unsigned int > &minus_vec)
Count the cells with "ink" along each diagonal in a dot-matrix-type plot of some set of alignments (e...
static vector< CRef< objects::CSeq_align > > Align(const objects::CSeq_id &id0, const objects::CSeq_id &id1, const string &blast_params, double min_ident, unsigned int max_end_slop, objects::CScope &scope, CNcbiOstream *ostr=0, const vector< unsigned int > &band_halfwidths=vector< unsigned int >(1, 200), unsigned int diag_finding_window=200, unsigned int min_align_length=50, objects::ENa_strand strand0=objects::eNa_strand_unknown, objects::ENa_strand strand1=objects::eNa_strand_unknown)
Most users of the class need only to call this function.
static CRef< objects::CDense_seg > BandedGlobalAlignment(const objects::CSeq_id &id0, const objects::CSeq_id &id1, objects::ENa_strand strand, unsigned int diag, unsigned int half_width, objects::CScope &scope)
Do a banded global alignment using an arbitrary band.
static void x_OrientAlign(objects::CDense_seg &ds, objects::CScope &scope)
static TSeqPos x_DensegLength(const objects::CDense_seg &ds)
static bool IsDovetail(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static bool IsContained(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static double FracIdent(const objects::CDense_seg &ds, objects::CScope &scope)
ENa_strand GetSeqStrand(TDim row) const
TSeqPos GetSeqStop(TDim row) const
void Reverse(void)
Reverse the segments' orientation.
TSeqPos GetSeqStart(TDim row) const
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
CRef< CDense_seg > ExtractSlice(TDim row, TSeqPos from, TSeqPos to) const
Extract a slice of the alignment that includes the specified range.
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
CRef< objects::CPacked_seqint > GetMaskedInts(objects::CSeq_id &seq_id, const sequence_type &seq)
Mask a sequence and return result as a CPacked_seqint instance.
static void s_SplitCommandLine(string s, vector< string > &result)
static void s_GetTails(const CAlnVec &vec, vector< CContigAssembly::SAlignStats::STails > &tails)
void SetSpaceLimit(const size_t &maxmem)
void SetShift(Uint1 where, size_t offset)
string GetTranscriptString(void) const
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
void SetMatchReward(int r)
Sets MatchReward.
void SetTraditionalBlastnDefaults()
Sets TraditionalBlastnDefaults.
void SetMismatchPenalty(int p)
Sets MismatchPenalty.
void SetGapXDropoffFinal(double x)
Sets GapXDropoffFinal.
void SetGapExtensionCost(int e)
Sets GapExtensionCost.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
void SetWordSize(int ws)
Sets WordSize.
void SetGapOpeningCost(int g)
Sets GapOpeningCost.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
const TDenseg & GetDenseg(void) const
Get the variant data.
Tdata & Set(void)
Assign a value to data member.
TLens & SetLens(void)
Assign a value to Lens data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
const TLens & GetLens(void) const
Get the Lens member data.
vector< TSignedSeqPos > TStarts
void SetType(TType value)
Assign a value to Type data member.
TStarts & SetStarts(void)
Assign a value to Starts data member.
TStrands & SetStrands(void)
Assign a value to Strands data member.
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
const TIds & GetIds(void) const
Get the Ids member data.
bool CanGetStrands(void) const
Check if it is safe to call GetStrands method.
TNumseg GetNumseg(void) const
Get the Numseg member data.
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
const TStrands & GetStrands(void) const
Get the Strands member data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
@ eType_partial
mapping pieces together
ENa_strand
strand of nucleic acid
unsigned int
A callback function used to compare two keys in a database.
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Uint8 GetPhysicalMemorySize(void)
Return the amount of physical memory available in the system.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define row(bind, expected)
Alignment characterization.
vector< STails > tails
unaligned tails
double pct_identity
% identity (varies from 0 to 100)
TSeqPos mismatches
number of mismatched bases
TSeqPos total_length
total covered length of the alignment, including gaps
TSeqPos gap_count
count of total number of gaps
vector< bool > is_simple
for each gap, whether is consists of "simple sequence"
vector< TSeqPos > gaps
the set of gap lengths for this alignment
TSeqPos aligned_length
total number of bases included in the alignment
Structure to represent a single sequence to be fed to BLAST.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4