* alphabet =
"ACGT";
49 for(
size_t i= 0;
i< mer_size;
i++) {
50s[mer_size - 1 -
i] = alphabet[w & 3];
60 if(words.size() == 0) {
65iupac.resize(words.size() - 1);
66 static const char* alphabet =
"ACGT";
67 for(
size_t i= 0;
i< words.size() - 1;
i++) {
68iupac[
i] = alphabet[words[
i] >> (mer_size * 2 - 2)];
70iupac +=
s_AsIUPAC(words.back(), mer_size);
77vector<Uint1> counts(64, 0);
80counts[w2 & 0x3F] += 1;
84 for(
size_t i= 0;
i< 64;
i++) {
85 size_tc = counts[
i];
88 return(144 - sumsq)/132.0;
94 const char*
constseq,
106 static const Uint1char2letter[] = {
1070, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1090, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1110, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
1120, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1130, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
1140, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1150, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1160, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1180, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1200, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1220, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
131 Uint1c = char2letter[
static_cast<int>(seq[
i])];
137 for(
size_t i= 1;
i< words.size();
i++) {
138 Uint1c = char2letter[
static_cast<int>(seq[
i+ k])];
139words[
i] = ((words[
i- 1] &
mask) << 2) | c;
144 Uint1c = 3 - char2letter[
static_cast<int>(seq[seq_size - 1 -
i])];
150 for(
size_t i= 1;
i< words.size();
i++) {
151 Uint1c = 3 - char2letter[
static_cast<int>(seq[k -
i])];
152words[
i] = ((words[
i- 1] &
mask) << 2) | c;
159::CPairedEndAdapterDetector
162TWords::const_iterator begin,
163TWords::const_iterator end)
165 size_twords_size = end - begin;
166 size_t len=
min(words_size, m_len);
170 for(
size_t i= 0;
i<
len;
i++) {
171x_IncrCount(
i, *(begin +
i) >> 4);
175 if(words_size > 0 && words_size < m_len) {
176 size_t len= min<size_t>(2, m_len - words_size);
177 TWordw = *(begin + words_size);
178 for(
size_t i= 0;
i<
len;
i++) {
179 size_tpos = words_size +
i;
180 TWordw2 = (w >> (2 -
i*2)) & 0xFFFFF;
181x_IncrCount(pos, w2);
188::CPairedEndAdapterDetector
190::x_NextWord(
size_tpos,
TWordprev_word)
const 193 size_tbest_count = 0;
195 TWordword = ((prev_word << 2) & 0xFFFFF) |
letter;
197 if(
count> best_count) {
206 stringNAdapterSearch
207::CPairedEndAdapterDetector
209::InferConsensus(
const SParams& params)
const 213 size_tsecondbest_sup(0);
215 for(
size_tword = 0; word < NMERS10; word++) {
216 size_tsup = m_counts[word];
218secondbest_sup = top_sup;
229words[0] = best_word;
233<< s_AsIUPAC(best_word,10)
234<<
"; overrepresentation: " 240 for(
size_t i= 1;
i< words.size();
i++) {
241 TWordlast_word = words[
i- 1];
242 TCountlast_sup = x_GetCount(
i- 1, last_word);
244 TWordcurr_word = x_NextWord(
i- 1, last_word);
245 TCountcurr_sup = x_GetCount(
i, curr_word);
248words[
i] = curr_word;
255 returns_AsIUPAC(words, 10);
260pair<size_t, size_t> NAdapterSearch
261::CPairedEndAdapterDetector
262::s_FindAdapterStartPos(
266 returnpair<size_t, size_t>(
267find(fwd_read.begin(), fwd_read.end(), rev_read.back())
268- fwd_read.begin() + MER_LENGTH,
270find(rev_read.begin(), rev_read.end(), fwd_read.front())
276::CPairedEndAdapterDetector
281 size_t len= spot_len / 2;
282 const char* fwd_read = spot;
283 const char* rev_read = spot+
len;
293pair<size_t, size_t> pos = s_FindAdapterStartPos(fwd_words, rev_words);
295 boolis_consistent = (pos.first + pos.second ==
len);
298 size_tadapter_len =
len- pos.first;
299 if(is_consistent && adapter_len >= MER_LENGTH) {
300m_cons5.AddExemplar(fwd_words.begin() + pos.first, fwd_words.end());
307s_Translate(rev_read + pos.first, adapter_len,
false, words2);
308m_cons3.AddExemplar(words2.begin(), words2.end());
335 size_ttop_count = m_counts[
seed];
339words.push_back(
seed);
340x_ExtendSeed(words, top_count,
false);
341reverse(words.begin(), words.end());
342x_ExtendSeed(words, top_count,
true);
350 charc = seq[seq.size() - 1];
351 while(seq.size() > 0 && seq[seq.size() - 1] == c) {
352seq.resize(seq.size() - 1);
361::CUnpairedAdapterDetector
362::x_FindAdapterSeed()
const 364 typedefpair<TCount, TWord>
TPair;
365 typedefpriority_queue< TPair, vector<TPair>, greater<TPair> > TTopWords;
372 for(
TWordw = 0; w < m_counts.size(); w++) {
376 while(top_words.size() > m_params.top_n) {
386 size_t n= top_words.size();
387 for(; !top_words.empty(); top_words.pop()) {
388sup = top_words.top().first;
389word = top_words.top().second;
390sumlogs +=
log(sup + 1.0);
393 size_tgmean =
n== 0 ? 0 : exp(sumlogs /
n) - 1;
398<<
"; overrepresentation: " 401<<
static_cast<size_t>(gmean));
403 returnm_params.HaveInitialSupport(sup, gmean) ? word : 0;
416: (w >> 2) | (
letter<< 22);
419 if(
count> best_count) {
434 TWordprev_word = words.back();
435 TWordcurr_word = x_GetAdjacent(prev_word, right);
436 TCountprev_sup = m_counts[prev_word];
437 TCountcurr_sup = m_counts[curr_word];
439 if(!m_params.HaveContinuedSupport(top_sup, prev_sup, curr_sup)
440|| find(words.begin(), words.end(), curr_word) != words.end())
446words.push_back(curr_word);
459 return a.second <
b.second;
465::CSimpleUngappedAligner
466::GetSeqRange(
TPospos)
const 468TRanges::const_iterator it = lower_bound(
469m_subseq_ranges.begin(),
470m_subseq_ranges.end(),
474 returnit == m_subseq_ranges.end() ?
TRange(NULLPOS, NULLPOS) : *it;
479::CSimpleUngappedAligner
482 if(
a.len > 0 &&
b.len == 0) {
484}
else if(
a.len == 0 &&
b.len > 0) {
487 TRangear = GetSeqRange((
a.second -
a.first) / 2);
488 TRange br= GetSeqRange((
b.second -
b.first) / 2);
492 TPosa_un = (ar.second - ar.first) -
a.len/2;
493 TPosb_un = (
br.second -
br.first) -
b.len/2;
497<<
" "<< (
a.second -
a.first) / 2
500<<
" "<< m_seq.substr(ar.first, ar.second - ar.first));
502<<
" "<< (
b.second -
b.first) / 2
505<<
" "<< m_seq.substr(
br.first,
br.second -
br.first));
511 doublea_score =
a.len -
log(1.0 + a_un)*5;
512 doubleb_score =
b.len -
log(1.0 + b_un)*5;
513 returna_score < b_score ?
b:
a;
525s_PermuteMismatches(word, approx_words);
535 TPos& pos_ref = vec_index[w];
536 if(pos_ref == pos || pos_ref == NULLPOS) {
539 if(pos_ref != MULTIPOS) {
555positions.resize(coordset.
size());
556TPositions::iterator dest = positions.begin();
560 TPospos = it->second;
563 if(map_index.
find(w) == map_index.
end()) {
564map_index[w].first = dest;
567map_index[w].second = dest;
575m_seq.assign(seq,
len);
577m_subseq_ranges.clear();
580fill(m_vec_index.begin(), m_vec_index.end(), (
TPos)NULLPOS);
587 const char* endend = seq+
len;
588 for(
const char*begin = seq, *end = find(begin, endend,
'-');
590begin = end + 1, end = find(begin, endend,
'-'))
592 TRange r(begin - seq, end - seq);
593m_subseq_ranges.push_back(
r);
598 for(
size_t i= 0;
i< words.size();
i++) {
600s_IndexWord(words[
i], pos, m_vec_index, coordset);
604s_CoordSetToMapIndex(coordset, m_map_index, m_positions);
613TWords::iterator dest = words.begin();
614 for(
size_ti1 = 3; i1 < 9; i1++) {
616 for(
Uint1c1 = 0; c1 < 4; c1++) {
617 TWordw1 = s_Put(w, i1, c1);
619 for(
size_ti2 = i1+1; i2 < 9; i2++) {
621 for(
Uint1c2 = 0; c2 < 4; c2++) {
622*(dest++) = s_Put(w1, i2, c2);
634 if(m1.
first== NULLPOS) {
656m.
first= q_pos - s_pos;
657m.
second= q_pos + s_pos;
666 const size_tquery_len,
667 const booldirection,
668 const intmatch_score,
669 const intmismatch_score,
670 const intdropoff_score)
const 672 intdelta_pos = direction ? 1 : -1;
676 typedefpair<TPos, TPos> TPoint;
677TPoint p(seg.
first+ (direction ? seg.
len- 1 : 0),
678seg.
second+ (direction ? seg.
len- 1 : 0));
680 Int8score(0), max_score(0);
684 TRangeq_bounds(0, query_len);
688p.first += delta_pos;
689p.second += delta_pos;
690 while(score + dropoff_score > max_score
691&& p.first >= q_bounds.first
692&& p.first < q_bounds.second
693&& p.second >= s_bounds.first
694&& p.second < s_bounds.second)
696score += (q_seq[p.first] == m_seq[p.second]) ? match_score
700<<
" "<< q_seq[p.first]
701<<
" "<< m_seq[p.second]
703<<
" "<< best_p.first);
706 if(score > max_score) {
711p.first += delta_pos;
712p.second += delta_pos;
717seg.
len= best_p.first - seg.
first+ 1;
722seg2.
first= best_p.first;
723seg2.
second= best_p.second;
743 for(
size_t i= 0;
i< words.size();
i++) {
747m_vec_index.begin() + w + 1);
749 if(*
r.first == NULLPOS) {
751}
else if(*
r.first == MULTIPOS) {
752 r= m_map_index.find(w)->second;
763 for(TPositions::const_iterator it =
r.first; it !=
r.second; ++it) {
764 SMatchnew_match = x_CreateMatch(
i, *it);
767 if(!
s_Merge(old_match, new_match)) {
768best_match = x_GetBetterOf(best_match, old_match);
769old_match = new_match;
774 ITERATE(TMatches, it, matches) {
775best_match = x_GetBetterOf(best_match, it->
second);
LargeInt< 1 > revcomp(const LargeInt< 1 > &x, size_t sizeKmer)
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
ncbi::TMaskedQueryRegions mask
static void s_CoordSetToMapIndex(const TCoordSet &coordset, TMapIndex &map_index, TPositions &positions)
static void s_PermuteMismatches(TWord w, TWords &words)
SMatch FindSingleBest(const char *query, size_t len) const
Return best ungapped alignment of at least MER_LENGTH.
pair< TPositions::const_iterator, TPositions::const_iterator > TPosRange
void Init(const char *seq, size_t len)
IUPAC sequence of target sequence(s) Multiple sequences may be concatenated and '-'-delimited e....
SMatch x_CreateMatch(TPos q_start, TPos s_start) const
Create a match in diag/antidiag coordspace for a matched word.
static bool s_Merge(SMatch &m, const SMatch &m2)
void x_Extend(SMatch &match, const char *query_seq, const size_t query_len, const bool direction, const int match_score=3, const int mismatch_score=-2, const int dropoff=5) const
Ungapped extension (match is in normal coordinates)
static void s_IndexWord(TWord w, TPos pos, TPositions &vec_index, TCoordSet &coordset)
pair< TPos, TPos > TRange
vector< TPos > TPositions
TWord x_GetAdjacent(TWord w, bool right) const
if right, return most frequent word whose 11-prefix = 11-suffix of w; otherwise, return most frequent...
virtual void AddExemplar(const char *seq, size_t len)
Add sequence of a spot from an SRA run (single or paired-end read)
virtual string InferAdapterSeq() const
returns IUPAC seq of the inferred adapter seq; empty if not found.
void x_ExtendSeed(TWords &words, size_t top_count, bool right) const
Extend the seed one way, as long as it satisfies extension thresholds.
static bool s_IsLowComplexity(TWord word)
With threshold of 0.9, 0.7% of all words are classified as such.
static double s_GetWordComplexity(TWord word)
Complexity is a value between 0 (for a homopolymer) and 1 (max complexity).
static const size_t MER_LENGTH
Will use 24-bit 12-mers as words.
static string s_AsIUPAC(TWord w, size_t mer_size=MER_LENGTH)
Convert word to IUPAC string [ACGT]{MER_LENGTH}.
static void s_Translate(const char *const iupac_na, size_t len, bool apply_revcomp, TWords &words)
Translate IUPAC sequence to a sequence of overlapping words (A,C,G,T,*) <-> (0,1,2,...
static bool s_IsHomopolymer(TWord w)
const_iterator end() const
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
parent_type::value_type value_type
SStaticPair< const char *, const char * > TPair
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
void Info(CExceptionArgs_Base &args)
uint8_t Uint1
1-byte (8-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
void s_Merge(SExpression &l, SExpression &r)
NCBI C++ auxiliary debug macros.
Miscellaneous common-use basic types and functionality.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
static SLJIT_INLINE sljit_ins br(sljit_gpr target)
Represents single ungapped alignment.
An adapter sequence is presumed to occur at least min_support times in the input, and is overrepresen...
bool HaveContinuedSupport(size_t top_sup, size_t prev_sup, size_t candidate_sup) const
bool HaveInitialSupport(size_t top_candidate_sup, size_t non_candidate_sup) const
bool operator()(const NAdapterSearch::CSimpleUngappedAligner::TRange &a, const NAdapterSearch::CSimpleUngappedAligner::TRange &b) const
static Uint4 letter(char c)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4