objects::CScope& scope)
89 while(index + kmer_len - 1 < sv.size() &&
i< kmer_len) {
92 if(sv[index +
i] ==
kXaa) {
99pos |=
GetAALetter(sv[index +
i]) << (num_bits * (kmer_len -
i- 1));
114 intindex = pos / chunk;
115 int offset= pos - index * chunk;
122 unsigned intnum_bits)
136counts =
new TCount[num_elements];
138 catch(std::bad_alloc) {
144counts =
new TCount[num_elements];
146 catch(std::bad_alloc) {
148 "Memory cannot be allocated for k-mer counting." 149 " Try using compressed alphabet or smaller k.");
158objects::CScope& scope)
163 _ASSERT(kmer_len > 0 && alphabet_size > 0);
167 "Compressed alphabet selected, but translation table not" 171 if(!seq.IsWhole() && !seq.IsInt()) {
173 "Unsupported SeqLoc encountered");
177objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
179 unsigned intnum_elements;
180 unsigned intseq_len = sv.size();
188 "Sequence shorter than desired k-mer length");
192 unsigned int mask= 1;
194 while(alphabet_size >
mask) {
198 const intkNumBits = num;
204counts = tmp_counts.
get();
215num_elements = 1 << (kNumBits * kmer_len);
216 const Uint4kMask = num_elements - (1 << kNumBits);
219memset(counts, 0, num_elements *
sizeof(
TCount));
221 const intkBitChunk =
sizeof(
Uint4) * 8;
224vector<Uint4> used_entries(num_elements / kBitChunk + 1);
229 boolis_pos =
InitPosBits(sv, pos,
i, kNumBits, kmer_len);
233 MarkUsed(pos, used_entries, kBitChunk);
237 for(;
i< seq_len && is_pos;
i++) {
241 "Letter out of alphabet in sequnece");
246 if(sv[
i] ==
kXaa) {
250 if(
i>= seq_len || !is_pos) {
260 MarkUsed(pos, used_entries, kBitChunk);
268 Uint4num_bit_chunks = num_elements / kBitChunk + 1;
269 while(ind < num_elements / kBitChunk + 1) {
272 while(ind < num_bit_chunks && used_entries[ind] == 0) {
276 if(ind == num_bit_chunks) {
281 for(
Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
284 if((used_entries[ind] &
mask) != 0) {
285pos = ind * kBitChunk + j;
290used_entries[ind] ^=
mask;
298 _ASSERT(pow((
double)alphabet_size, (
double)kmer_len)
302 for(
Uint4 i=0;
i< kmer_len;
i++) {
303base[
i] = pow((
double)alphabet_size, (
double)
i);
306num_elements = (
Uint4)pow((
double)alphabet_size, (double)kmer_len);
309memset(counts, 0, num_elements *
sizeof(
TCount));
312 const intkBitChunk =
sizeof(
Uint4) * 8;
313vector<Uint4> used_entries(num_elements / kBitChunk + 1);
316 for(
unsigned i=0;
i< seq_len - kmer_len + 1;
i++) {
319 if(sv[
i+ kmer_len - 1] ==
kXaa) {
326 for(
Uint4j=1;j < kmer_len;j++) {
331 MarkUsed(pos, used_entries, kBitChunk);
338 Uint4num_bit_chunks = num_elements / kBitChunk + 1;
339 while(ind < num_elements / kBitChunk + 1) {
342 while(ind < num_bit_chunks && used_entries[ind] == 0) {
346 if(ind == num_bit_chunks) {
351 for(
Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
354 if((used_entries[ind] &
mask) != 0) {
355pos = ind * kBitChunk + j;
360used_entries[ind] ^=
mask;
378 unsigned intnum_counts2 =
v2.GetNumCounts();
379 unsigned intfewer_counts
380= num_counts1 < num_counts2 ? num_counts1 : num_counts2;
385 return1.0 - (double)num_common / (
double)fewer_counts;
398 unsigned intnum_counts2 =
v2.GetNumCounts();
399 unsigned intmore_counts
400= num_counts1 > num_counts2 ? num_counts1 : num_counts2;
405 return1.0 - (double)num_common / (
double)more_counts;
424&& it1->position == it2->position) {
428 result+= (unsigned)(it1->value < it2->value
429? it1->value : it2->value);
441&& it1->position < it2->position) {
446&& it2->position < it1->position) {
461 unsigned int mask= 1;
485ostr << it->position <<
":"<< (
int)it->value <<
" ";
494objects::CScope& scope)
501objects::CScope& scope)
506 _ASSERT(kmer_len > 0 && alphabet_size > 0);
510 "Compressed alphabet selected, but translation table not" 514 if(!seq.IsWhole() && !seq.IsInt()) {
516 "Unsupported SeqLoc encountered");
520objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
522 unsigned intnum_elements;
523 unsigned intseq_len = sv.size();
531 "Sequence shorter than desired k-mer length");
534 const intkBitChunk =
sizeof(
Uint4) * 8;
541 _ASSERT(pow((
double)alphabet_size, (
double)kmer_len)
545 for(
Uint4 i=0;
i< kmer_len;
i++) {
546base[
i] = pow((
double)alphabet_size, (
double)
i);
549num_elements = (
Uint4)pow((
double)alphabet_size, (double)kmer_len);
560 for(
unsignedj=0;j < kmer_len &&
i< seq_len - kmer_len + 1;j++) {
562 if(sv[
i+ j] ==
kXaa) {
568}
while(
i< seq_len - kmer_len + 1 && is_xaa);
570 if(
i>= seq_len - kmer_len + 1) {
575 for(;
i< seq_len - kmer_len + 1;
i++) {
578 if(sv[
i+ kmer_len - 1] ==
kXaa) {
586 for(
unsignedj=0;j < kmer_len &&
i< seq_len - kmer_len + 1;
589 if(sv[
i+ j] ==
kXaa) {
595}
while(
i< seq_len - kmer_len + 1 && is_xaa);
598 if(
i>= seq_len - kmer_len + 1) {
605 for(
Uint4j=1;j < kmer_len;j++) {
627 unsigned intnum_counts2 =
v2.GetNumCounts();
628 unsigned intfewer_counts
629= num_counts1 < num_counts2 ? num_counts1 : num_counts2;
634 return1.0 - (double)num_common / (
double)fewer_counts;
647 unsigned intnum_counts2 =
v2.GetNumCounts();
648 unsigned intmore_counts
649= num_counts1 > num_counts2 ? num_counts1 : num_counts2;
654 return1.0 - (double)num_common / (
double)more_counts;
667 for(
size_t i=0;
i<
size;
i++) {
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
ncbi::TMaskedQueryRegions mask
K-mer counts implemented as bit vectors.
static unsigned int GetKmerLength(void)
Get k-mer length.
static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts &v1, const CBinaryKmerCounts &v2)
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Compute counts.
CBinaryKmerCounts(void)
Constructor.
unsigned int GetNumCounts(void) const
Get number of k-mers.
static unsigned int sm_AlphabetSize
static double FractionCommonKmersDist(const CBinaryKmerCounts &vect1, const CBinaryKmerCounts &vect2)
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks....
static CSafeStatic< vector< Uint1 > > sm_TransTable
static bool sm_UseCompressed
static Uint4 GetAALetter(Uint1 letter)
static unsigned int sm_KmerLength
static unsigned int CountCommonKmers(const CBinaryKmerCounts &v1, const CBinaryKmerCounts &v2)
Copmute number of common kmers between two count vectors.
Exception class for Kmer counts.
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
static unsigned int sm_AlphabetSize
static unsigned int GetKmerLength(void)
Get default kmer length.
CSparseKmerCounts(void)
Create empty counts vector.
static TCount * ReserveCountsMem(unsigned int num_bits)
vector< SVectorElement > m_Counts
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
static double FractionCommonKmersGlobalDist(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2)
static TCount * sm_Buffer
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
static CSafeStatic< vector< Uint1 > > sm_TransTable
static void PostCount(void)
Perform post-kmer counting tasks.
static const unsigned int kLengthBitsThreshold
TNonZeroCounts_CI BeginNonZero(void) const
Get non-zero counts iterator.
static unsigned int sm_KmerLength
static double FractionCommonKmersDist(const CSparseKmerCounts &vect1, const CSparseKmerCounts &vect2)
TNonZeroCounts_CI EndNonZero(void) const
Get non-zero counts iterator.
static Uint4 GetAALetter(Uint1 letter)
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Reset the counts vector.
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
static bool sm_UseCompressed
static unsigned int CountCommonKmers(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2, bool repetitions=true)
Copmute number of common kmers between two count vectors.
static bool sm_ForceSmallerMem
static bool InitPosBits(const objects::CSeqVector &sv, Uint4 &pos, unsigned int &index, Uint4 num_bits, Uint4 kmer_len)
Initializes element index as bit vector for first k letters, skipping Xaa.
CNcbiOstream & Print(CNcbiOstream &ostr) const
Print counts.
element_type * get(void) const
Get pointer.
void reset(element_type *p=0)
Reset will delete the old pointer, set content to the new value, and assume the ownership upon the ne...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
unsigned int
A callback function used to compare two keys in a database.
static void MarkUsed(Uint4 pos, vector< Uint4 > &entries, int chunk)
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines NCBI C++ exception handling.
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Element of the sparse vector.
static wxAcceleratorEntry entries[3]
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4