,
'C',
'G',
'T',
'N',
'B',
'D',
'E',
'F',
'H',
'I',
'J',
'K',
70 'L',
'M',
'O',
'P',
'Q',
'R',
'S',
'U',
'V',
'W',
'X',
'Y',
'Z',
105 template<
typenameAlphabet = TSeqAlphabet<true>>
120 if(IsRegion(index) ==
false) {
126 autoit = m_Terms.get_const_iterator(
offset);
127 for(
size_t i= 0;
i<
len&& it.valid(); ++
i) {
139 returnx_GetOffset(index) == 0 && x_GetLen(index) > 0;
146 return len== 0 ? 1 :
len;
160 #ifdef __USE_BM_XOR_COMPRESSION__ 164m_Terms.optimize(TB);
165sv_serializer.
serialize(m_Terms, sv_lay);
166 const unsigned char*
buf= sv_lay.
data();
168 if(!os.write(
reinterpret_cast<const char*
>(&sz),
sizeof(sz)))
170 if(sz && !os.write((
char*)&
buf[0], sz))
186sv_deserializer.
deserialize(m_Terms, (
const unsigned char*)&
data[pos]);
219 returnx_SetIndex(char_code, 0);
225 returnx_SetIndex(0,
len);
251 template<
typenameAlphabet = TSeqAlphabet<true>>
272 if(seq.length() == 1) {
273index = x_AddChar(seq[0]);
276m_CheckSum.Reset(ncbi::CChecksum::eCRC32);
277m_CheckSum.AddLine(seq);
278 auto crc32= m_CheckSum.GetChecksum();
279 if(m_SeqPos.test(
crc32)) {
284 if(x_FindSequence(seq, pos)) {
285index = TSeqDict::x_SetIndex(pos + 1, seq.length());
287index = x_AddSequence(seq);
290m_SeqPos.set(
crc32);
291index = x_AddSequence(seq);
300 returnTSeqDict::x_SetIndexAsRegion(
len);
306unique_ptr<TSeqDict> seqdict(
new TSeqDict());
307 autosz = m_SearchStr.size();
309seqdict->m_Terms[0] = 0;
310 while(curr_pos < sz) {
311vector<unsigned>
buffer(10 * 1024 * 1024 );
313 for(;
i<
buffer.size() && curr_pos < sz; ++
i, ++curr_pos) {
314 buffer[
i] = m_Coder[(unsigned)m_SearchStr[curr_pos]];
316seqdict->m_Terms.import_back(&
buffer[0],
i);
325 autoseq_sz = seq.size();
328m_SearchStr.push_back(
'\0');
329 autoptr = strstr(&m_SearchStr[0], seq.c_str());
330m_SearchStr.pop_back();
332pos = ptr - &m_SearchStr[0];
341 auto code= m_Coder[(unsigned)
C];
343 throwruntime_error(
"The sequence contains bad symbol '"+
string(1,
C) +
"'");
344 returnTSeqDict::x_SetIndexAsChar(
code);
350 autoseq_size = seq.size();
354vector<unsigned> seq_v(seq_size);
355 for(
size_t i= 0;
i< seq_size; ++
i) {
356 auto code= m_Coder[(unsigned)seq[
i]];
357 if(
code==
kBAD_CODE)
throwruntime_error(
"The sequence contains bad symbol '"+
string(1, seq[
i]) +
"'");
360idx = TSeqDict::x_SetIndex(m_CurrPos, seq_size);
361m_CurrPos += seq_size;
362 copy(seq.begin(), seq.end(), back_inserter(m_SearchStr));
371 size_tm_CurrPos = 0;
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for bit ranges and intervals.
Sparse container sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
string sparse vector based on bit-transposed matrix
Checksum and hash calculation classes.
ncbi::CChecksum m_CheckSum
unique_ptr< TSeqDict > Build()
Builds the sequence dictionary from previously added sequences.
vector< unsigned > m_Coder
TIndex AddRegion(uint32_t len)
Create Region index.
TSeqDict::svector_u32 svector_u32
bool x_FindSequence(const string &seq, unsigned &pos)
TIndex x_AddSequence(const string &seq)
If necessary, adds a dictionary sequence and returns its index.
CSeqDictionary< Alphabet > TSeqDict
TIndex AddSequence(const string &seq)
Add sequence and returns its index.
vector< char > m_SearchStr
static bool IsRegion(TIndex index)
Checks whether the index references an actual sequence or just contains the length of an unaligned region.
string & GetSequence(TIndex index, string &seq) const
returns Sequence by Index
size_t Deserialize(const char *data)
Deserialization from a string starting from pos; returns the number of deserialized bytes.
static TIndex x_SetIndexAsChar(uint32_t char_code)
static TIndex x_SetIndex(uint32_t offset, uint32_t len)
Index mask functions.
static uint32_t x_GetOffset(TIndex index)
static size_t GetSeqLength(TIndex index)
Returns the sequence length encoded in the index.
bool Serialize(ostream &os)
Serialization to ostream.
bm::sparse_vector< unsigned, bm::bvector<> > svector_u32
static uint32_t x_GetLen(TIndex index)
unsigned long long TIndex
static TIndex x_SetIndexAsRegion(uint32_t len)
sparse vector de-serializer
void deserialize(SV &sv, const unsigned char *buf, bool clear_sv=true)
Serialize sparse vector into a memory buffer(s) structure.
void serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout)
Serialize sparse vector into a memory buffer(s) structure.
void enable_xor_compression() noexcept
Enable XOR compression on vector serialization.
succinct sparse vector with runtime compression using bit-slicing / transposition method
@ BM_GAP
GAP compression is ON.
static const BitmapCharRec *const chars[]
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
static const unsigned kBAD_CODE
layout class for serialization buffer structure
const unsigned char * data() const noexcept
Return serialization buffer pointer.
size_t size() const noexcept
return current serialized size
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4