std::ostringstream os;
93 template<
typenameword_t >
95{ os.write(
reinterpret_cast< char*
>( &word ),
sizeof(
word_t) ); }
143 typedefobjects::CSeqVector
TSeq;
151 typedefobjects::CSeq_loc::TPacked_int::Tdata
TLocs;
157 static constTSeqStore::size_type
SS_INCR= 100*1024*1024;
160 static constTSeqStore::size_type
SS_THRESH= 10*1024*1024;
176 typedefobjects::CSeq_loc::TPacked_int::Tdata
TLocs;
199 if( loc->IsPacked_int() ) {
201&( loc->GetPacked_int().Get() ) );
237TLocsVec::const_iterator
vit_;
238TLocs::const_iterator
it_;
371 typedefstd::vector< SSeqSeg >
TSegs;
543 it_= (*vit_)->begin();
545 if(
it_!= (*vit_)->end() ) {
546 start_= (*it_)->GetFrom();
547 stop_= (*it_)->GetTo() + 1;
559 if( ++it_ != (*vit_)->end() ) {
560start_ = (*it_)->GetFrom();
561stop_ = (*it_)->GetTo() + 1;
566 if( Good() ) it_ = (*vit_)->begin();
577}
while( notdone && pos < stop_ );
583 if( c_locs_.empty() )
return false;
588 while( vit_ != c_locs_.begin() && (*vit_)->empty() ) {
592 if( !(*vit_)->empty() ) {
593it_ = (*vit_)->end();
595start_ = (*it_)->GetFrom();
596stop_ = (*it_)->GetTo() + 1;
600vit_ = c_locs_.end();
604 if( it_ != (*vit_)->begin() ) {
606start_ = (*it_)->GetFrom();
607stop_ = (*it_)->GetTo() + 1;
611 if( vit_ == c_locs_.begin() ) {
618 while( vit_ != c_locs_.begin() && (*vit_)->empty() ) {
622 if( !(*vit_)->empty() ) {
623it_ = (*vit_)->end();
625start_ = (*it_)->GetFrom();
626stop_ = (*it_)->GetTo() + 1;
637 while( Good() && pos >= stop_ ) Advance();
638 if( !Good() )
return false;
639 returnpos >= start_;
648entry->Which() != objects::CSeq_entry_Base::e_Seq ) {
651 "input seq-entry is NULL or not a sequence");
654objects::CScope scope( *
om_);
655objects::CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry( *entry );
656objects::CBioseq_Handle bsh = seh.GetSeq();
657 c_seq_= bsh.GetSeqVector( objects::CBioseq_Handle::eCoding_Iupac );
659 Uint4pos =
static_cast<Uint4>(idstr.find_first_of(
" \t"));
660idstr = idstr.substr( 0, pos );
668 string result=
"unknown";
677 for( TMask::const_iterator mask_it =
mask.begin();
678mask_it !=
mask.end(); ++mask_it ) {
694chunks_.size()*
sizeof(
TWord));
697 for( TSubjects::const_iterator cit =
subjects_.begin();
702 for( TChunks::const_iterator cit = chunks_.begin();
703cit != chunks_.end(); ++cit ) {
715TSeqStore::size_type seq_off )
719 if( chunk_start >=
c_seq_.size() ) {
726 TSeqPoschunk_len = chunk_end - chunk_start;
729 if( chunk_len > 0 ) {
731 bool in=
false, in1;
734 for(
TSeqPospos = chunk_start;
735pos < chunk_end; ++pos,
lc= (
lc+ 1)%
CR) {
748 if( segs.empty() ) {
749segs.push_back(
SSeqSeg( 0 ) );
752segs.rbegin()->stop_ = pos - chunk_start;
754}
else if( !in1 &&
in) {
755segs.push_back(
SSeqSeg( pos - chunk_start ) );
761 if( segs.empty() ) {
762segs.push_back(
SSeqSeg( 0 ) );
765segs.rbegin()->stop_ = chunk_end - chunk_start;
773*
subjects_.rbegin() =
static_cast<unsigned int>(chunks_.size());
794TSeqStore::size_type newsize =
795(TSeqStore::size_type)(chunks_[
last_chunk_].seq_start_);
807cur_lid_len_( 0 ), offset_bits_( 16 )
817 boolstarting = (this->
c_chunk_== 0);
820TBase::TSeqStore::size_type seq_off =
822this->
chunks_.rbegin()->seq_start_
825TBase::TSeq::size_type seqlen = this->
c_seq_.size();
832 TSeqPoschunk_len = chunk_end - chunk_start;
838 if(
lid_map_.size() >= lid_limit ) {
854 if( starting && seqlen > 0 ) {
862 for(
TSeqPospos = 0; pos < seqlen; ++pos,
lc= (
lc+ 1)%
CR) {
865accum = (accum << 2) +
letter;
870accum <<= (
CR-
lc)*2;
883TLIdMap::const_reverse_iterator iter =
lid_map_.rbegin();
884 while( iter !=
lid_map_.rend() && iter->seq_start_ > soff ) ++iter;
885 ASSERT( iter->seq_start_ <= soff );
886off += (soff - iter->seq_start_)*
CR;
895TLIdMap::const_reverse_iterator iter =
lid_map_.rbegin();
896 while( iter !=
lid_map_.rend() && iter->seq_start_ > soff ) ++iter;
897 ASSERT( iter->seq_start_ <= soff );
898off += (soff - iter->seq_start_)*
CR;
921 for( TLengthTable::const_iterator it =
lengths_.begin();
929 for( TLIdMap::const_iterator it =
lid_map_.begin();
1025 if( d == 0 )
return;
1028 while( d->
next!= 0 ) d = d->
next;
1128{
return!(rhs == lhs); }
1180 if( newsize == 0 ) {
1191 while(
t< newsize ) {
1232 unsigned longm =
mult_;
1241 for(
unsigned long n=
mult_;
n> m; --
n)
1242 if( (*cit)%
n== 0 ) { skip =
true;
break; }
1244 if( !skip && (*cit)%m == 0 )
WriteWord( os, *cit );
1318 i->SetDataPool( pool );
1403 for( THashTable::const_iterator cit =
hash_table_.begin();
1405 if( cit->Size() > 0 ) ++this->
total_;
1409std::unique_ptr< CNcbiOfstream >
stats;
1418 unsigned longnmer = 0;
1420 for( THashTable::const_iterator cit =
hash_table_.begin();
1422 if( cit->Size() != 0 ) {
1426 if( cit->Size() != 0 )
1432 if( stat && cit->Size() > 0 ) {
1433*
stats<<
hex<< setw( 10 ) << nmer
1434<<
" "<< dec << cit->Size() << endl;
1441 for( THashTable::const_iterator cit =
hash_table_.begin();
1455 TSeqPosend_diff = stop - curr;
1461 hash_table_[(THashTable::size_type)nmer].AddData(
1465 hash_table_[(THashTable::size_type)nmer].AddData(
1474 const Uint1letter_mask = 0x3;
1476 unsigned long count= 0;
1478 for(
TSeqPoscurr = start; curr < stop; ++curr, ++
count) {
1481nmer = ((nmer<<2)&nmer_mask) +
letter;
1495 for( TSeqInfo::TSegs::const_iterator it = sinfo.
segs_.begin();
1496it != sinfo.
segs_.end(); ++it ) {
1499sinfo.
len_, it->start_, it->stop_ );
1509 for( THashTable::iterator it =
hash_table_.begin();
1614 for(
int i= 0;
i< 7; ++
i)
WriteWord( os, (
unsigned char)0 );
1622 for(
int i= 0;
i< 7; ++
i)
WriteWord( os, (
unsigned char)0 );
1643 input, oname, start, start_chunk, stop, stop_chunk, options );
1653 input, oname, start, start_chunk, stop, stop_chunk, options );
1665std::unique_ptr< COffsetList::CDataPool > pool(
1669TOffsetData offset_data( subject_map, options, pool.get() );
1678vector< string > idmap;
1680 while(
i< stop ) {
1685 stringidstr = subject_map.NewSequenceInit( *sd, start_chunk );
1686idmap.push_back( idstr );
1701 while( subject_map.AddSequenceChunk( overflow ) ) {
1703offset_data.Update();
1706std::cerr <<
"WARNING: logical sequence id overflow. " 1707<<
"Starting new volume."<< std::endl;
1711((
Uint8)
sizeof(
TWord))*offset_data.total();
1715subject_map.RollBack();
1716offset_data.Update();
1717subject_map.Commit();
1718stop = start + subject_map.GetLastSequence() - 1;
1719stop_chunk = subject_map.GetLastSequenceChunk();
1724subject_map.Commit();
1730std::ostringstream os;
1731os <<
"Last processed: sequence " 1732<< start + subject_map.GetLastSequence() - 1
1733<<
" ; chunk "<< subject_map.GetLastSequenceChunk()
1738std::ostringstream os;
1739os <<
"Index size: " 1740<< subject_map.total() +
sizeof(
TWord)*offset_data.total()
1741<<
" bytes (not counting the hash table)."<< std::endl;
1745 SaveHeader( os, options, start, start_chunk, stop, stop_chunk );
1746offset_data.Save( os );
1747subject_map.Save( os );
1749 if( options.
idmap) {
1750 stringmapname = oname +
".map";
1754 i!= idmap.end(); ++
i) {
1755maps << *
i<<
"\n";
1772 input, oname, start, start_chunk,
1773stop, stop_chunk, options );
1782 MakeIndex( fname, oname, start, stop,
t, options );
1792TIndex_Impl::Create(
1793 input, oname, start, start_chunk, stop, stop_chunk, options );
Structures and functions prototypes used for BLAST gapped extension.
Structures and API used for saving BLAST hits.
ncbi::TMaskedQueryRegions mask
Types of exception the indexing library can throw.
Index factory implementation.
static void Create(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index implementation object.
virtual ~CDbIndex_Factory()
Object destructor.
static const Uint8 MEGABYTE
Obvious...
static void SaveHeader(CNcbiOstream &os, const SOptions &options, TSeqNum start, TSeqNum start_chunk, TSeqNum stop, TSeqNum stop_chunk)
Save the index header.
static void do_create(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Called by CDbIndex::Create() (should be merged?).
static void do_create_1_2(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Another forward from do_create() (should be merged?).
Base class providing high level interface to index objects.
Uint4 TWord
Type representing main memory unit of the index structure.
static const unsigned char VERSION
Index version that this library handles.
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
A class responsible for creation and management of Nmer offset lists.
CSubjectMap_Factory TSubjectMap
Rename for consistency.
std::vector< TOffsetList > THashTable
Type used for mapping Nmer values to corresponding offset lists.
TWord total_
Current size of the structure in bytes.
TSubjectMap::TSeqInfo TSeqInfo
Forwarding from TSubjectMap.
void AddSeqInfo(const TSeqInfo &sinfo)
Update offset lists with information corresponding to the given sequence.
void Save(CNcbiOstream &os)
Save the offset lists into the binary output stream.
void AddSeqSeg(const Uint1 *seq, TWord seqlen, TSeqPos start, TSeqPos stop)
Update offset lists with information corresponding to the given valid segment of a sequence.
CDbIndex::TSeqNum TSeqNum
Forwarding from CDbIndex.
TSubjectMap & subject_map_
Instance of subject map structure.
void EncodeAndAddOffset(TWord nmer, TSeqPos start, TSeqPos stop, TSeqPos curr, TWord offset)
Encode the offset data and add to the offset list corresponding to the given Nmer value.
unsigned long code_bits_
Number of bits to encode special offset prefixes.
THashTable hash_table_
Mapping from Nmer values to the corresponding offset lists.
TSeqNum last_seq_
Logical oid of last processed sequence.
const CDbIndex::SOptions & options_
Index options.
void Truncate()
Truncate the offset lists according to the information from the subject map.
unsigned long hkey_width_
Nmer width in bases.
COffsetList TOffsetList
Type used for individual offset lists.
COffsetData_Factory(TSubjectMap &subject_map, const CDbIndex::SOptions &options, COffsetList::CDataPool *pool)
Object constructor.
const TWord total() const
Get the total memory usage by offset lists in bytes.
void Update()
Bring offset lists up to date with the corresponding subject map instance.
static const Uint4 BLOCK_SIZE
vector< SDataUnit > TBlock
CDataIterator & operator--()
CDataIterator & operator++()
friend bool operator!=(const CDataIterator &rhs, const CDataIterator &lhs)
friend bool operator==(const CDataIterator &rhs, const CDataIterator &lhs)
CDataIterator(SDataUnit *cunit, Uint4 cindex, Uint4 size)
CDataIterator const_iterator
void SetDataPool(CDataPool *pool)
const_iterator end() const
void resize(Uint4 newsize)
const_iterator begin() const
void push_back(const TWord &d)
Type representing an offset list corresponding to an Nmer.
CData TData
Type used to store offset list data.
TWord Size() const
Return the size of the offset list in words.
void SetIndexParams(const TOptions &options)
Set the index creation parameters.
unsigned long min_offset_
Minimum offset used by the index.
void SetDataPool(CDataPool *pool)
void AddData(TWord item, TWord &total)
Add an offset to the list.
void Save(CNcbiOstream &os) const
Save the offset list.
TData data_
Offset list data storage.
unsigned long mult_
Max multiple to use in list pre-ordering.
static const Uint4 DATA_UNIT_SIZE
void TruncateList(TWord offset, TWord &total)
Truncate the list to the value of offset.
Sequence stream for reading FASTA formatted files.
Class used to abstract reading nucleotide sequences from various sources.
TSeqData::TMask TMask
Public alias for type containing masking info.
A helper class used when creating internal set masked locations in the process of converting the sequ...
TLocs::const_iterator it_
State of the iterator over *vit_ (inner iteration).
bool In(TSeqPos pos)
Check if a point falls within the intervals stored in the object.
TSeqPos start_
Left end of *it_.
objects::CSeq_loc::TPacked_int::Tdata TLocs
See documentation for CSubjectMap_Factory_Base::TLocs.
std::vector< const TLocs * > TLocsVec
Collection of TLocs extracted from CSequenceIStream::TSeqData.
void Init()
Initialize the iterators after the masked locations are added.
bool Good() const
Check if the end of iteration has been reached.
void Advance()
Iteration step.
void Adjust(TSeqPos pos)
Backtrack to the first interval to the left of pos or to the beginning, if not possible.
TLocsVec c_locs_
Container with sets of masked intervals.
bool Retreat()
Iteration step backwards.
TSeqPos stop_
One past the right end of *it_.
CMaskHelper()
Default object constructor.
void Add(const TMask::value_type &loc)
Add a set of masked intervals.
TLocsVec::const_iterator vit_
State of the iterator over c_locs_ (outer iteration).
CSequenceIStream::TMask TMask
forwarded type
Part of the CSubjectMap_Factory class that is independent of template parameters.
CSequenceIStream::TMask TMask
Masking information.
TSeqStore seq_store_
Container for storing the packed sequence data.
TSeqNum committed_
Logical number of the last committed sequence.
unsigned long report_level_
Level of reporting requested by the user.
TSeqStore::size_type ss_cap_
Current seq_store capacity.
CRef< CMaskHelper > mask_helper_
Auxiliary object used to compute unmasked parts of the sequences.
CDbIndex::TSeqNum TSeqNum
forwarded type
TSeqNum last_chunk_
Logical number of last processed sequence.
unsigned long chunk_size_
Maximum internal sequence size.
string extractSeqVector(TSeqData &sd)
Helper function used to extract CSeqVector instance from a TSeqData object.
unsigned long chunk_overlap_
Length of overlap between consecutive chunks of one sequence.
objects::CSeq_loc::TPacked_int::Tdata TLocs
The inner most type needed to access mask data in the representation returned by ReadFasta().
static const TSeqStore::size_type SS_THRESH
Threshold for the difference between seqstore size and capacity.
unsigned long stride_
Stride selected in index creation options.
objects::CSeqVector TSeq
Sequence data without masking.
unsigned long min_offset_
Minimum offset value used by the index.
const Uint1 * seq_store_start() const
Get the start of the compressed sequence storage space.
std::vector< Uint1 > TSeqStore
Container type used to store compressed sequence information.
static const TSeqStore::size_type SS_INCR
Increment used to increase seqstore capacity.
string NewSequenceInit(TSeqData &sd, TSeqNum start_chunk)
Start processing of the new input sequence.
CSequenceIStream::TSeqData TSeqData
forwarded type
TSubjects subjects_
Mapping from subject oid to chunk information.
std::vector< TSeqNum > TSubjects
Type for storing mapping from subject oids to the chunk numbers.
CRef< objects::CObjectManager > om_
Reference to the ObjectManager instance.
TSeq c_seq_
Sequence data of the sequence currently being processed.
TSeqNum c_chunk_
Current chunk number of the sequence currently being processed.
CSubjectMap_Factory_Base(const TOptions &options)
Object constructor.
To be merged with CSubjectMap_Factory_Base.
bool AddSequenceChunk(TSeqStore::size_type seq_off)
Append the next chunk of the input sequence currently being processed to the subject map.
TSeqNum GetLastSequenceChunk() const
Get the oid of the last chunk number of the last processed sequence.
CSubjectMap_Factory_TBase(const TOptions &options)
Object constructor.
void Commit()
Finalize processing of the current input sequence.
void Save(CNcbiOstream &os) const
Save the subject map and sequence info.
TSeqNum LastGoodSequence() const
Get the internal oid of the last valid sequence.
void RollBack()
Revert to the state before the start of processing of the current input sequence.
TChunks chunks_
Collection of sequence chunks (or logical sequences).
TSeqNum GetLastSequence() const
Get the oid of the last processed sequence.
std::vector< SSeqInfo > TChunks
Type for the collection of sequence chunks.
const TSeqInfo * GetSeqInfo(TSeqNum snum) const
Get the chunk info by internal oid.
SSeqSeg TSeqSeg
Type definition for external users.
SSeqInfo TSeqInfo
Type definition for external users.
TWord total() const
Get the total memory usage by the subject map in bytes.
To be merged with CSubjectMap_Factory_Base.
Uint1 offset_bits_
Number of bits used to encode offset.
string NewSequenceInit(TSeqData &sd, TSeqNum start_chunk)
Start processing of the new input sequence.
vector< TWord > TLengthTable
Type of lengths table.
TSeqPos cur_lid_len_
Current length of local sequence.
TLengthTable lengths_
The table of subject sequence lengths.
void Save(CNcbiOstream &os) const
Save the subject map and sequence info.
TLIdMap lid_map_
Mapping of local sequence ids to chunks.
vector< SLIdMapElement > TLIdMap
Type of mapping of local sequence ids to chunks.
TWord MakeOffset(const Uint1 *seq, TSeqPos off) const
Encode an offset given a pointer to the compressed sequence data and relative offset.
CSubjectMap_Factory_TBase TBase
Base class.
bool CheckOffset(const Uint1 *seq, TSeqPos off) const
Check if index information should be produced for this offset.
CSubjectMap_Factory(const TOptions &options)
Object constructor.
bool AddSequenceChunk(bool &overflow)
Append the next chunk of the input sequence currently being processed to the subject map.
Type representing subject map data.
static const int chunk_size
const unsigned long WIDTH_32
32-bit index.
const unsigned long OFFSET_COMBINED
Combination of chunk number and chunk-based offset.
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed to encode offsets based on stride.
const unsigned long UNCOMPRESSED
No compression.
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
Uint1 base_value(objects::CSeqVectorTypes::TResidue r)
Conversion from IUPACNA to NCBI2NA (+1).
void WriteWord(CNcbiOstream &os, word_t word)
Write a word into a binary output stream.
static const unsigned long CR
CDbIndex::TWord TWord
Alias for CDbIndex::TWord type.
const std::string to_hex_str(TWord word)
Convert an integer to hex string representation.
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
objects::CSeqVectorTypes::TResidue TResidue
unsigned int TSeqPos
Type for sequence locations and lengths.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
TObjectType * GetNonNullPointer(void)
Get pointer value and throw a null pointer exception if pointer is null.
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static void hex(unsigned char c)
double value_type
The numeric datatype used by the parser.
#define ASSERT
macro for assert.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
CSequenceIStream::TSeqData TSeqData
Simple record type used to specify index creation parameters.
bool legacy
Indicator of the legacy index format.
unsigned long max_index_size
Maximum index size in megabytes.
unsigned long chunk_size
Long sequences are split into chunks of this size.
std::string stat_file_name
File to write index statistics into.
unsigned long ws_hint
Most likely word size to use for searches.
bool idmap
Indicator of the index map creation.
unsigned long hkey_width
Width of the hash key in bits.
unsigned long stride
Stride to use for stored database locations.
TWord data[DATA_UNIT_SIZE]
Type containing the sequence itself along with the masking information.
CRef< objects::CSeq_entry > seq_entry_
Sequence data.
TMask mask_locs_
Masked portion of the sequence.
Element of mapping of local sequence ids to chunks.
TSeqNum start_
First chunk.
TSeqPos seq_start_
Start of the combined sequence in seq_store.
TSeqNum end_
One past the last chunk.
TSeqPos seq_end_
End of the combined sequence in seq_store.
Type used to store a masked segment internally.
TSeqPos stop_
One past the end of the segment.
SSeqSeg(TSeqPos start, TSeqPos stop=0)
Object constructor.
TSeqPos start_
Start of the segment.
Information about the sequence chunk.
SSeqInfo(TWord start=0, TWord len=0, const TSegs &segs=TSegs())
Object constructor.
TSegs segs_
Valid intervals, i.e.
TWord seq_start_
Start of the compressed sequence data.
TWord len_
Sequence length.
std::vector< SSeqSeg > TSegs
Type containing the valid intervals.
static Uint4 letter(char c)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4