found_index_file =
103 TIndxdisk_file_length(0);
104 boolfound_data_file =
126 Int4num_elements(0);
165 while(Stop >= Start) {
166SampleNum = ((
Uint4)(Stop + Start)) >> 1;
171 const void* keydatap(0);
197Start = SampleNum +1;
228 "Error: Unable to use ISAM index in batch mode.");
248 intgilist_index = 0;
251 const void* data_page (0);
254 intstart = 0, num_elements = 0;
261 for(
int i= 0;
i< num_elements;
i++) {
276 if(gilist_index < gilist_size) {
285 if(isam_data < vol_end) {
308 boolsameAccession =
false;
313 if(acc2 == accession) {
314sameAccession =
true;
318 returnsameAccession;
325 boolsameAccession =
false;
326 if(currIndex < num_keys - 1) {
329 returnsameAccession;
339 if(! gilist_size)
return;
348 "Error: Unable to use ISAM index in batch mode.");
352vector<string> sample_keys;
353vector<TIndx> page_offs;
365 intgilist_index = 0;
366 intsample_index = 0;
381 for(
int i= 0;
i< num_keys;
i++) {
385 if(gilist_index < gilist_size) {
391 if(vals[
i] < vol_end) {
434 Int4 last= Start + NumElements - 1;
436 const void* KeyDataPage =
NULL;
437 const void* KeyDataPageStart =
NULL;
445KeyDataPage = (
char*)KeyDataPageStart - Start *
m_TermSize;
447 boolfound (
false);
458}
else if(Key < Number) {
466 if(found ==
false) {
481*Index = Start + current;
532 TIndxoffset_begin = KeyOffset;
533 TIndxterm_end = KeyOffset + term_in.size() + 1;
534 TIndxmap_end = term_end + at_least;
536 if(map_end > file_length) {
537map_end = file_length;
539 if(term_end > map_end) {
541 result=
int(file_length - offset_begin);
550file_data + term_in.size() + 1,
553 if(dc_result != -1) {
597 const char* file_data = begin;
598 intbytes =
int(end - begin);
600 for(
i= 0; (
i< bytes) &&
i< (
int) term_in.size();
i++) {
601 char ch1= term_in[
i];
602 char ch2= file_data[
i];
619 const char* p = file_data +
i;
621 while((p < end) && ((*p) ==
' ')) {
625 if(((p == end) ||
ENDS_ISAM_KEY(*p)) && (
i== (
int) term_in.size())) {
638vector<TIndx> & indices_out,
639vector<string> & keys_out,
640vector<string> & data_out)
644 boolignore_case =
true;
648 const char* indexp(beginp);
649 boolfound_match(
false);
651 while(indexp < endp) {
665indices_out.push_back(page_index + TermNum);
690vector<TIndx> & indices_out,
691vector<string> & keys_out,
692vector<string> & data_out)
699 boolignore_case =
true;
704 booldone_b(
false), done_e(
false);
706 const char* beginp(0);
707 const char* endp(0);
712 while(! (done_b && done_e)) {
713 if(sample_index < pre_amt) {
717beg_off = sample_index - pre_amt;
724end_off = sample_index + post_amt;
727 x_LoadPage(beg_off, end_off, & beginp, & endp);
735 if(diff_begin != -1) {
743 const char* last_term(0);
744 const char* p(endp-1);
748 enum{ eEndNulls, eLastTerm } search_stage = eEndNulls;
753 if(search_stage == eEndNulls) {
755search_stage = eLastTerm;
776 if(diff_end != -1) {
794 const char* map_end,
795vector<string> & keys_out,
796vector<string> & data_out)
798 const char* data_ptr(0);
799 const char* p(key_start);
801 while(p < map_end) {
805keys_out.push_back(
string(key_start, data_ptr));
806data_out.push_back(
string(data_ptr+1, p));
808keys_out.push_back(
string(key_start, p));
809data_out.push_back(
"");
827 TIndxoffset_begin = sample_offset + (sample_num *
sizeof(
Uint4));
844 const char* key_offset_addr =
849 for(
int i= 0;
i<length;
i++) {
850 if(! key_offset_addr[
i]) {
857 str.assign(key_offset_addr, length);
874 boolignore_case(
true);
882 TIndxoffset_begin = SampleOffset + (SampleNum *
sizeof(
Uint4));
901 const char** beginp,
907 _ASSERT(SampleNum2 > SampleNum1);
935vector<string> & terms_out,
936vector<string> & values_out,
937vector<TIndx> & indices_out)
943 boolshort_match(
false);
944 boolfollow_match(
false);
946 size_tpreexisting_data_count = values_out.size();
957 boolignore_case =
true;
964 intLength = (
int) term_in.size();
977 while(Stop >= Start) {
978SampleNum = ((
Uint4)(Stop + Start)) >> 1;
982 intdiff =
x_DiffSample(term_in, SampleNum, KeyOffset);
991 if(BytesToEnd > (
TIndx) max_lines_2) {
992BytesToEnd = max_lines_2;
1009 if(short_match && (diff >= Length)) {
1013 while(SampleNum > 0) {
1027 if(prefix != term_in) {
1035found_short = SampleNum + 1;
1050found_short = SampleNum;
1059?
tolower((
unsigned char) term_in[diff]) <
tolower((
unsigned char) KeyData[diff])
1060: term_in[diff] < KeyData[diff]) {
1063Start = SampleNum + 1;
1070 if( (SampleNum < 0) || (SampleNum >=
m_NumSamples)) {
1076 const char* beginp(0);
1077 const char* endp(0);
1079 x_LoadPage(SampleNum, SampleNum + 1, & beginp, & endp);
1095 if(preexisting_data_count == values_out.size()) {
1108m_IdentType (ident_type),
1109m_IndexLease (atlas),
1110m_DataLease (atlas),
1117m_Initialized (
false),
1118m_KeySampleOffset(0),
1119m_TestNonUnique (
true),
1128 switch(ident_type) {
1143 "Error: ident type argument not valid");
1157 string msg(
"Error: Could not open input file (");
1176 string& index_name,
1180(!
isalpha((
unsigned char) prot_nucl)) ||
1181(!
isalpha((
unsigned char) file_ext_char))) {
1185 "Error: argument not valid");
1188index_name.reserve(
dbname.size() + 4);
1189data_name.reserve(
dbname.size() + 4);
1193index_name += prot_nucl;
1194index_name += file_ext_char;
1196data_name = index_name;
1205 stringiname, dname;
1237vector<TOid> & oids,
1239 bool& version_check)
1242 boolstrip_version = version_check;
1243version_check =
false;
1251 boolfound =
false;
1253 stringaccession(
string(
"gb|") + acc +
"|");
1254 stringlocus_str(
string(
"gb||") + acc);
1258vector<string> keys_out;
1259vector<string> data_out;
1260vector<TIndx> indices_out;
1266indices_out)) < 0) {
1278indices_out)) < 0) {
1292indices_out)) < 0) {
1302 if((! found) && strip_version) {
1303 size_tpos = acc.find(
".");
1305 boolis_version =
false;
1307 if(pos != string::npos) {
1308 intver_len =
static_cast<int>(acc.size() - pos) - 1;
1310is_version = (ver_len <= 3 && ver_len >= 1);
1312 for(
size_tvp = pos+1; vp < acc.size(); vp++) {
1314is_version =
false;
1321 stringnover(acc, 0, pos);
1329 if(data_out.size()) {
1330version_check =
true;
1357 if(
id.
size() &&
1361indices_out)) < 0)) {
1372 ITERATE(vector<string>, iter, data_out) {
1373oids.push_back(atoi((*iter).c_str()));
1383cerr <<
" this should be derived from readdb_acc2fastaEx().."<< endl;
1399x_TranslateGiList<TGi>(vol_start, ids);
1403x_TranslateGiList<TTi>(vol_start, ids);
1407x_TranslateGiList<string>(vol_start, ids);
1411x_TranslateGiList<TPig>(vol_start, ids);
1417 "Error: Wrong type of idlist specified.");
1473 intnum_elements(0);
1475 const void* data_page(0);
1509elem_index = num_elements - 1;
1521 const char* beginp(0);
1522 const char* endp(0);
1527 x_LoadPage(Start, Start + 1, & beginp, & endp);
1531vector<string> keys_out;
1532vector<string> data_out;
1546 x_LoadPage(Stop, Stop + 1, & beginp, & endp);
1550 const char* lastp(0);
1551 const char* indexp(beginp);
1553 while(indexp < endp) {
1668vector<TOid> & oids)
1677 boolfound =
false;
1683vector<string> keys_out;
1684vector<string> data_out;
1685vector<TIndx> indices_out;
1690indices_out)) < 0) {
1700 ITERATE(vector<string>, iter, data_out) {
1701oids.push_back(atoi(iter->c_str()));
bool GetFileSizeL(const string &fname, TIndx &length)
Get size of a file.
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
void Init(const string &filename)
Initializes a memory map object.
void Clear()
Clears the memory map object.
bool OutsideLastBound(Int8 ident)
Returns true if the provided integer compares as higher than the assigned upper boundary for this ISA...
bool IsSet()
Returns true if this object has an assigned value.
string GetString() const
Fetch the string value of this object.
void SetString(const string &ident)
Assign a string value to this object.
void SetNumeric(Int8 ident)
Assign a numeric value to this object.
Int8 GetNumeric() const
Fetch the numeric value of this object.
bool OutsideFirstBound(Int8 ident)
Returns true if the provided integer compares as lower than the assigned lower boundary for this ISAM...
EErrorCode x_StringSearch(const string &term_in, vector< string > &term_out, vector< string > &value_out, vector< TIndx > &index_out)
String identifier lookup.
EErrorCode x_SearchIndexNumeric(Int8 Number, int *Data, Uint4 *Index, Int4 &SampleNum, bool &done)
Index file search.
CSeqDBIsam(CSeqDBAtlas &atlas, const string &dbname, char prot_nucl, char file_ext_char, ESeqDBIdType ident_type)
Constructor.
@ eNumericLongId
This type is not supported.
@ eString
This type is not supported.
@ eNumericNoData
Numeric database with Key/Value pairs in the index file.
void x_SearchNegativeMulti(int vol_start, int vol_end, CSeqDBNegativeList &gis, bool use_tis)
Negative ID List Translation.
CSeqDBFileMemMap m_DataLease
A persistent lease on the ISAM data file.
TIndx m_IndexFileLength
The length of the ISAM index file.
bool m_LongId
Use Uint8 for the key.
int x_DiffCharLease(const string &term_in, CSeqDBFileMemMap &lease, const string &file_name, TIndx file_length, Uint4 at_least, TIndx KeyOffset, bool ignore_case)
Find the first character to differ in two strings.
int x_DiffChar(const string &term_in, const char *begin, const char *end, bool ignore_case)
Find the first character to differ in two strings.
int x_GetPageNumElements(Int4 SampleNum, Int4 *Start)
Determine the number of elements in the data page.
ESeqDBIdType m_IdentType
The type of identifier this class uses.
SIsamKey m_LastKey
Last volume key.
Int4 m_IdxOption
Options set by upper layer.
void x_LoadData(CSeqDBFileMemMap &lease, vector< T > &keys, vector< int > &vals, int num_keys, TIndx begin)
Load and extract a data page into array at once.
void x_GetDataElement(const void *dpage, int index, Int8 &key, int &data)
Get a particular data element from a data page.
Int4 m_NumSamples
Number of terms in ISAM index.
void HashToOids(unsigned hash, vector< TOid > &oids)
Sequence hash lookup.
EErrorCode
Exit conditions occurring in this code.
@ eBadVersion
Lookup was successful.
@ eBadType
The format version of the ISAM file is unsupported.
@ eWrongFile
The requested ISAM type did not match the file.
@ eNoError
The key was not found.
@ eInitFailed
The file was not found, or was the wrong length.
int x_DiffSample(const string &term_in, Uint4 SampleNum, TIndx &KeyOffset)
Find the first character to differ in two strings.
~CSeqDBIsam()
Destructor.
Uint8 x_GetNumericKey(const void *p)
void x_LoadIndex(CSeqDBFileMemMap &lease, vector< T > &keys, vector< TIndx > &offs)
Load and extract all index samples into array at once.
bool x_SparseStringToOids(const string &acc, vector< int > &oids, bool adjusted)
Lookup a string in a sparse table.
void x_FindIndexBounds()
Find the least and greatest keys in this ISAM file.
Int4 m_NumTerms
Number of terms in database.
void IdsToOids(int vol_start, int vol_end, CSeqDBGiList &ids)
Translate Gis and Tis to Oids for the given ID list.
EErrorCode x_SearchDataNumeric(Int8 Number, int *Data, Uint4 *Index, Int4 SampleNum)
Data file search.
int TOid
This class works with OIDs relative to a specific volume.
bool m_Initialized
Flag indicating whether initialization has been done.
TIndx x_GetIndexKeyOffset(TIndx sample_offset, Uint4 sample_num)
Get the offset of the specified sample.
static void x_MakeFilenames(const string &dbname, char prot_nucl, char file_ext_char, string &index_name, string &data_name)
Make filenames for ISAM file.
static void x_Lower(string &s)
Converts a string to lower case.
bool x_OutOfBounds(Int8 key)
Check whether a numeric key is within this volume's bounds.
void x_SearchNegativeMultiSeq(int vol_start, int vol_end, CSeqDBNegativeList &gis)
EErrorCode x_InitSearch(void)
Initialize the search object.
void x_GetIndexString(TIndx key_offset, int length, string &prefix, bool trim_to_null)
Read a string from the index file.
void x_ExtractPageData(const string &term_in, TIndx page_index, const char *beginp, const char *endp, vector< TIndx > &indices_out, vector< string > &keys_out, vector< string > &data_out)
Find matches in the given memory area of a string ISAM file.
void GetIdBounds(Int8 &low_id, Int8 &high_id, int &count)
Get Numeric Bounds.
Int4 m_PageSize
Page size of ISAM index.
TIndx m_DataFileLength
The length of the ISAM data file.
void UnLease()
Return any memory held by this object to the atlas.
int m_Type
The format type of database files found (eNumeric or eString).
TIndx m_KeySampleOffset
Offset of samples in index file.
SIsamKey m_FirstKey
First volume key.
void x_LoadPage(TIndx SampleNum1, TIndx SampleNum2, const char **beginp, const char **endp)
Map a page into memory.
void x_ExtractAllData(const string &term_in, TIndx sample_index, vector< TIndx > &indices_out, vector< string > &keys_out, vector< string > &data_out)
Find matches in the given page of a string ISAM file.
void x_MapDataPage(int sample_index, int &start, int &num_elements, const void **data_page_begin)
Map a data page.
CSeqDBAtlas::TIndx TIndx
Type which is large enough to span the bytes of an ISAM file.
int x_GetNumericData(const void *p)
void StringToOids(const string &acc, vector< TOid > &oids, bool adjusted, bool &version_check)
String translation.
bool x_FindInNegativeList(CSeqDBNegativeList &ids, int &index, Int8 key, bool use_tis)
Find ID in the negative GI list using PBS.
Int4 m_MaxLineSize
Maximum string length in the database.
void x_ExtractData(const char *key_start, const char *entry_end, vector< string > &key_out, vector< string > &data_out)
Extract the data from a key-value pair in memory.
EErrorCode x_NumericSearch(Int8 Number, int *Data, Uint4 *Index)
Numeric identifier lookup.
string m_DataFname
The filename of the ISAM data file.
static bool IndexExists(const string &dbname, char prot_nucl, char file_ext_char)
Check if a given ISAM index exists.
string m_IndexFname
The filename of the ISAM index file.
int m_TermSize
Size of the numeric key-data pair.
bool x_IdentToOid(Int8 id, TOid &oid)
Numeric identifier lookup.
CSeqDBFileMemMap m_IndexLease
A persistent lease on the ISAM index file.
CSeqDBAtlas & m_Atlas
The memory management layer.
int GetNumTis() const
Get the number of TIs in the array.
void AddIncludedOid(int oid)
Include an OID in the iteration.
void AddVisibleOid(int oid)
Indicate a visible OID.
int GetNumGis() const
Get the number of GIs in the array.
int GetNumSis() const
Get the number of SeqIds in the array.
void InsureOrder()
Sort list if not already sorted.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual bool Exists(void) const
Check existence of file.
const string AsFastaString(void) const
@ fParse_RawText
Try to ID raw non-numeric accessions.
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
const string version
version string
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
static const BitmapCharRec ch1
static const BitmapCharRec ch2
Useful/utility classes and methods.
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eStringId
Each PIG identifier refers to exactly one protein sequence.
@ eTiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ ePigId
Trace ID is a numeric identifier for Trace sequences.
@ eHashId
Some sequence sources use string identifiers.
#define SEQDB_ISEOL(x)
Macro for EOL chars.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
USING_SCOPE(objects)
Place these definitions in the ncbi namespace.
#define DEFAULT_SISAM_SIZE
Default page size for string indices.
static bool ENDS_ISAM_KEY(char P)
Returns true if the character is a terminator for an ISAM key.
#define DEFAULT_NISAM_SIZE
Default page size for numeric indices.
#define ISAM_VERSION
Format version of the ISAM files.
#define MEMORY_ONLY_PAGE_SIZE
Special page size value which indicates a memory-only string index.
static bool s_IsSameAccession(string acc1, string acc2)
static char s_SeqDBIsam_NullifyEOLs(char c)
Return NUL for nulls or EOL characters.
const char ISAM_DATA_CHAR
The terminating character for string ISAM keys when data is present.
ISAM index database access object.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4