:
CVCFVariantsBase(load_all_info, info_fields, sample_cols), m_ChrName(chr_name)
98 const unsigned char* buf_ptr = (
const unsigned char*)
data.data();
100 boolskip_max_feat_length =
true;
111 if(!found_end_col) {
118 if(
Count() > 100000) {
124 autostart = chrono::steady_clock::now();
127 autoend_it = end_vector.
begin();
128 autoend_stop_it = end_vector.end();
131 for(; end_it != end_stop_it; ++end_it) {
132 if(!end_it.is_null()) {
136 inttmp_delta = end_point - start_point + 1;
138tmp_delta = tmp_delta * (-1);
145 autodiff = chrono::steady_clock::now() - start;
146 LOG_POST(
Info<<
"Finding maxfeat length for "<<
Count() <<
" variants took: " 147<< chrono::duration_cast<chrono::milliseconds>(diff).
count() <<
"ms");
155 char* write_ptr =
m_Buffer->data();
156buf_ptr = (
const unsigned char*)
data.data();
160memcpy(write_ptr, &idx_size,
sizeof(idx_size));
161write_ptr +=
sizeof(idx_size);
162buf_ptr +=
sizeof(idx_size);
165memcpy(write_ptr, &nr_cols,
sizeof(nr_cols));
166write_ptr +=
sizeof(nr_cols);
167buf_ptr +=
sizeof(nr_cols);
173memcpy(write_ptr, buf_ptr,
data.size() -
sizeof(idx_size) -
sizeof(nr_cols));
187 size_tpos = line.find(
"\t");
194 stringremainder = line;
198 booladded = (errno == 0);
209vector<string> var_ids;
211var_ids.push_back(
value);
215 for(
auto& it : var_ids) {
218 sort(var_ids.begin(), var_ids.end());
219var_ids.erase(unique(var_ids.begin(), var_ids.end()), var_ids.end());
225 autocol_it = col_names.begin();
229 if(var_ids.size() == 1) {
230 for(++col_it; col_it != col_names.end() - 1; ++col_it) {
238 value= remainder.substr(pos + 1,
NPOS);
243vector<string> tmp_values;
244 for(++col_it; col_it != col_names.end() - 1; ++col_it) {
247tmp_values.push_back(
value);
253 value= remainder.substr(pos + 1,
NPOS);
256tmp_values.push_back(
value);
258 for(
size_tindex = 1; index < var_ids.size(); ++index) {
263col_it = col_names.begin();
267 for(++col_it; col_it != col_names.end() &&
n< tmp_values.size(); ++col_it, ++
n) {
297line = line.substr(pos + 1,
NPOS);
298pos = line.find(
"\t");
304 returnline.substr(0, pos);
314 for(; it != it_end; ++it) {
316 autoret =
tmp.insert(*it);
317 if(ret.second ==
false) {
319 autocopies_it = find_if(copies.begin(), copies.end(),
320[&search](
constpair<string, unsigned>& elem) { return elem.first == search; });
321 if(copies_it == copies.end()) {
322copies.emplace_back(search, 2);
330 return(copies.empty());
337 if(indices.count() > 0) {
354 sort(positions.begin(), positions.end());
355positions.erase(unique(positions.begin(), positions.end()), positions.end());
390 out<< var_iter.GetPosition() <<
"\t"<< var_iter.GetVariantID() <<
"\t" 391<< var_iter.GetRef() <<
"\t"<< var_iter.GetAlt();
393 out<<
"\t"<< var_iter.GetQual() <<
"\t"<< var_iter.GetFilter()
394<<
"\t"<< var_iter.GetInfo() <<
"\t"<< var_iter.GetFormat();
403 for(
const auto& it : samples) {
408 out<< var_iter.GetPosition() <<
"\t"<< var_iter.GetVariantID() <<
"\t";
409 for(
const auto& it : samples) {
410 out<< var_iter.GetSample(it) <<
"\t";
418vector<string> variants;
420variants.push_back(var_iter.GetVariantID());
430 out<<
"Resetting stat_sum...."<< endl;
432 boolJaccard_index =
false;
435 for(
const auto& it : col_names) {
437 out<<
"-----------------------"<< it <<
" vector-------------------------"<< endl;
438 LOG_POST(
Info<<
"-----------------------"<< it <<
" vector-------------------------");
445str_vector, Jaccard_index);
457 unsignedcum_memory_used = 0;
458 unsignedcum_layout_size = 0;
461*
out<<
"\nStarting to serialize columns for chr: "<<
m_ChrName<< endl;
468 for(
const auto& it : col_names) {
474 for(
const auto& it : info_fields) {
480 for(
const auto& it : samples) {
486*
out<< endl <<
"Total memory used: "<< cum_memory_used << endl << endl;
487*
out<<
"Total layout size: "<< cum_layout_size << endl;
494 catch(
constexception& e) {
507*
out<<
"\nStarting to deserialize blobs for chr: "<<
m_ChrName<< endl;
514 for(
const auto& it : col_names) {
520 for(
const auto& it : info_fields) {
526 for(
const auto& it : samples) {
535 catch(
constexception& e) {
548 for(
const auto& it : col_names) {
555 for(
const auto& it : info_fields) {
560 for(
const auto& it : samples) {
568 catch(
constexception& e) {
585: m_ColsDecode(cols_to_decode)
586, m_BufferPtr(
data)
610 sort(positions.begin(), positions.end());
611positions.erase(unique(positions.begin(), positions.end()), positions.end());
621 if(indices.count() > 0) {
629 out<< var_iter.GetPosition() <<
"\t"<< var_iter.GetVariantID() <<
"\t" 630<< var_iter.GetRef() <<
"\t"<< var_iter.GetAlt() <<
"\t" 631<< var_iter.GetQual() <<
"\t"<< var_iter.GetFilter() <<
"\t" 632<< var_iter.GetInfo();
633 if(var_iter.IsSetFormat()) {
634 out<<
"\t"<< var_iter.GetFormat() <<
"\t"<< var_iter.GetSampleCols();
#define BM_SCALAR_VERSION
Debugging functions (internal). Poorly documented, not well written.
Class supporting low-level input/output for files.
void GetStatistics(bm::bv_statistics &stat_sum, bool Jaccard_index, CNcbiOstream &out)
void DeserializeVectors(const string &prefix, CNcbiOstream *out)
void FinalizeReading()
Flushes the insert iterators after which it remaps and optimizes each vector.
unsigned GetPositionForIndex(const size_t &index) const
void Lookup(const TSparseStrVector::bvector_type &values, vector< unsigned > &indices) const
const size_t & GetMaxIndex() const
bool Add(const unsigned &index, const unsigned &value)
void SerializeVectors(const string &prefix, CNcbiOstream *out, unsigned &cum_memory_used, unsigned &cum_layout_size)
bool RemoveSerializedOutput(const string &prefix)
CVCFSlicedVariants(const unsigned char *data, const TSeqRange *range=nullptr, const set< string > &cols_to_decode=set< string >(), bool only_start=false)
void List(CNcbiOstream &out) const
virtual void GetPositionsForMissingVarID(vector< unsigned > &positions)
const unsigned char * m_BufferPtr
size_t m_NrCols
number of data columns to be deserialized
virtual bool GetPositionsForVariant(const string &variant_id, vector< unsigned > &positions)
set< string > m_ColsDecode
the name of data columns to be deserialized
virtual bool GetPositionsForVariant(const string &variant_id, vector< unsigned > &positions)
void ParseLine(const string &line)
void GetStatistics(CNcbiOstream &out)
CVCFVariantList(const string &chr_name, bool load_all_info=true, const set< string > &info_fields=set< string >(), const map< unsigned, string > &sample_cols=map< unsigned, string >())
bool RemoveSerializedOutput(const string &prefix)
void List(CNcbiOstream &out, bool only_sv_cols=false) const
virtual void GetPositionsForMissingVarID(vector< unsigned > &positions)
const vector< char > & GetSerializedData() const
bool AreVariantIdsUnique(vector< pair< string, unsigned >> &copies)
void WriteSerializedData(const string &filename)
bool operator==(const CVCFVariantList &other) const
string x_GetFilePrefix(const string &prefix) const
string x_ParseNextColumn(string &line, size_t &pos)
void ListSamples(CNcbiOstream &out) const
vector< string > GetAllVariantIDS() const
bool SerializeVariantData(const string &prefix, CNcbiOstream *out=nullptr)
bool DeserializeAndCheck(const string &prefix, CNcbiOstream *out=nullptr)
bool x_DeserializeColumn(const string &col_name, const unsigned char *buf_ptr, const size_t &nr_cols)
static const string sm_ID
void x_DeserializeIndexVectors(const unsigned char *&buf_ptr, size_t &nr_cols, bool skip_feat_length=false)
size_t GetNumberOfIndexVecs() const
void x_DeserializeDescr_Range(const unsigned char *buf_ptr, const size_t &nr_cols, const TSeqRange *range=nullptr, const set< string > &cols_to_decode=set< string >(), bool only_start=false)
static const vector< string > & s_GetAllColNames()
contains sm_INFO, sm_SAMPLES
static string s_GetPreviousVersion()
static const vector< string > & s_GetColNames()
does not contain sm_INFO and sm_SAMPLES
TSeqPos m_StartPos
in genomic coordinates (1-based)
CPosToIndex m_Posindexmap
void x_DeserializeAllData()
static const string sm_MissingValue
vector< char > * m_Buffer
TSeqPos m_StopPos
in genomic coordinates (1-based)
void x_SerializeData() const
unsigned m_MaxFeatLength
maximum feature variant length
CVariantDescriptors m_Descriptors
bool GetIndicesForVariant(const string &variant_id, TSparseStrVector::bvector_type &indices) const
bool AreInsertersReady() const
const TSparseOptVector & GetSample(const string &name) const
TSparseOptVector & SetSample(const string &name)
const TSparseOptVector & GetInfoField(const string &field_name) const
TSparseOptVector & SetInfoField(const string &field_name)
vector< string > GetInfoFieldNames() const
void FinalizeReading()
Flushes the insert iterators after which it remaps and optimizes each vector.
void PushBackPos(const unsigned &value)
Push back starting position of a variant.
vector< string > GetSampleNames() const
unsigned GetMaxFeatureLength()
Returns the maximum feature length within the set or 0 if the end points are not specified.
void PushBack(const string &label, const string &value)
Push back 'value' into the vector identified by 'label'. The 'value' is not actually stored in the vec...
void GetIndicesForMissingVarID(TSparseStrVector::bvector_type &indices) const
const_iterator begin() const noexcept
Provide const iterator access to container content.
std::ofstream out("events_result.xml")
main entry point for tests
#define LOG_POST(message)
This macro is deprecated, and it is strongly recommended that all projects (except tests) move to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
void Info(CExceptionArgs_Base &args)
void Close(void)
Close file.
void Open(const string &filename, EOpenMode open_mode, EAccessMode access_mode, EShareMode share_mode=eShare)
Open file.
Uint8 GetFileSize(void) const
Get file size.
size_t Read(void *buf, size_t count) const
Read file.
@ eRead
File can be read.
@ eOpen
Open an existing file, or create a new one.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
@ fConvErr_NoThrow
Do not throw an exception on error.
static const char label[]
Lightweight interface for getting lines of data with minimal memory copying.
bool RemoveFile(const string &fname)
void SerializeColumn(SV &vec, const string &prefix, const string &col_name, CNcbiOstream *out, unsigned &cum_memory_used, unsigned &cum_layout_size)
void PrintStats(const bm::bv_statistics &sum, CNcbiOstream &out)
void AddStats(bm::bv_statistics &sum, SV &vec, CNcbiOstream &out)
void DeserializeColumn(SV &vec, const string &prefix, const string &col_name, CNcbiOstream *out)
void PrintToFile(const char *buff, size_t size, const string &fname)
string GenerateColFileName(const string &prefix, const string &col_name)
void print_svector_stat(TOut &tout, const SV &svect, bool print_sim=false)
constexpr auto sort(_Init &&init)
const string version
version string
const GenericPointer< typename T::ValueType > T2 value
Compressed bitset (entry point to bm.h)
static sljit_uw total_size
Structure with statistical information about memory allocation footprint, serialization projection,...
void reset() noexcept
Reset statistics.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4