edit::CParseTextOptions options;
59options.SetStartText(
"ID=");
60options.SetStopText(
",");
61 m_Name= options.GetSelectedText(line);
63options.SetStartText(
",Number=");
64options.SetStopText(
",");
65 m_Number= options.GetSelectedText(line);
67options.SetStartText(
",Description=\"");
68options.SetStopText(
"\"");
77 unsignednr_lines = 0;
82 if(nr_lines % 200 == 0 && (canceled && canceled->
IsCanceled())) {
108edit::CParseTextOptions options;
109options.SetStartText(
"accession=");
110options.SetStopText(
",");
113options.SetStopText(
">");
130}
while(reader.
PeekChar() ==
'#');
137 "Line starting with ##fileformat is missing",
151 if(header_line.find(
" ") !=
NPOS|| header_line.find(
"\t") ==
NPOS) {
156 "Header line expected to be tab delimited",
160vector<string> col_names;
163 boolis_unique =
false;
165 set<string>unique_strs(col_names.begin(), col_names.end());
166is_unique = (col_names.size() == unique_strs.
size());
174 "Column names are not unique",
179 autoit = col_names.begin();
180 for(; it != col_names.end(); ++it) {
188 for(; it != col_names.end(); ++it, ++index) {
216 "Error allowance exceeded",
227 boolplaced = error_cont->
PutError(err);
235 "Error allowance exceeded",
243 autostart = chrono::steady_clock::now();
246 unsignednr_lines = 0;
247 unsignedlines_per_contig = 0;
250 stringprevious_chrom;
252vector<future<void>> async_calls;
253 autoJoinOptimization = [&async_calls]()
256 for(
auto& task : async_calls) {
263 autotask_start = chrono::steady_clock::now();
265 autoopt_start = chrono::steady_clock::now();
266var_map->FinalizeReading();
267 autodiff_opt = chrono::steady_clock::now() - opt_start;
268 LOG_POST(
Info<<
"Optimization of "<< chr <<
" took "<< chrono::duration_cast<chrono::milliseconds>(diff_opt).
count() <<
" ms");
270 if(on_variants_list_ready) {
271on_variants_list_ready(*var_map);
274 autodiff_opt = chrono::steady_clock::now() - task_start;
279 while(
in.good() && !
in.eof()) {
280 if(nr_lines % 1000 == 0 && (canceled && canceled->
IsCanceled())) {
294 if(line.empty() || (!line.empty() && line[0] ==
'#')) {
298 if(line.find(
"\t") ==
NPOS) {
303 "Has been skipped as it is not tab delimited:\n"+ line,
309 size_tpos = line.find(
"\t");
310 stringchrom = line.substr(0, pos);
311 if(chrom != previous_chrom) {
312 if(!previous_chrom.empty()) {
314async_calls.push_back(async(std::launch::async | std::launch::deferred, OptimizeVariantsList, std::ref(
m_ChromosomeMap.at(previous_chrom))));
318previous_chrom = chrom;
319lines_per_contig = 0;
324vars_list = inserted.first->second.GetPointer();
331chrom +
" data line found out of its block. All entries for a specific CHROM should form a contiguous block within the VCF file.",
338 if(prog_func && lines_per_contig > 0 && lines_per_contig % 500000 == 0) {
345vars_list->ParseLine(line);
359 autodiff_parsing = chrono::steady_clock::now() - start;
360 LOG_POST(
Info<<
"Parsed "<< nr_lines <<
" lines from VCF file in " 361<< chrono::duration_cast<chrono::milliseconds>(diff_parsing).
count() <<
" ms ");
370 if(!
in.eof() && !
in.good()) {
371 LOG_POST(
Error<<
"Reading cannot be completed, as input stream is corrupted");
384 if(on_variants_list_ready) {
396 if(header_line.find(
" ") !=
NPOS|| header_line.find(
"\t") ==
NPOS) {
401 "Header line is expected to be tab delimited",
407 const unsignedkMandatoryCols = 8;
408 unsignednr_tabs =
static_cast<unsigned>(
count(header_line.begin(), header_line.end(),
'\t'));
409 if(nr_tabs + 1 < kMandatoryCols) {
414 "Header line is expected to have at least 8 columns",
458vector<CColumnarVCFReader::TSeqIdVarsListPair>
468 autostart = chrono::steady_clock::now();
470 unsignednr_lines = 0;
471 unsignedlines_per_contig = 0;
474 size_tsearch_chrs = chr_list.size();
477vector<future<void>> async_calls;
478 autoJoinOptimization = [&async_calls]()
481 for(
auto& task : async_calls) {
487 autochr = var_map->GetChrName();
488 autotask_start = chrono::steady_clock::now();
490 autoopt_start = chrono::steady_clock::now();
491var_map->FinalizeReading();
492 autodiff_opt = chrono::steady_clock::now() - opt_start;
493 LOG_POST(
Info<<
"Optimization of "<< chr <<
" took "<< chrono::duration_cast<chrono::milliseconds>(diff_opt).
count() <<
" ms");
495 if(on_variants_list_ready) {
496on_variants_list_ready(*var_map);
499 autodiff_opt = chrono::steady_clock::now() - task_start;
503 autoCallOptimizeVarsList = [&]() {
505async_calls.push_back(async(std::launch::async | std::launch::deferred, OptimizeVariantsList, vcf_vars));
507OptimizeVariantsList(vcf_vars);
511 while(
in.good() && !
in.eof() && search_chrs > 0) {
512 if(nr_lines % 1000 == 0 && (canceled && canceled->
IsCanceled())) {
515variants_list.clear();
516 returnvariants_list;
528 if(line.empty() || (!line.empty() && line[0] ==
'#')) {
532 if(line.find(
"\t") ==
NPOS) {
537 "Has been skipped as it is not tab delimited:\n"+ line,
543 size_tpos = line.find(
"\t");
544 stringchrom = line.substr(0, pos);
545 if(!vcf_vars || (vcf_vars && !
NStr::EqualCase(vcf_vars->GetChrName(), chrom))) {
546 if(prev_chrom == chrom)
550 for(
const auto& syn_it : chr_list) {
551 const auto& seq_id = syn_it.first;
552 const auto& synonyms = syn_it.second;
553 if(find_if(synonyms.begin(), synonyms.end(),
554[&chrom](
const string& elem) { return NStr::EqualCase(chrom, elem); }) != synonyms.end()) {
558CallOptimizeVarsList();
559lines_per_contig = 0;
563 if(find_if(variants_list.begin(), variants_list.end(),
564[&seq_id](
const TSeqIdVarsListPair& elem) { return (seq_id->AsFastaString() == elem.first->AsFastaString()); }) == variants_list.end()) {
566vcf_vars = variants_list.back().second;
573chrom +
" data line found out of its block. All entries for a specific CHROM should form a contiguous block within the VCF file.",
583CallOptimizeVarsList();
585vcf_vars.
Reset(
nullptr);
589lines_per_contig = 0;
594 if(prog_func && lines_per_contig > 0 && lines_per_contig % 500000 == 0) {
599vcf_vars->ParseLine(line);
624 autodiff_parsing = chrono::steady_clock::now() - start;
625 LOG_POST(
Info<<
"Parsed "<< nr_lines <<
" lines in " 626<< chrono::duration_cast<chrono::milliseconds>(diff_parsing).
count() <<
" ms ");
631variants_list.clear();
632 returnvariants_list;
635 if(!
in.good() && !
in.eof()) {
636 LOG_POST(
Error<<
"Reading cannot be completed, as input stream is corrupted");
640variants_list.clear();
645OptimizeVariantsList(vcf_vars);
650 if(chr_list.size() != variants_list.size()) {
651 for(
const auto& chr_it : chr_list) {
652 if(find_if(variants_list.begin(), variants_list.end(),
654{ return elem.first->Equals(*chr_it.first); }) == variants_list.end()) {
656 autoid_str = chr_it.first->AsFastaString();
661 "Chromosome "+ id_str +
" is not in the file",
668 if(!on_variants_list_ready) {
669 for(
auto& var_it : variants_list) {
674 returnvariants_list;
679vector<string>
names;
681 names.push_back(it.first);
700it.second->GetStatistics(
out);
707it.second->SerializeVariantData(prefix,
out);
714it.second->DeserializeAndCheck(prefix,
out);
721it.second->List(
out, only_sv_cols);
728it.second->ListPositionVectors(
out);
Debugging functions (internal). Poorly documented, not well written.
Serialization for sparse_vector<>
void ListColumns(CNcbiOstream &out, bool only_sv_cols=false)
void SerializeToDisk(const string &prefix, CNcbiOstream *out=nullptr)
void GetStatistics(CNcbiOstream &out)
void Deserialize(const string &prefix, CNcbiOstream *out=nullptr)
void ListIndexVectors(CNcbiOstream &out)
bool m_LoadAllInfo
Flag to load every INFO field.
pair< CConstRef< objects::CSeq_id >, CRef< CVCFVariantList > > TSeqIdVarsListPair
void x_ProcessCriticalError(objects::CObjReaderLineException &err, objects::ILineErrorListener *error_cont)
vector< string > GetChromosomeNames() const
Returns a vector, holding the chrs/contigs identifiers, read from the file.
map< unsigned, string > m_SampleCols
List of SAMPLE columns parsed from the last line of the header, order is important.
unsigned x_ProcessHeaderLine(const string &header_line, unsigned line_nr, objects::ILineErrorListener *listener)
map< unsigned, string > m_LoadSamples
List of SAMPLES required to be loaded.
function< void(const string &)> TReportProgress
void x_ProcessError(objects::CObjReaderLineException &err, objects::ILineErrorListener *error_cont)
CRef< CVCFVariantList > GetVariantsForChr(const string &chr_name) const
Retrieves the variants list for a given chr/contig.
bool ReadHeader(CNcbiIstream &in, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr)
Reads only the header section of the file.
void x_GetSamplesToLoad(const string &header_line, objects::ILineErrorListener *listener, unsigned line_nr)
set< CConstRef< SVcfFieldData > > m_InfoFields
List of INFO fields parsed from the header of the file.
bool m_LoadAllSamples
Flag to load every SAMPLE column.
void x_ProcessWarning(objects::CObjReaderLineException &err, objects::ILineErrorListener *error_cont)
std::function< void(CVCFVariantList &)> TOnVCFVariantListReady
Defines a callable object, used when a variants list is processed by the reader.
void x_InterruptReading()
void x_GatherSampleColNames(const string &header_line, objects::ILineErrorListener *listener, unsigned line_nr)
unordered_map< string, CRef< CVCFVariantList > > m_ChromosomeMap
set< string > m_LoadInfoFields
List of INFO fields required to be loaded.
vector< TSeqIdVarsListPair > ReadVariantsForChrs(CNcbiIstream &in, const vector< pair< CConstRef< objects::CSeq_id >, vector< string >>> &chr_list, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr, TReportProgress prog_func=TReportProgress(), TOnVCFVariantListReady on_variants_list_ready=TOnVCFVariantListReady())
Reads a list of variants.
bool ReadData(CNcbiIstream &in, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr, TReportProgress prog_func=TReportProgress(), TOnVCFVariantListReady on_variants_list_ready=TOnVCFVariantListReady())
Reads only the data section of the file.
void Throw(void) const
Use this function to throw this object.
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
std::string Message() const
Simple implementation of ILineReader for i(o)streams.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
const string & GetChrName() const
static const string sm_FORMAT
Interface for testing cancellation request in a long lasting operation.
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_GeneralParsingError
string SeverityStr() const
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator end() const
const Uint8 kAsyncVarsThreshold
std::ofstream out("events_result.xml")
main entry point for tests
static const struct name_t names[]
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
@ eDiag_Warning
Warning message.
@ eDiag_Critical
Critical error message.
void Error(CExceptionArgs_Base &args)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
void Info(CExceptionArgs_Base &args)
char PeekChar(void) const
Returns the first character of the next string without consuming it.
Uint8 GetLineNumber(void) const
Returns the current line number (counting from 1, not 0).
void Reset(void)
Reset reference object.
TObjectType * Release(void)
Release a reference to the object and return a pointer to the object.
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
virtual bool IsCanceled(void) const =0
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Lightweight interface for getting lines of data with minimal memory copying.
Compressed bitset (entry point to bm.h)
std::istream & in(std::istream &in_, double &x_)
Structure to store characteristics of an INFO field. It is constructed from an INFO meta-information l...
string m_Name
INFO ID (name)
string m_Description
INFO Description.
SVcfFieldData(const string &line)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4