:
case 'A':
return0;
67 case 'c':
case 'C':
return1;
68 case 'g':
case 'G':
return2;
69 case 't':
case 'T':
return3;
/// Tell whether a character is anything other than an unambiguous
/// nucleotide letter.
///
/// @param c  character to test
/// @return   false when c is one of 'a', 'A', 'c', 'C', 'g', 'G', 't', 'T';
///           true for every other character.
static inline bool ambig( char c )
{
    return    c != 'a' && c != 'A' && c != 'c' && c != 'C'
           && c != 'g' && c != 'G' && c != 't' && c != 'T';
}
87 if( bioseq.CanGetInst()
88&& bioseq.GetInst().CanGetLength()
89&& bioseq.GetInst().CanGetSeq_data() )
92 const CSeq_data& seqdata( bioseq.GetInst().GetSeq_data() );
93unique_ptr< CSeq_data > dest(
new CSeq_data);
96 returndest->GetIupacna().Get();
125 const string& arg_input,
127 const string& infmt_arg,
128 const string& sformat,
129 const string& arg_th,
132 Uint8arg_genome_size,
135 boolarg_check_duplicates,
139 booluse_ba,
string const& metadata,
140 doublemin_pct,
doubleextend_pct,
doublethres_pct,
doublemax_pct )
141:
input( arg_input ),
143sformat, os, use_ba, metadata ) ),
144max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
145genome_size( arg_genome_size ),
146min_count( arg_min_count == 0 ? 1 : arg_min_count ),
149t_high( arg_max_count ),
150has_min_count( arg_min_count != 0 ),
151no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
152check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
154score_counts( max_count, 0 ),
155ids( arg_ids ), exclude_ids( arg_exclude_ids ),
159string::size_type pos( 0 );
162 while( pos != string::npos &&
count< 4 )
164string::size_type newpos = arg_th.find_first_of(
",", pos );
165 th[
count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
166pos = (newpos == string::npos ) ? newpos : newpos + 1;
172 const string& arg_input,
174 const string& infmt_arg,
175 const string& sformat,
176 const string& arg_th,
179 Uint8arg_genome_size,
182 boolarg_check_duplicates,
186 booluse_ba,
string const& metadata,
187 doublemin_pct,
doubleextend_pct,
doublethres_pct,
doublemax_pct )
188:
input( arg_input ),
190sformat,
output, use_ba, metadata ) ),
191max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
192genome_size( arg_genome_size ),
193min_count( arg_min_count == 0 ? 1 : arg_min_count ),
196t_high( arg_max_count ),
197has_min_count( arg_min_count != 0 ),
198no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
199check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
201score_counts( max_count, 0 ),
202ids( arg_ids ), exclude_ids( arg_exclude_ids ),
206string::size_type pos( 0 );
209 while( pos != string::npos &&
count< 4 )
211string::size_type newpos = arg_th.find_first_of(
",", pos );
212 th[
count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
213pos = (newpos == string::npos ) ? newpos : newpos + 1;
216 if( min_pct >= 0.0 )
th[0] = min_pct;
217 if( extend_pct >= 0.0 )
th[1] = extend_pct;
218 if( thres_pct >= 0.0 )
th[2] = thres_pct;
219 if( max_pct >= 0.0 )
th[3] = max_pct;
229vector< string > file_list;
237 while( getline( fl_stream, line ) ) {
238 if( !line.empty() ) {
239file_list.push_back( line );
254 LOG_POST(
"computing the genome length");
258 i!= file_list.end(); ++
i)
286 while( suffix_size > 0 ) {
287 Uint8units_needed( 1ULL<<(2*suffix_size) );
288 if( units_needed <= n_units )
break;
292 NCBI_ASSERT( suffix_size > 0,
"suffix size is 0");
297 Uint4prefix_exp( 1<<(2*prefix_size) );
301 for(
Uint4prefix( 0 ); prefix < prefix_exp; ++prefix ) {
312 Uint4index[4] = {0, 0, 0, 0};
313 doubleprevious( 0.0 );
339 for(
Uint1j( 0 ); j < 4; ++j )
340 if( previous <
th[j] && current >=
th[j] )
365 for(
Uint4prefix( 0 ); prefix < prefix_exp; ++prefix )
366 process( prefix, prefix_size, file_list,
true);
397s <<
" "<<
th[
i] <<
"%% threshold at "<< index[
i];
411 constvector< string > & input_list,
415 Uint8vector_size( 1ULL<<(2*suffix_size) );
416vector< Uint4 > counts( vector_size, 0 );
418 Uint4prefix_mask( ((1<<(2*prefix_size)) - 1)<<(2*suffix_size) );
419 Uint4suffix_mask( (1<<2*suffix_size) - 1 );
420 if(
unit_size== 16 ) unit_mask = 0xFFFFFFFF;
422 if( suffix_size == 16 )
424suffix_mask = 0xFFFFFFFF;
428 _TRACE(
"prefix: "<< prefix <<
429 "\nprefix_size: "<< (
int)prefix_size <<
430 "\nsuffix_size: "<< (
int)suffix_size <<
431 "\nvector_size: "<< vector_size <<
432 "\nunit_mask: "<< unit_mask <<
433 "\nprefix_mask: "<< prefix_mask <<
434 "\nsufffix_mask: "<< suffix_mask );
446prefix <<= (2*suffix_size);
450it != input_list.end(); ++it )
468 for(
Uint4 i( 0 );
i< length; ++
i) {
477unit = ((unit<<2)&unit_mask) +
letter(
data[
i] );
483 if( unit <= runit && (unit&prefix_mask) == prefix )
485 auto& c( counts[unit&suffix_mask] );
487 if( c < 0xffffffffUL )
494 if( runit <= unit && (runit&prefix_mask) == prefix )
496 auto& c( counts[runit&suffix_mask] );
498 if( c < 0xffffffffUL )
527 for(
Uint8 i( 0 );
i< vector_size; ++
i)
529 Uint4u( prefix +
i), ru( 0 );
531 if( counts[
i] > 0 )
User-defined methods of the data storage class.
Factory of CSeqMaskerOstat objects.
void setComment(const string &msg)
Add a comment to the unit counts file.
void SetCount(Uint4 count, double pct)
void SetMaxCount(Uint4 mc)
void setUnitCount(Uint4 unit, Uint4 count)
Add count value for a particular unit.
void finalize()
Perform any final tasks required to generate unit counts in the particular format.
void setParam(const string &name, Uint4 value)
Set a value of a WindowMasker parameter.
void setUnitSize(Uint1 us)
Set the unit size value.
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
Reverse complement of a unit.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
Exceptions that CWinMaskCountsGenerator may throw.
@ eNullGenome
Genome has 0 size.
virtual const char * GetErrCodeString() const override
Return description string corresponding to an error code.
~CWinMaskCountsGenerator()
Object destructor.
vector< Uint4 > score_counts
void process(Uint4 prefix, Uint1 prefix_size, const vector< string > &input, bool do_output)
CRef< CSeqMaskerOstat > ustat
const CWinMaskUtil::CIdSet * ids
void operator()()
This function does the actual n-mer counting.
Uint8 fastalen(const string &fname) const
CWinMaskCountsGenerator(const string &input, const string &output, const string &infmt, const string &sformat, const string &th, Uint4 mem_avail, Uint1 unit_size, Uint8 genome_size, Uint4 min_count, Uint4 max_count, bool check_duplicates, bool use_list, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids, bool use_ba, string const &metadata, double min_pct=-1.0, double extend_pct=-1.0, double thres_pct=-1.0, double max_pct=-1.0)
Constructor.
const CWinMaskUtil::CIdSet * exclude_ids
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
Function iterating over bioseqs in input.
static bool consider(const objects::CBioseq_Handle &bsh, const CIdSet *ids, const CIdSet *exclude_ids)
Check if the given bioseq should be considered for processing.
static SQLCHAR output[256]
unsigned int TSeqPos
Type for sequence locations and lengths.
#define NCBI_ASSERT(expr, mess)
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
TSeqPos GetBioseqLength(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
const TSeq & GetSeq(void) const
Get the variant data.
@ e_Iupacna
IUPAC 1 letter nuc acid code.
const struct ncbi::grid::netcache::search::fields::SIZE size
CRef< objects::CObjectManager > om
void CheckDuplicates(const vector< string > &input, const string &infmt, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids)
Check for possibly duplicate sequences in the input.
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
static Uint4 letter(char c)
static bool ambig(char c)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4