=
"Low complexity region masker based on Symmetric DUST algorithm";
84arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
86arg_desc->AddDefaultKey(
kInput,
"input_file_name",
89arg_desc->AddDefaultKey(
kOutput,
"output_file_name",
92arg_desc->AddDefaultKey(
"window",
"window_size",
93 "DUST window length",
95arg_desc->AddDefaultKey(
"level",
"level",
96 "DUST level (score threshold for subwindows)",
98arg_desc->AddDefaultKey(
"linker",
"linker",
99 "DUST linker (how close masked intervals " 100 "should be to get merged together).",
102arg_desc->AddDefaultKey(
kInputFormat,
"input_format",
103 "input format (possible values: fasta, blastdb)",
108arg_desc->AddFlag (
"parse_seqids",
109 "Parse Seq-ids in FASTA input",
true);
110arg_desc->AddFlag (
"hard_masking",
111 "Use hard masking for fasta outfmt",
true);
116strings_allowed->
Allow(
"acclist");
129 if(args[
"hard_masking"].AsBoolean() && (
format!=
"fasta")) {
130 throwruntime_error(
"Hard masking can only be applied for fasta output");
133 if(
format==
"interval") {
136}
else if(
format==
"acclist") {
139}
else if(
format==
"fasta") {
141 boolhard_masking = args[
"hard_masking"].AsBoolean();
163 throwruntime_error(
"Unknown output format");
173 if(
format==
"fasta") {
176input_stream,
true, args[
"parse_seqids"] );
178 else if(
format==
"blastdb") {
182 throwruntime_error(
"Unknown input format");
192 unsigned intpos = 0;
194 for(objects::CSeqVector_CI itr=seq.begin();itr!=seq.end(); ++itr) {
200 if((Ns > MAX_Ns ) || (pos == 0)) {
202NsRange.push_back(
r);
213NsRange.push_back(
r);
220 if((!list.empty()) && (list.back().second + linker == new_mask.first)) {
221list.back().second = new_mask.second;
225list.push_back(new_mask);
228std::unique_ptr< CSymDustMasker::TMaskList >
234 if(NsRange.empty()){
240 if(itr->first == 0) {
241seq_start = itr->second + 1;
246std::unique_ptr< CSymDustMasker::TMaskList > s_mask = duster(seq, seq_start, itr->first -1);
247 if(s_mask->size() > 0) {
249 if( s_mask->size() > 1) {
250rv->insert(rv->end(), ++(s_mask->begin()), s_mask->end());
257seq_start = itr->second + 1;
260 if(seq_start < seq.size()){
261std::unique_ptr< CSymDustMasker::TMaskList > s_mask = duster(seq, seq_start, seq.size() -1);
262 if(s_mask->size() > 0) {
264 if( s_mask->size() > 1) {
265rv->insert(rv->end(), ++(s_mask->begin()), s_mask->end());
299 for( ; bs_iter; ++bs_iter)
310writer->Print(bsh, *res,
GetArgs()[
"parse_seqids"] );
315output_stream << flush;
User-defined methods of the data storage class.
virtual int Run(void)
Run the application.
CMaskReader * x_GetReader()
CMaskWriter * x_GetWriter()
virtual void Init(void)
Initialize the application.
static const char *const USAGE_LINE
Class for reading sequences from BLAST databases.
Class for reading sequences from fasta files.
Virtual base class for all input readers.
virtual CRef< objects::CSeq_entry > GetNextSequence()=0
Read the next sequence from the source stream.
Output filter to print masked sequence locations as Blast-db-mask-info objects.
Output filter to write masked data in fasta format.
Output filter to print masked sequences as sets of intervals.
Output filter to print masked sequence locations as NCBI Seq-loc objects.
Output filter to print masked sequences as sets of intervals one per line.
A base class for winmasker output writers.
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
std::pair< size_type, size_type > TMaskedInterval
Type representing an interval selected for masking.
sequence_type::size_type size_type
Integer size type corresponding to sequence_type.
std::vector< TMaskedInterval > TMaskList
Type representing a list of masked intervals.
std::unique_ptr< CSymDustMasker::TMaskList > GetDustMasks_SkipNs(objects::CSeqVector &seq, Uint4 level, Uint4 window, Uint4 linker)
void s_InsertMerge(CSymDustMasker::TMaskList &list, CSymDustMasker::TMaskedInterval &new_mask, Uint4 linker)
CSymDustMasker::TMaskList s_FindSegmentWithLongNs(const unsigned int MAX_Ns, objects::CSeqVector &seq)
Operators to edit gaps in sequences.
static SQLCHAR output[256]
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
unsigned int TSeqPos
Type for sequence locations and lengths.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
@ eInputFile
Name of file (must exist and be readable)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
@ fBinary
Open file in binary mode.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
@ eBlast_filter_program_dust
@ eMol_na
just a nucleic acid
The blob sat and sat key. Both must be positive integers. Non-empty string. The interpretation of the blob id depends on a processor. The Cassandra processor expects the following format
Lightweight interface for getting lines of data with minimal memory copying.
Contains the command line options common to filtering algorithms.
const char * kOutputFormats[]
Output formats allowed, the first one is the default.
const size_t kNumOutputFormats
Number of elements in kOutputFormats.
const std::string kOutput
Command line flag to specify the output.
const std::string kOutputFormat
Command line flag to specify the output format.
const char * kInputFormats[]
Input formats allowed, the first one is the default.
const std::string kInput
Command line flag to specify the input.
const std::string kInputFormat
Command line flag to specify the input format.
string BuildAlgorithmParametersString(const CArgs &args)
Builds an algorithm options string for the filtering applications (segmasker, dustmasker) by examinin...
NCBI C++ auxiliary debug macros.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
CRef< objects::CObjectManager > om
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4