seq_len_threshold)
88 if(GetCurrentPos(eRawPos) < m_SeqLenThreshold) {
89 _ASSERT( (TestFlag(fAssumeNuc) ^ TestFlag(fAssumeProt) ) );
90SetCurrentSeq().SetInst().SetMol(TestFlag(fAssumeNuc)
121 boolretrieve_seq_data,
122 unsigned intseqlen_thresh2guess,
133 if( !line.empty() &&
isalnum(line.data()[0]&0xff) ) {
139 id.Reset(
new CSeq_id(line));
143retval->
SetSeq(*bioseq);
152}
catch(
constexception&) {
193 "Empty SeqID passed to the molecule type validation");
200 "GI/accession/sequence mismatch: protein input required but nucleotide provided");
205 "GI/accession/sequence mismatch: nucleotide input required but protein provided");
212 stringmessage =
"No sequence available for "+
id->AsFastaString();
248 "GI/accession/sequence mismatch: protein input required but nucleotide provided");
294m_LineReader(iconfig.GetConvertGapsToNs() ?
297m_ReadProteins(iconfig.IsProteinInput())
305m_ReadProteins(iconfig.IsProteinInput())
307 if(user_input.empty()) {
309 "No sequence input was provided");
331 const char* env_var = getenv(
"BLASTINPUT_GEN_DELTA_SEQ");
332 if(env_var ==
NULL|| (env_var &&
string(env_var) ==
kEmptyStr)) {
412 "Nucleotide FASTA provided for protein sequence");
415 "Protein FASTA provided for nucleotide sequence");
428 "Cannot assign nucleotide strand to protein sequence");
446 if(to > 0 && to < from) {
448 "Invalid sequence range");
452 "Invalid from coordinate (greater than sequence length)");
459retval->
SetInt().SetFrom(from);
460retval->
SetInt().SetTo((to > 0 && to < seqlen) ? to : (seqlen-1));
475 SSeqLocretval(seqloc, &scope);
477retval.
mask= lcase_mask;
495 const boolapply_mask_to_both_strands =
true;
499program, apply_mask_to_both_strands);
514m_ParseSeqIds(
false)
527 if(line[0] !=
'>') {
529 "defline expected");
543m_ParseSeqIds(
false)
550 "used with two input files");
564 if(line[0] !=
'>') {
566 "defline expected");
570++(*m_SecondLineReader);
574 if(line[0] !=
'>') {
576 "defline expected");
626 if(it->IsUser() && it->GetUser().GetType().GetStr() ==
"Mapping") {
632 if(seqdesc.
Empty()) {
657 "format x_ReadFastaOrFastq read either FASTA or FASTQ");
674 "format x_ReadFastaOrFastq read either FASTA or " 678 if(
first.NotEmpty()) {
687 if(
first.NotEmpty()) {
696 if(
first.NotEmpty()) {
735 if(line[0] !=
'>') {
737(
string)
"Missing defline before line: "+
745(
string)
"No sequence data for defline: "+
id+
746 "\nTruncated file?");
758(
string)
"No sequence data for defline: "+ line);
763 size_tp = line.
find(
'>');
767(
string)
"FASTC parse error: Sequence separator '><'" 768 " was not found in line: "+
774 char* second = (
char*)line.
data() + p + 2;
775 size_tfirst_len = p;
776 size_tsecond_len = line.
length() - p - 2;
781bioseq.
SetId().clear();
785bioseq.
SetId().push_back(seqid);
790bioseq.
SetDescr().Set().push_back(title);
796 first[first_len] = 0;
798bioseq.
SetDescr().Set().push_back(seqdesc_first);
801bioseq_set.
SetSeq_set().push_back(seq_entry);
807bioseq.
SetId().clear();
811bioseq.
SetId().push_back(seqid);
816bioseq.
SetDescr().Set().push_back(title);
822second[second_len] = 0;
824bioseq.
SetDescr().Set().push_back(seqdesc_last);
827bioseq_set.
SetSeq_set().push_back(seq_entry);
842line = **line_reader;
843 while(line[0] !=
'>') {
846 if(line.
empty() && !line_reader->
AtEOF()) {
848line = **line_reader;
864 if(line_reader->
AtEOF()) {
870line = **line_reader;
877bioseq.
SetId().clear();
881bioseq.
SetId().push_back(seqid);
887bioseq.
SetDescr().Set().push_back(title);
892bioseq.
SetInst().SetLength(start);
910 boolempty_sequence =
false;
914line = **line_reader;
917 while(!line_reader->
AtEOF() && line.
empty()) {
919line = **line_reader;
922 if(line[0] !=
'@') {
924 " defline expected at line: "+
932line = **line_reader;
934 while(!line_reader->
AtEOF() && line.
empty()) {
936line = **line_reader;
940 if(line.
length() > 0) {
943bioseq.
SetId().clear();
947bioseq.
SetId().push_back(seqid);
953bioseq.
SetDescr().Set().push_back(title);
960 if(line[0] ==
'+') {
961bioseq.
SetInst().SetLength(0);
963empty_sequence =
true;
974 if(!empty_sequence) {
977line = **line_reader;
979 while(!line_reader->
AtEOF() && line.
empty()) {
981line = **line_reader;
985 if(line[0] !=
'+') {
987 " defline expected at line: "+
991 if(!empty_sequence) {
994line = **line_reader;
996 if(!line.
empty()) {
1002 while(!line_reader->
AtEOF() && line.
empty()) {
1004line = **line_reader;
1018 "used with two files");
1033 if(
first.NotEmpty()) {
1041 if(
first.NotEmpty()) {
1057 size_tend = line.
find(
' ', 1);
static CUser_object & s_SetSeqdescUser(CSeq_entry &entry)
Interface for reading SRA sequences into blast input.
Auxiliary classes/functions for BLAST input library.
bool HasRawSequenceData(const objects::CBioseq &bioseq)
Returns true if the Bioseq passed as argument has the full, raw sequence data in its Seq-inst field.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definitions and functions associated with the BlastQueryInfo structure.
@ eFirstSegment
The first sequence of a pair with both sequences read and accepted.
Auxiliary class for creating Bioseqs given SeqIds.
CRef< CBioseq > CreateBioseqFromId(CConstRef< CSeq_id > id, bool retrieve_seq_data)
Creates a Bioseq given a SeqId.
bool IsProtein(CConstRef< CSeq_id > id)
Checks the molecule type of the Bioseq identified by the given SeqId.
bool HasSequence(CConstRef< CSeq_id > id)
Checks whether the Bioseq actually contains sequence.
CRef< ILineReader > m_LineReader
interface to read lines
AutoPtr< CFastaReader > m_InputReader
Reader of FASTA sequences or identifiers.
bool m_ReadProteins
read protein sequences?
CBlastFastaInputSource(CNcbiIstream &infile, const CBlastInputSourceConfig &iconfig)
Constructor.
virtual CRef< CBlastSearchQuery > GetNextSequence(CScope &scope)
Retrieve a single sequence (in a CBlastSearchQuery container)
CRef< objects::CSeq_loc > x_FastaToSeqLoc(CRef< objects::CSeq_loc > &lcase_mask, CScope &scope)
Read a single sequence from file and convert to a Seq_loc.
void x_InitInputReader()
Initialization method for the input reader.
virtual bool End()
Signal whether there are any unread sequences left.
CBlastInputSourceConfig m_Config
Configuration for the sequences to be read.
virtual SSeqLoc GetNextSSeqLoc(CScope &scope)
Retrieve a single sequence (in an SSeqLoc container)
Class to read non-FASTA sequence input to BLAST programs using the various data loaders configured in...
CBlastInputReader(const SDataLoaderConfig &dlconfig, bool read_proteins, bool retrieve_seq_data, unsigned int seqlen_thresh2guess, ILineReader &reader, CFastaReader::TFlags flags)
Constructor.
void x_ValidateMoleculeType(CConstRef< CSeq_id > id)
Performs sanity checks to make sure that the sequence requested is of the expected type.
bool m_ReadProteins
True if we're supposed to be reading proteins, else false.
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener)
Overloaded method to attempt to read non-FASTA input types.
CRef< CBlastScopeSource > GetQueryScopeSource() const
Retrieves the CBlastScopeSource object used to fetch the query sequence(s) if these were provided as ...
bool m_RetrieveSeqData
True if the sequence data must be fetched.
CRef< CBioseq > x_CreateBioseq(CRef< CSeq_id > id)
Auxiliary function to create a Bioseq given a CSeq_id ready to be added to a BlastObject,...
CRef< CBlastBioseqMaker > m_BioseqMaker
The object that creates Bioseqs given SeqIds.
const SDataLoaderConfig & m_DLConfig
Configuration options for the CBlastScopeSource.
CRef< CBlastScopeSource > m_QueryScopeSource
The source of CScope objects to fetch sequences if given by Seq-id.
Class that centralizes the configuration data for sequences to be converted.
TSeqRange GetRange() const
Get range for all sequences.
const string & GetLocalIdPrefix() const
Retrieve the custom prefix string used for generating local ids.
objects::ENa_strand GetStrand() const
Retrieve the current strand value.
int GetLocalIdCounterInitValue() const
Retrieve the local id counter initial value.
const SDataLoaderConfig & GetDataLoaderConfig()
Retrieve the data loader configuration object for read-only access.
bool GetBelieveDeflines() const
Retrieve current sequence ID parsing status.
unsigned int GetSeqLenThreshold2Guess() const
Retrieve the sequence length threshold to guess the molecule type.
bool GetSkipSeqCheck() const
Retrieve status of sequence alphabet validation.
bool GetLowercaseMask() const
Retrieve lowercase mask status.
bool RetrieveSeqData() const
True if the sequence data must be fetched.
Class whose purpose is to create CScope objects which have data loaders added with different prioriti...
void AddDataLoaders(CRef< objects::CScope > scope)
Add the data loader configured in the object to the provided scope.
CRef< objects::CScope > NewScope()
Create a new, properly configured CScope.
CFastaReader-derived class which contains customizations for processing BLAST sequence input.
virtual void AssignMolType(ILineErrorListener *pMessageListener)
Override logic for assigning the molecule type.
CCustomizedFastaReader(ILineReader &reader, CFastaReader::TFlags flags, unsigned int seq_len_threshold)
Constructor.
virtual void x_CloseGap(TSeqPos, bool, ILineErrorListener *)
Override this method to force the parent class to ignore gaps.
unsigned int m_SeqLenThreshold
Sequence length threshold for molecule type guessing.
Base class for reading FASTA sequences.
Defines user input exceptions.
@ eSequenceMismatch
Sequence type isn't what was expected.
Simple implementation of ILineReader for regions of memory (such as memory-mapped files).
CRef< CSeq_id > x_GetNextSeqId(void)
EInputFormat m_Format
Input format: FASTA, FASTQ, FASTC.
TSeqPos m_BasesAdded
Number of bases added so far.
bool m_ParseSeqIds
Should defline ids be used for Bioseq objects.
virtual int GetNextSequence(CBioseq_set &bioseq_set)
Get one sequence (or a pair for NGS reads)
CRef< CSeq_entry > x_ReadFastqOneSeq(CRef< ILineReader > line_reader)
Read one sequence from a FASTQ file.
CRef< ILineReader > m_SecondLineReader
CShortReadFastaInputSource(CNcbiIstream &infile, EInputFormat format=eFasta, bool paired=false)
unsigned int m_Id
A counter for generating local ids.
void x_ReadFastaOrFastq(CBioseq_set &bioseq_set)
Read sequences in FASTA or FASTQ format.
bool x_ReadFromTwoFiles(CBioseq_set &bioseq_set, EInputFormat format)
Read sequences from two FASTA or FASTQ files (for paired reads)
void x_ReadFastc(CBioseq_set &bioseq_set)
Read sequences in FASTC format: defline, new line, a pair of sequences on a single line separated by ...
CRef< CSeq_entry > x_ReadFastaOneSeq(CRef< ILineReader > line_reader)
Read one sequence from a FASTA file.
CRef< ILineReader > m_LineReader
EInputFormat
Input formats.
CTempString x_ParseDefline(CTempString &line)
TSeqPos m_SeqBuffLen
string::capacity() can be used instead
bool m_IsPaired
Are paired sequences in the input.
Stream line reader that converts gaps to Ns before returning each line.
CTempString operator*(void) const
Return the current line, minus its terminator.
CStreamLineReaderConverter & operator++(void)
Make a line available.
CStreamLineReaderConverter(CNcbiIstream &instream)
Simple implementation of ILineReader for i(o)streams.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Template class for iteration on objects of class C (non-modifiable version)
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
@ eProblem_ModifierFoundButNoneExpected
@ eProblem_TooManyAmbiguousResidues
Abstract base class for lightweight line-by-line reading.
Collection of masked regions for a single query sequence.
bool Empty(const CNcbiOstrstream &src)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
TMaskedQueryRegions PackedSeqLocToMaskedQueryRegions(CConstRef< objects::CSeq_loc > sloc, EBlastProgramType program, bool assume_both_strands=false)
Auxiliary function to convert a Seq-loc describing masked query regions to a TMaskedQueryRegions obje...
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
unsigned int TSeqPos
Type for sequence locations and lengths.
element_type * get(void) const
Get pointer.
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
CSeqIdGenerator & SetIDGenerator(void)
long TFlags
binary OR of EFlags
CStreamLineReader & operator++(void)
Make a line available.
virtual void UngetLine(void)=0
Unget current line, which must be valid.
CRef< CSeq_loc > SaveMask(void)
Directs the *following* call to ReadOneSeq to note the locations of lowercase letters.
CTempString operator*(void) const
Return the current line, minus its terminator.
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
virtual void AssignMolType(ILineErrorListener *pMessageListener)
ILineReader & GetLineReader(void)
void IgnoreProblem(ILineError::EProblem problem)
@ fNoParseID
Generate an ID (whole defline -> title)
@ fQuickIDCheck
Just check local IDs' first characters.
@ fDLOptional
Don't require a leading defline.
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
@ fSkipCheck
Skip (rudimentary) body content check.
@ fDisableNoResidues
If no residues found do not raise an error.
@ fParseRawID
Try to identify raw accessions.
@ fNoSplit
Don't split out ambiguous sequence regions.
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
static int BestRank(const CRef< CSeq_id > &id)
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty -- pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty -- not pointing to any object, which means having a null value.
static TThisType GetEmpty(void)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
const char * data(void) const
Return a pointer to the array represented.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
size_type length(void) const
Return the length of the represented array.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
static const size_type npos
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetType(TType &value)
Assign a value to Type data member.
bool IsLocal(void) const
Check if variant Local is selected.
@ eNa_strand_both
in forward orientation
@ e_not_set
No variant selected.
const TSeq & GetSeq(void) const
Get the variant data.
TSeq & SetSeq(void)
Select the variant.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
TId & SetId(void)
Assign a value to Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TTitle & SetTitle(void)
Select the variant.
TLength GetLength(void) const
Get the Length member data.
void SetInst(TInst &value)
Assign a value to Inst data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TUser & SetUser(void)
Select the variant.
@ eRepr_raw
continuous sequence
@ eMol_na
just a nucleic acid
The blob sat and sat key. Both must be positive integers. Non-empty string. The interpretation of the blob id depends on a processor. The Cassandra processor expects the following format
Configuration structure for the CBlastScopeSource.
bool UseDataLoaders() const
Determine whether either of the data loaders should be used.
Structure to represent a single sequence to be fed to BLAST.
CRef< objects::CSeq_loc > mask
Seq-loc describing regions to mask in the seqloc field Acceptable types of Seq-loc are Seq-interval a...
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4