bAccnIsProtOnly = (
107 if( bAccnIsProtOnly || ! bSeqIdIsFound )
113 const boolbLocalSeqIdIsfound = (
115 if( bLocalSeqIdIsfound ) {
117cerr <<
"Warning: '"<<
str<<
"' was used as an accession, " 118 "so the local component was ignored."<< endl;
131CCustomAgpToSeqEntry(
CScope* pScope)
139 returns_CustomGetSeqIdFromStr(
str, m_pScope.GetPointer());
160 conststd::list<std::string> & files,
172list<string> compAndObjFiles;
173list<string> agpFiles;
174 ITERATE( std::list<std::string>, file_iter, files ) {
175 const string&
file= *file_iter;
180compAndObjFiles.push_back(
file);
183agpFiles.push_back(
file);
188 if( ! loadlog.empty() ) {
193 if( ! agp_as_fasta_file.empty() ) {
210 COMP_LOG(
"Component seq-id from AGP file(s): " 211<< seq_id_it->AsString());
214 COMP_LOG(
"Object seq-id from AGP file(s): " 215<< seq_id_it->AsString());
221unique_ptr<CTmpFile> ldsdb_file;
234lds_mgr->SetFastaFlags(fasta_flags);
236list<string> objfiles;
237 ITERATE( list<string>, file_iter, compAndObjFiles ) {
242 COMP_LOG(
"Object file: "<< *file_iter);
243objfiles.push_back(*file_iter);
247ifstream file_strm( file_iter->c_str() );
257 SIZE_TYPEafter_seq_id_pos = line.find_first_of(
" \t");
258 if( after_seq_id_pos == string::npos ) {
259after_seq_id_pos = line.length();
261 stringacc_long = line.substr(1, (after_seq_id_pos - 1));
265 COMP_LOG(
"Sample accession from "<< *file_iter
267 if( compSeqIds.
find(acc_h) != compSeqIds.
end() ) {
270 COMP_LOG(
"Component file: "<< *file_iter);
271lds_mgr->AddDataFile( *file_iter );
273}
else if( objSeqIds.
find(acc_h) != objSeqIds.
end() ) {
275 COMP_LOG(
"Object file: "<< *file_iter);
276objfiles.push_back(*file_iter);
284cerr <<
"Warning: This file seems to be unused: '" 285<< *file_iter <<
"'"<< endl;
288lds_mgr->UpdateData();
300unique_ptr<CTmpSeqVecStorage> temp_dir;
301 if( diffs_to_find > 0 ) {
307 if( agpFiles.empty() ) {
308cerr <<
"error: could not find any agp files"<< endl;
315 if( objfiles.empty() ) {
316cerr <<
"error: could not find any obj files"<< endl;
340 for( ; iter1 != iter1_end && iter2 != iter2_end; ) {
341 if(iter1->first < iter2->first) {
342 copy( iter1->second.begin(), iter1->second.end(),
343inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.
begin() ) );
346 else if(iter2->first < iter1->first) {
347 copy( iter2->second.begin(), iter2->second.end(),
348inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.
begin() ) );
351 else if( iter1->second != iter2->second ) {
353set_difference( iter1->second.begin(), iter1->second.end(),
354iter2->second.begin(), iter2->second.end(),
355inserter(vSeqIdFASTAOnly,
356vSeqIdFASTAOnly.
begin() ) );
359set_difference( iter2->second.begin(), iter2->second.end(),
360iter1->second.begin(), iter1->second.end(),
361inserter(vSeqIdAGPOnly,
362vSeqIdAGPOnly.
begin() ) );
373 for( ; iter1 != iter1_end; ++iter1) {
374 copy( iter1->second.begin(), iter1->second.end(),
375inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.
begin() ) );
378 for( ; iter2 != iter2_end; ++iter2) {
379 copy( iter2->second.begin(), iter2->second.end(),
380inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.
begin() ) );
389 const boolbThereWereDifferences = (
390( ! vSeqIdFASTAOnly.
empty() &&
392( ! vSeqIdAGPOnly.
empty() &&
394 if( ! bThereWereDifferences ) {
397 if( bThereWereDifferences ) {
401 if( bThereWereDifferences && diffs_to_find > 0 &&
402! seqIdIntersection.
empty() )
414m_dir( x_GetTmpDir() )
417 throwstd::runtime_error(
"Temp dir already exists: "+
m_dir.
GetPath() );
421 throwstd::runtime_error(
"Could not create temp dir: "+
m_dir.
GetPath() );
427 if( ! m_dir.Remove() ) {
428cerr <<
"Warning: could not delete temporary dir " 429<< m_dir.GetPath() << endl;
439ofstream output_stream( GetFileName(
type, idh).c_str() );
445 intbytes_copied = 0;
446 for( ; iter != vec.
end(); ++iter, ++bytes_copied ) {
447 if( bytes_copied > 0 && (bytes_copied % 60) == 0 ) {
449output_stream <<
'\n';
451output_stream << *iter;
453output_stream << endl;
460std::stringstream file_name_strm;
465file_name_strm <<
"agp";
468file_name_strm <<
"obj";
473file_name_strm <<
"UNKNOWN";
477file_name_strm <<
'.';
482 const stringinitial_seq_id = idh.
AsString();
483std::stringstream final_seq_id;
484 ITERATE(
string, ch_iter, initial_seq_id) {
485 const unsigned charch = *ch_iter;
489final_seq_id <<
'_'<< setfill(
'0') << setw(3) << ch;
492file_name_strm << final_seq_id.str();
495 returnfile_name_strm.str();
500std::stringstream dir_strm;
505 returndir_strm.str();
510 int* in_out_pUniqueBioseqsLoaded,
511 int* in_out_pBioseqsSkipped,
515in_out_pUniqueBioseqsLoaded !=
NULL&&
516in_out_pBioseqsSkipped !=
NULL);
527 if( ! vec.
CanGetRange(0, bioseq_it->GetBioseqLength()) ) {
528 LOG_POST(
Error<<
" Skipping one: could not load due to error " 530 "(length issue or does not include range [1, " 531<< bioseq_it->GetBioseqLength() <<
"] or " 532 "doesn't exist) for "<< idh
533<<
" (though issue could be due to failure to resolve " 534 "one of the contigs. " 535 "Are all necessary components in GenBank or in files " 536 "specified on the command-line?).");
546 LOG_POST(
Error<<
" Skipping one: could not load due to error, " 547 "probably in AGP file, possibly a length issue, for " 549<<
"Raw technical information about error: "<< ex.
what() );
554 if( pDataOutFile !=
NULL) {
564 TKey key(
md5, bioseq_it->GetBioseqLength());
565pair<TSeqIdSet::iterator, bool> insert_result =
567 if( ! insert_result.second ) {
568 LOG_POST(
Error<<
" Error: skipping sequence with same name and values: "<< idh);
576os << setw(2) << setfill(
'0') <<
hex<< (
int)((
unsigned char)*
i);
580<<
" / "<<
key.second);
583++*in_out_pUniqueBioseqsLoaded;
586*in_out_pBioseqsSkipped = ( total - *in_out_pUniqueBioseqsLoaded);
596dataOutFile << '>
' << idh << endl; 598 const SIZE_TYPE data_len = data.length(); 599 SIZE_TYPE next_idx = 0; 600 for( ; next_idx < data_len ; next_idx += kFastaWidth ) { 601 SIZE_TYPE chars_to_copy = min( kFastaWidth, (data_len - next_idx) ); 602 dataOutFile.write( data.c_str() + next_idx, chars_to_copy ); 607 void CAgpFastaComparator::x_PrintDetailsOfLengthIssue( 608 CBioseq_Handle bioseq_h ) 610 const static string kBugInAgpFastaCompare( 611 " This is probably a bug in agp_fasta_compare: could not get " 612 "information on the bioseq with an error" ); 614 const CDelta_ext::Tdata *p_delta_data = NULL; 616 CScope &scope = bioseq_h.GetScope(); 618 p_delta_data = &bioseq_h.GetCompleteBioseq()->GetInst().GetExt().GetDelta().Get(); 620 if( p_delta_data == NULL ) { 621 LOG_POST(Error << kBugInAgpFastaCompare); 626 // put it in a reference to make it easier to work with 627 const CDelta_ext::Tdata &delta_data = *p_delta_data; 629 ITERATE( CDelta_ext::Tdata, delta_iter, delta_data ) { 630 if( (*delta_iter)->IsLiteral() ) { 634 const CSeq_interval & seq_int = (*delta_iter)->GetLoc().GetInt(); 636 const TSeqPos highest_pnt = 637 max( seq_int.GetFrom(), seq_int.GetTo() ); 638 CSeq_id_Handle seq_id_h = 639 CSeq_id_Handle::GetHandle(seq_int.GetId()); 641 CBioseq_Handle inner_bioseq_h; 643 inner_bioseq_h = scope.GetBioseqHandle(seq_id_h); 644 if( ! inner_bioseq_h ) { 645 LOG_POST(Error << " Couldn'tfind bioseq
for " 647 << ". Maybe you need to specify component
file(s).
" ); 648 } else if( ! inner_bioseq_h.IsSetInst_Length() ) { 649 LOG_POST(Error << "Could not get length of bioseq
for " 652 const TSeqPos bioseq_len = inner_bioseq_h.GetInst_Length(); 653 if( highest_pnt >= bioseq_len ) { 654 LOG_POST(Error << "For
" 656 << "length is
" << bioseq_len 657 << "but user tries to access the point
" 658 << (highest_pnt+1) ); // "+1
" because user sees 1-based 662 LOG_POST(Error << "Could not find bioseq
for " 664 << ". Maybe you need to specify component
file(s).
" ); 667 } catch(std::exception & ex) { 668 CNcbiOstrstream bioseq_strm; 669 bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq(); 670 LOG_POST(Error << kBugInAgpFastaCompare << ":
" 672 << "Raw technical information about
error:
" << Endl() 675 << "Bioseq
ASN.1:
" << (string)CNcbiOstrstreamToString(bioseq_strm) ); 678 CNcbiOstrstream bioseq_strm; 679 bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq(); 680 LOG_POST(Error << kBugInAgpFastaCompare << ":
" 681 << "(unknown
error)
" 682 << "Bioseq
ASN.1:
" << (string)CNcbiOstrstreamToString(bioseq_strm) ); 687 bool CAgpFastaComparator::x_GetCompAndObjSeqIds( 688 TSeqIdSet & out_compSeqIds, 689 TSeqIdSet & out_objSeqIds, 690 const std::list<std::string> & agpFiles ) 692 const static CTempString kDelim("\
t"); 694 const static CTempString kNotAGPErr( 697 // what is held in some of the AGP columns 698 const static int kObjSeqIdCol = 0; 699 const static int kCompTypeCol = 4; 700 const static int kCompSeqIdCol = 5; 701 const static int kMaxColUsed = kCompSeqIdCol; 703 vector<CTempString> vecLineTokens; 705 // for speed, we do the parsing ourselves with only very minimal 707 ITERATE( std::list<std::string>, file_iter, agpFiles ) { 708 ifstream file_strm(file_iter->c_str()); 710 while( NcbiGetline(file_strm, line, "\
r\
n") ) { 711 // skip comment lines 712 if( line.empty() || line[0] == '#' ) { 716 vecLineTokens.clear(); 717 NStr::Split(line, kDelim, vecLineTokens, 0); 719 // are there enough columns for an AGP file? 720 if( vecLineTokens.size() <= kMaxColUsed ){ 721 cerr << kNotAGPErr << *file_iter << endl; 726 CTempString sComponentType = vecLineTokens[kCompTypeCol]; 727 if( sComponentType.length() != 1 ) { 728 cerr << kNotAGPErr << *file_iter << endl; 731 const char chCompType = toupper(sComponentType[0]); 732 if( chCompType == 'N' || chCompType == 'U' ) 739 CRef<CSeq_id> objSeqId = s_CustomGetSeqIdFromStr( 740 vecLineTokens[kObjSeqIdCol], NULL); 741 out_objSeqIds.insert( 742 CSeq_id_Handle::GetHandle(*objSeqId)); 744 // get component Seq-id 745 CRef<CSeq_id> comp_seq_id = 746 s_CustomGetSeqIdFromStr( 747 vecLineTokens[kCompSeqIdCol], NULL); 748 out_compSeqIds.insert( 749 CSeq_id_Handle::GetHandle(*comp_seq_id) ); 756 void CAgpFastaComparator::x_ProcessObjects( 757 const list<string> & filenames, 758 TUniqueSeqs& fasta_ids, 759 CTmpSeqVecStorage *temp_dir ) 764 LOG_POST(Error << "Processing
object file(s)...
"); 765 COMP_LOG("Processing
object file(s)...
"); 766 ITERATE( list<string>, file_iter, filenames ) { 767 const string &filename = *file_iter; 769 CFormatGuess guesser( filename ); 770 const CFormatGuess::EFormat format = 771 guesser.GuessFormat(); 773 if( format == CFormatGuess::eFasta ) { 774 CNcbiIfstream file_istrm( filename.c_str(), ios::binary ); 775 CFastaReader reader(file_istrm, CFastaReader::fAddMods); 777 CRef<CSeq_entry> entry = reader.ReadOneSeq(); 779 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 780 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry); 781 x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL ); 783 temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh ); 786 } else if( format == CFormatGuess::eBinaryASN || 787 format == CFormatGuess::eTextASN ) 789 // see if it's a submit 790 CRef<CSeq_submit> submit( new CSeq_submit ); 792 CNcbiIfstream file_istrm( filename.c_str(), ios::binary ); 793 x_SetBinaryVsText( file_istrm, format ); 794 file_istrm >> *submit; 799 if( ! submit->IsEntrys() ) { 800 LOG_POST(Error << "Seq-submits must have
'entrys'.
"); 805 ITERATE( CSeq_submit::C_Data::TEntrys, entry_iter, 806 submit->GetData().GetEntrys() ) 808 const CSeq_entry &entry = **entry_iter; 810 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 811 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(entry); 812 x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL ); 814 temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh ); 820 CRef<CSeq_entry> entry( new CSeq_entry ); 822 CNcbiIfstream file_istrm( filename.c_str(), ios::binary ); 823 x_SetBinaryVsText( file_istrm, format ); 824 file_istrm >> *entry; 826 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 827 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry); 828 x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL ); 830 temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh ); 834 LOG_POST(Error << "Could not determine
formatof
" << filename 835 << ", best guess is:
" << CFormatGuess::GetFormatName(format) ); 840 catch(CObjReaderParseException & ex ) { 841 if( ex.GetErrCode() == CObjReaderParseException::eEOF ) { 842 // end of file; no problem 844 LOG_POST(Error << "Errorreading
object file:
" << ex.what() ); 849 catch (CException& ex ) { 850 LOG_POST(Error << "Errorreading
object file:
" << ex.what() ); 856 LOG_POST(Error << "Loaded
" << iNumLoaded << " object filesequence(s).
"); 857 if( iNumSkipped > 0 ) { 858 LOG_POST(Error << "Skipped
" << iNumSkipped << "FASTA sequence(s).
"); 863 void CAgpFastaComparator::x_ProcessAgps(const list<string> & filenames, 864 TUniqueSeqs& agp_ids, 865 CTmpSeqVecStorage *temp_dir ) 870 LOG_POST(Error << "Processing AGP...
"); 871 COMP_LOG("Processing AGP...
"); 873 CRef<CScope> pAgpToSeqEntryScope(new CScope(*CObjectManager::GetInstance())); 874 pAgpToSeqEntryScope->AddDefaults(); 876 ITERATE( list<string>, file_iter, filenames ) { 877 const string &filename = *file_iter; 878 CNcbiIfstream istr( filename.c_str() ); 880 CCustomAgpToSeqEntry agp_reader(pAgpToSeqEntryScope.GetPointer()); 881 int err_code = agp_reader.ReadStream( istr ); // loads entries 882 if( err_code != 0 ) { 883 LOG_POST(Error << "Error occurred reading AGP
file:
" 884 << agp_reader.GetErrorMessage() ); 888 ITERATE (vector< CRef<CSeq_entry> >, it, agp_reader.GetResult() ) { 889 CRef<CSeq_entry> entry = *it; 891 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 892 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry); 893 scope->AddDefaults(); 895 x_Process(seh, agp_ids, &iNumLoaded, &iNumSkipped, m_pAgpAsFastaFile.get() ); 897 temp_dir->WriteData( CTmpSeqVecStorage::eType_AGP, seh ); 902 LOG_POST(Error << "Loaded
" << iNumLoaded << "AGP sequence(s).
"); 903 if( iNumSkipped > 0 ) { 904 LOG_POST(Error << "Skipped
" << iNumSkipped << "AGP sequence(s).
"); 908 void CAgpFastaComparator::x_OutputDifferingSeqIds( 909 const TSeqIdSet & vSeqIdFASTAOnly, 910 const TSeqIdSet & vSeqIdAGPOnly, 911 TDiffsToHide diffs_to_hide, 912 TSeqIdSet & out_seqIdIntersection ) 914 // find the ones in both 916 vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(), 917 vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(), 918 inserter(out_seqIdIntersection, out_seqIdIntersection.begin()) ); 919 if( ! out_seqIdIntersection.empty() ) { 920 LOG_POST(Error << "These
" << out_seqIdIntersection.size() 921 << "differ between
object fileand AGP:
"); 922 ITERATE( TSeqIdSet, id_iter, out_seqIdIntersection ) { 923 LOG_POST(Error << " " << *id_iter); 927 // find the ones in FASTA only 928 TSeqIdSet vSeqIdTempSet; 930 vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(), 931 vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(), 932 inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) ); 933 if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_ObjfileOnly) ) { 934 LOG_POST(Error << "These
" << vSeqIdTempSet.size() 935 << "are
inObject
fileonly:
" << "\n" 936 << "(
Checkabove: were some AGP sequences skipped due
" 938 ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) { 939 LOG_POST(Error << " " << *id_iter); 943 // find the ones in AGP only 944 vSeqIdTempSet.clear(); 946 vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(), 947 vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(), 948 inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) ); 949 if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_AGPOnly) ) { 950 LOG_POST(Error << "These
" << vSeqIdTempSet.size() 951 << "are
inAGP only:
" << "\
n" 952 << "(
Checkabove: were some FASTA sequences skipped due
" 954 ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) { 955 LOG_POST(Error << " " << *id_iter); 960 void CAgpFastaComparator::x_CheckForDups( TUniqueSeqs & unique_ids, 961 const string & file_type ) 963 ITERATE( TUniqueSeqs, unique_id_iter, unique_ids ) { 964 const TSeqIdSet & id_set = unique_id_iter->second; 965 if( id_set.size() > 1 ) { 966 CNcbiOstrstream errmsg; 967 errmsg << "WARNING: Identical sequences
in " << file_type << ":
"; 968 ITERATE( TSeqIdSet, id_iter, id_set ) { 969 errmsg << " '" << *id_iter << "'"; 971 LOG_POST( Error << (string)CNcbiOstrstreamToString(errmsg) ); 976 void CAgpFastaComparator::x_OutputSeqDifferences( 978 const TSeqIdSet & seqIdIntersection, 979 CTmpSeqVecStorage & temp_dir ) 981 const static string kDiff = "/usr/bin/diff
"; 982 if( ! CExec::IsExecutable(kDiff) ) { 983 cerr << "No differences shown because cannot run
" << kDiff << endl; 987 const static string kAwk = "/usr/bin/awk
"; 988 if( ! CExec::IsExecutable(kAwk) ) { 989 cerr << "No differences shown because cannot run
" << kAwk << endl; 993 ITERATE( TSeqIdSet, id_iter, seqIdIntersection ) { 994 const CSeq_id_Handle & idh = *id_iter; 995 const string agp_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_AGP, idh ); 996 const string obj_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_Obj, idh ); 999 cout << "##### Comparing
" << idh << " forAGP (
'<') and Obj ('>'):" << endl;
1012std::stringstream cmd_strm;
1013cmd_strm << kDiff << " '" << agp_file << "' '" << obj_file << "' 2> /dev/
null| " << kAwk << " '
BEGIN{ max_lines =
" << diffs_to_find << "; left_seen = 0; right_seen = 0; }
" 1014 << "/^</ { left_seen += 1;
if( left_seen <= max_lines ) { print } }
" 1015 << "/^>/ { right_seen += 1;
if( right_seen <= max_lines ) { print } }
" 1016 << "/^[0-9]/ {
if( left_seen > right_seen ) { right_seen = left_seen }
else{ left_seen = right_seen }
if( left_seen >= max_lines && right_seen >= max_lines) {
exit} ; print }
" 1017 << "/^-/ { print }
'"; 1018 CExec::System( cmd_strm.str().c_str() ); 1022 void CAgpFastaComparator::x_SetBinaryVsText( CNcbiIstream & file_istrm, 1023 CFormatGuess::EFormat guess_format ) 1025 // set binary vs. text 1026 switch( guess_format ) { 1027 case CFormatGuess::eBinaryASN: 1028 file_istrm >> MSerial_AsnBinary; 1030 case CFormatGuess::eTextASN: 1031 file_istrm >> MSerial_AsnText; 1035 // a format where binary vs. text is irrelevant 1039 CAgpFastaComparator::EFileType CAgpFastaComparator::x_GuessFileType( const string & filename ) 1041 // To prevent us from reading huge files 1042 int iterations_remaining = 100; 1044 ifstream file_strm(filename.c_str()); 1047 // find first non-blank line 1048 while( file_strm && line.empty() && 1049 iterations_remaining-- > 0 ) 1051 // get line and trim it 1052 NcbiGetline(file_strm, line, "\r\n"); 1053 NStr::TruncateSpacesInPlace( line ); 1056 if( line.empty() ) { 1057 return eFileType_Unknown; 1060 if( line[0] == '>
' ) { 1061 return eFileType_FASTA; 1064 if( line.find("::=") != NPOS ) { 1065 return eFileType_ASN1; 1068 if( line[0] == '#
' ) { 1069 return eFileType_AGP; 1073 // did not use std::count because Sun WorkShop compiler defines it in 1074 // a non-standard way and this is cleaner than preprocessor directives 1075 ITERATE( string, str_iter, line ) { 1076 if( *str_iter == '\t' ) { 1080 if( num_tabs >= 7 ) { 1081 return eFileType_AGP; 1084 return eFileType_Unknown;User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Checksum and hash calculation classes.
void WriteData(EType type, objects::CSeq_entry_Handle seh)
string GetFileName(EType type, objects::CSeq_id_Handle idh)
EFileType x_GuessFileType(const string &filename)
bool x_GetCompAndObjSeqIds(TSeqIdSet &out_compSeqIds, TSeqIdSet &out_objSeqIds, const std::list< std::string > &agpFiles)
void x_CheckForDups(TUniqueSeqs &unique_ids, const string &file_type)
CAgpFastaComparator(void)
EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)
void x_Process(const objects::CSeq_entry_Handle seh, TUniqueSeqs &seqs, int *in_out_pUniqueBioseqsLoaded, int *in_out_pBioseqsSkipped, CNcbiOfstream *pDataOutFile)
unique_ptr< CNcbiOfstream > m_pLoadLogFile
void x_ProcessObjects(const list< string > &filenames, TUniqueSeqs &fasta_ids, CTmpSeqVecStorage *temp_dir)
void x_ProcessAgps(const list< string > &filenames, TUniqueSeqs &agp_ids, CTmpSeqVecStorage *temp_dir)
@ fDiffsToHide_ObjfileOnly
pair< string, TSeqPos > TKey
void x_OutputSeqDifferences(int diffs_to_find, const TSeqIdSet &seqIdIntersection, CTmpSeqVecStorage &temp_dir)
void x_PrintDetailsOfLengthIssue(objects::CBioseq_Handle bioseq_h)
bool x_IsLogFileOpen(void)
void x_OutputDifferingSeqIds(const TSeqIdSet &vSeqIdFASTAOnly, const TSeqIdSet &vSeqIdAGPOnly, TDiffsToHide diffs_to_hide, TSeqIdSet &out_seqIdIntersection)
void x_WriteDataAsFasta(CNcbiOfstream &dataOutFile, const objects::CSeq_id_Handle &idh, const std::string &data)
unique_ptr< CNcbiOfstream > m_pAgpAsFastaFile
This class is used to turn an AGP file into a vector of Seq-entry's.
static CRef< objects::CSeq_id > s_DefaultSeqIdFromStr(const std::string &str)
This is the default method used to turn strings into Seq-ids in AGP contexts.
virtual CRef< objects::CSeq_id > x_GetSeqIdFromStr(const std::string &str)
If you must change exactly how strings are turned into Seq-ids, you can override this in a subclass.
static CRef< objects::CSeq_id > s_LocalSeqIdFromStr(const std::string &str)
Turn a string into a local Seq-id (removing "lcl|" from the beginning if needed)
CChecksum – Checksum calculator.
Base class for reading FASTA sequences.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Argument-less loader - for compatibility only, unusable.
Class for managing LDS2 database and related data files.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
SeqVector related exceptions.
container_type::const_iterator const_iterator
const_iterator begin() const
const_iterator end() const
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator find(const key_type &key) const
const_iterator end() const
Operators to edit gaps in sequences.
static const char * str(char *buf, int n)
static void md5(const char *src, const char *out)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void AddLine(const char *line, size_t len)
void GetMD5Digest(unsigned char digest[16]) const
Return calculated MD5 digest.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
static string GetTmpDir(void)
Get temporary directory.
virtual bool Exists(void) const
Check if directory "dirname" exists.
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
bool Create(TCreateFlags flags=fCreate_Default) const
Create the directory using "dirname" passed in the constructor.
const string & GetPath(void) const
Get entry path.
long TFlags
binary OR of EFlags
@ fAddMods
Parse defline mods and add to SeqEntry.
@ fNoSeqData
Parse the deflines but skip the data.
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
EAccessionInfo
For IdentifyAccession (below)
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
const_iterator begin(void) const
const_iterator end(void) const
void Reset(void)
Reset reference object.
static TPid GetPid(void)
Get process identifier (pid) for the current process.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
const char * Endl(void)
Platform-specific EndOfLine.
NCBI_NS_STD::string::size_type SIZE_TYPE
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
@ eCurrent
Use current time. See also CCurrentTime.
bool IsLocal(void) const
Check if variant Local is selected.
@ eMol_na
just a nucleic acid
unsigned int
A callback function used to compare two keys in a database.
The blob sat and sat key. Both must be positive integers. Non-empty string. The interpretation of the blob id depends on a processor. Cassandra processor expects the following format
static void hex(unsigned char c)
const struct ncbi::grid::netcache::search::fields::KEY key
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines a portable execute class.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
CRef< objects::CObjectManager > om
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4