A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/AgpFastaComparator_8cpp_source.html below:

NCBI C++ ToolKit: src/app/agp_validate/AgpFastaComparator.cpp Source File

70 # error COMP_LOG was already defined 74 #define COMP_LOG(msg) \ 76  if( x_IsLogFileOpen() ) { \ 77  *m_pLoadLogFile << msg << endl; \ 103  const bool

bAccnIsProtOnly = (

107  if

( bAccnIsProtOnly || ! bSeqIdIsFound )

113  const bool

bLocalSeqIdIsfound = (

115  if

( bLocalSeqIdIsfound ) {

117

cerr <<

"Warning: '"

<<

str

<<

"' was used as an accession, " 118  "so the local component was ignored."

<< endl;

131

CCustomAgpToSeqEntry(

CScope

* pScope)

139  return

s_CustomGetSeqIdFromStr(

str

, m_pScope.GetPointer());

160  const

std::list<std::string> & files,

172

list<string> compAndObjFiles;

173

list<string> agpFiles;

174  ITERATE

( std::list<std::string>, file_iter, files ) {

175  const string

&

file

= *file_iter;

180

compAndObjFiles.push_back(

file

);

183

agpFiles.push_back(

file

);

188  if

( ! loadlog.empty() ) {

193  if

( ! agp_as_fasta_file.empty() ) {

210  COMP_LOG

(

"Component seq-id from AGP file(s): " 211

<< seq_id_it->AsString());

214  COMP_LOG

(

"Object seq-id from AGP file(s): " 215

<< seq_id_it->AsString());

221

unique_ptr<CTmpFile> ldsdb_file;

234

lds_mgr->SetFastaFlags(fasta_flags);

236

list<string> objfiles;

237  ITERATE

( list<string>, file_iter, compAndObjFiles ) {

242  COMP_LOG

(

"Object file: "

<< *file_iter);

243

objfiles.push_back(*file_iter);

247

ifstream file_strm( file_iter->c_str() );

257  SIZE_TYPE

after_seq_id_pos = line.find_first_of(

" \t"

);

258  if

( after_seq_id_pos == string::npos ) {

259

after_seq_id_pos = line.length();

261  string

acc_long = line.substr(1, (after_seq_id_pos - 1));

265  COMP_LOG

(

"Sample accession from "

<< *file_iter

267  if

( compSeqIds.

find

(acc_h) != compSeqIds.

end

() ) {

270  COMP_LOG

(

"Component file: "

<< *file_iter);

271

lds_mgr->AddDataFile( *file_iter );

273

}

else if

( objSeqIds.

find

(acc_h) != objSeqIds.

end

() ) {

275  COMP_LOG

(

"Object file: "

<< *file_iter);

276

objfiles.push_back(*file_iter);

284

cerr <<

"Warning: This file seems to be unused: '" 285

<< *file_iter <<

"'"

<< endl;

288

lds_mgr->UpdateData();

300

unique_ptr<CTmpSeqVecStorage> temp_dir;

301  if

( diffs_to_find > 0 ) {

307  if

( agpFiles.empty() ) {

308

cerr <<

"error: could not find any agp files"

<< endl;

315  if

( objfiles.empty() ) {

316

cerr <<

"error: could not find any obj files"

<< endl;

340  for

( ; iter1 != iter1_end && iter2 != iter2_end; ) {

341  if

(iter1->first < iter2->first) {

342  copy

( iter1->second.begin(), iter1->second.end(),

343

inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.

begin

() ) );

346  else if

(iter2->first < iter1->first) {

347  copy

( iter2->second.begin(), iter2->second.end(),

348

inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.

begin

() ) );

351  else if

( iter1->second != iter2->second ) {

353

set_difference( iter1->second.begin(), iter1->second.end(),

354

iter2->second.begin(), iter2->second.end(),

355

inserter(vSeqIdFASTAOnly,

356

vSeqIdFASTAOnly.

begin

() ) );

359

set_difference( iter2->second.begin(), iter2->second.end(),

360

iter1->second.begin(), iter1->second.end(),

361

inserter(vSeqIdAGPOnly,

362

vSeqIdAGPOnly.

begin

() ) );

373  for

( ; iter1 != iter1_end; ++iter1) {

374  copy

( iter1->second.begin(), iter1->second.end(),

375

inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.

begin

() ) );

378  for

( ; iter2 != iter2_end; ++iter2) {

379  copy

( iter2->second.begin(), iter2->second.end(),

380

inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.

begin

() ) );

389  const bool

bThereWereDifferences = (

390

( ! vSeqIdFASTAOnly.

empty

() &&

392

( ! vSeqIdAGPOnly.

empty

() &&

394  if

( ! bThereWereDifferences ) {

397  if

( bThereWereDifferences ) {

401  if

( bThereWereDifferences && diffs_to_find > 0 &&

402

! seqIdIntersection.

empty

() )

414

m_dir( x_GetTmpDir() )

417  throw

std::runtime_error(

"Temp dir already exists: "

+

m_dir

.

GetPath

() );

421  throw

std::runtime_error(

"Could not create temp dir: "

+

m_dir

.

GetPath

() );

427  if

( ! m_dir.Remove() ) {

428

cerr <<

"Warning: could not delete temporary dir " 429

<< m_dir.GetPath() << endl;

439

ofstream output_stream( GetFileName(

type

, idh).c_str() );

445  int

bytes_copied = 0;

446  for

( ; iter != vec.

end

(); ++iter, ++bytes_copied ) {

447  if

( bytes_copied > 0 && (bytes_copied % 60) == 0 ) {

449

output_stream <<

'\n'

;

451

output_stream << *iter;

453

output_stream << endl;

460

std::stringstream file_name_strm;

465

file_name_strm <<

"agp"

;

468

file_name_strm <<

"obj"

;

473

file_name_strm <<

"UNKNOWN"

;

477

file_name_strm <<

'.'

;

482  const string

initial_seq_id = idh.

AsString

();

483

std::stringstream final_seq_id;

484  ITERATE

(

string

, ch_iter, initial_seq_id) {

485  const unsigned char

ch = *ch_iter;

489

final_seq_id <<

'_'

<< setfill(

'0'

) << setw(3) << ch;

492

file_name_strm << final_seq_id.str();

495  return

file_name_strm.str();

500

std::stringstream dir_strm;

505  return

dir_strm.str();

510  int

* in_out_pUniqueBioseqsLoaded,

511  int

* in_out_pBioseqsSkipped,

515

in_out_pUniqueBioseqsLoaded !=

NULL

&&

516

in_out_pBioseqsSkipped !=

NULL

);

527  if

( ! vec.

CanGetRange

(0, bioseq_it->GetBioseqLength()) ) {

528  LOG_POST

(

Error

<<

" Skipping one: could not load due to error " 530  "(length issue or does not include range [1, " 531

<< bioseq_it->GetBioseqLength() <<

"] or " 532  "doesn't exist) for "

<< idh

533

<<

" (though issue could be due to failure to resolve " 534  "one of the contigs. " 535  "Are all necessary components in GenBank or in files " 536  "specified on the command-line?)."

);

546  LOG_POST

(

Error

<<

" Skipping one: could not load due to error, " 547  "probably in AGP file, possibly a length issue, for " 549

<<

"Raw technical information about error: "

<< ex.

what

() );

554  if

( pDataOutFile !=

NULL

) {

564  TKey key

(

md5

, bioseq_it->GetBioseqLength());

565

pair<TSeqIdSet::iterator, bool> insert_result =

567  if

( ! insert_result.second ) {

568  LOG_POST

(

Error

<<

" Error: skipping sequence with same name and values: "

<< idh);

576

os << setw(2) << setfill(

'0'

) <<

hex

<< (

int

)((

unsigned char

)*

i

);

580

<<

" / "

<<

key

.second);

583

++*in_out_pUniqueBioseqsLoaded;

586

*in_out_pBioseqsSkipped = ( total - *in_out_pUniqueBioseqsLoaded);

596

dataOutFile << '>

' << idh << endl; 598  const SIZE_TYPE data_len = data.length(); 599  SIZE_TYPE next_idx = 0; 600  for( ; next_idx < data_len ; next_idx += kFastaWidth ) { 601  SIZE_TYPE chars_to_copy = min( kFastaWidth, (data_len - next_idx) ); 602  dataOutFile.write( data.c_str() + next_idx, chars_to_copy ); 607 void CAgpFastaComparator::x_PrintDetailsOfLengthIssue( 608  CBioseq_Handle bioseq_h ) 610  const static string kBugInAgpFastaCompare( 611  " This is probably a bug in agp_fasta_compare: could not get " 612  "information on the bioseq with an error" ); 614  const CDelta_ext::Tdata *p_delta_data = NULL; 616  CScope &scope = bioseq_h.GetScope(); 618  p_delta_data = &bioseq_h.GetCompleteBioseq()->GetInst().GetExt().GetDelta().Get(); 620  if( p_delta_data == NULL ) { 621  LOG_POST(Error << kBugInAgpFastaCompare); 626  // put it in a reference to make it easier to work with 627  const CDelta_ext::Tdata &delta_data = *p_delta_data; 629  ITERATE( CDelta_ext::Tdata, delta_iter, delta_data ) { 630  if( (*delta_iter)->IsLiteral() ) { 634  const CSeq_interval & seq_int = (*delta_iter)->GetLoc().GetInt(); 636  const TSeqPos highest_pnt = 637  max( seq_int.GetFrom(), seq_int.GetTo() ); 638  CSeq_id_Handle seq_id_h = 639  CSeq_id_Handle::GetHandle(seq_int.GetId()); 641  CBioseq_Handle inner_bioseq_h; 643  inner_bioseq_h = scope.GetBioseqHandle(seq_id_h); 644  if( ! inner_bioseq_h ) { 645  LOG_POST(Error << " Couldn't

find bioseq

for " 647  << "

. Maybe you need to specify component

file

(s).

" ); 648  } else if( ! inner_bioseq_h.IsSetInst_Length() ) { 649  LOG_POST(Error << "

Could not get length of bioseq

for " 652  const TSeqPos bioseq_len = inner_bioseq_h.GetInst_Length(); 653  if( highest_pnt >= bioseq_len ) { 654  LOG_POST(Error << "

For

" 656  << "

length is

" << bioseq_len 657  << "

but user tries to access the point

" 658  << (highest_pnt+1) ); // "

+1

" because user sees 1-based 662  LOG_POST(Error << "

Could not find bioseq

for " 664  << "

. Maybe you need to specify component

file

(s).

" ); 667  } catch(std::exception & ex) { 668  CNcbiOstrstream bioseq_strm; 669  bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq(); 670  LOG_POST(Error << kBugInAgpFastaCompare << "

:

" 672  << "

Raw technical information about

error

:

" << Endl() 675  << "

Bioseq

ASN

.1:

" << (string)CNcbiOstrstreamToString(bioseq_strm) ); 678  CNcbiOstrstream bioseq_strm; 679  bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq(); 680  LOG_POST(Error << kBugInAgpFastaCompare << "

:

" 681  << "

(unknown

error

)

" 682  << "

Bioseq

ASN

.1:

" << (string)CNcbiOstrstreamToString(bioseq_strm) ); 687 bool CAgpFastaComparator::x_GetCompAndObjSeqIds( 688  TSeqIdSet & out_compSeqIds, 689  TSeqIdSet & out_objSeqIds, 690  const std::list<std::string> & agpFiles ) 692  const static CTempString kDelim("

\

t"); 694  const static CTempString kNotAGPErr( 697  // what is held in some of the AGP columns 698  const static int kObjSeqIdCol = 0; 699  const static int kCompTypeCol = 4; 700  const static int kCompSeqIdCol = 5; 701  const static int kMaxColUsed = kCompSeqIdCol; 703  vector<CTempString> vecLineTokens; 705  // for speed, we do the parsing ourselves with only very minimal 707  ITERATE( std::list<std::string>, file_iter, agpFiles ) { 708  ifstream file_strm(file_iter->c_str()); 710  while( NcbiGetline(file_strm, line, "

\

r

\

n") ) { 711  // skip comment lines 712  if( line.empty() || line[0] == '#' ) { 716  vecLineTokens.clear(); 717  NStr::Split(line, kDelim, vecLineTokens, 0); 719  // are there enough columns for an AGP file? 720  if( vecLineTokens.size() <= kMaxColUsed ){ 721  cerr << kNotAGPErr << *file_iter << endl; 726  CTempString sComponentType = vecLineTokens[kCompTypeCol]; 727  if( sComponentType.length() != 1 ) { 728  cerr << kNotAGPErr << *file_iter << endl; 731  const char chCompType = toupper(sComponentType[0]); 732  if( chCompType == 'N' || chCompType == 'U' ) 739  CRef<CSeq_id> objSeqId = s_CustomGetSeqIdFromStr( 740  vecLineTokens[kObjSeqIdCol], NULL); 741  out_objSeqIds.insert( 742  CSeq_id_Handle::GetHandle(*objSeqId)); 744  // get component Seq-id 745  CRef<CSeq_id> comp_seq_id = 746  s_CustomGetSeqIdFromStr( 747  vecLineTokens[kCompSeqIdCol], NULL); 748  out_compSeqIds.insert( 749  CSeq_id_Handle::GetHandle(*comp_seq_id) ); 756 void CAgpFastaComparator::x_ProcessObjects( 757  const list<string> & filenames, 758  TUniqueSeqs& fasta_ids, 759  CTmpSeqVecStorage *temp_dir ) 764  LOG_POST(Error << "

Processing

object file

(s)...

"); 765  COMP_LOG("

Processing

object file

(s)...

"); 766  ITERATE( list<string>, file_iter, filenames ) { 767  const string &filename = *file_iter; 769  CFormatGuess guesser( filename ); 770  const CFormatGuess::EFormat format = 771  guesser.GuessFormat(); 773  if( format == CFormatGuess::eFasta ) { 774  CNcbiIfstream file_istrm( filename.c_str(), ios::binary ); 775  CFastaReader reader(file_istrm, CFastaReader::fAddMods); 777  CRef<CSeq_entry> entry = reader.ReadOneSeq(); 779  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 780  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry); 781  x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL ); 783  temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh ); 786  } else if( format == CFormatGuess::eBinaryASN || 787  format == CFormatGuess::eTextASN ) 789  // see if it's a submit 790  CRef<CSeq_submit> submit( new CSeq_submit ); 792  CNcbiIfstream file_istrm( filename.c_str(), ios::binary ); 793  x_SetBinaryVsText( file_istrm, format ); 794  file_istrm >> *submit; 799  if( ! submit->IsEntrys() ) { 800  LOG_POST(Error << "

Seq-submits must have

'entrys'

.

"); 805  ITERATE( CSeq_submit::C_Data::TEntrys, entry_iter, 806  submit->GetData().GetEntrys() ) 808  const CSeq_entry &entry = **entry_iter; 810  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 811  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(entry); 812  x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL ); 814  temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh ); 820  CRef<CSeq_entry> entry( new CSeq_entry ); 822  CNcbiIfstream file_istrm( filename.c_str(), ios::binary ); 823  x_SetBinaryVsText( file_istrm, format ); 824  file_istrm >> *entry; 826  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 827  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry); 828  x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL ); 830  temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh ); 834  LOG_POST(Error << "

Could not determine

format

of

" << filename 835  << "

, best guess is:

" << CFormatGuess::GetFormatName(format) ); 840  catch(CObjReaderParseException & ex ) { 841  if( ex.GetErrCode() == CObjReaderParseException::eEOF ) { 842  // end of file; no problem 844  LOG_POST(Error << "Error

reading

object file

:

" << ex.what() ); 849  catch (CException& ex ) { 850  LOG_POST(Error << "Error

reading

object file

:

" << ex.what() ); 856  LOG_POST(Error << "

Loaded

" << iNumLoaded << " object file

sequence(s).

"); 857  if( iNumSkipped > 0 ) { 858  LOG_POST(Error << "

Skipped

" << iNumSkipped << "

FASTA sequence(s).

"); 863 void CAgpFastaComparator::x_ProcessAgps(const list<string> & filenames, 864  TUniqueSeqs& agp_ids, 865  CTmpSeqVecStorage *temp_dir ) 870  LOG_POST(Error << "

Processing AGP...

"); 871  COMP_LOG("

Processing AGP...

"); 873  CRef<CScope> pAgpToSeqEntryScope(new CScope(*CObjectManager::GetInstance())); 874  pAgpToSeqEntryScope->AddDefaults(); 876  ITERATE( list<string>, file_iter, filenames ) { 877  const string &filename = *file_iter; 878  CNcbiIfstream istr( filename.c_str() ); 880  CCustomAgpToSeqEntry agp_reader(pAgpToSeqEntryScope.GetPointer()); 881  int err_code = agp_reader.ReadStream( istr ); // loads entries 882  if( err_code != 0 ) { 883  LOG_POST(Error << "

Error occurred reading AGP

file

:

" 884  << agp_reader.GetErrorMessage() ); 888  ITERATE (vector< CRef<CSeq_entry> >, it, agp_reader.GetResult() ) { 889  CRef<CSeq_entry> entry = *it; 891  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance())); 892  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry); 893  scope->AddDefaults(); 895  x_Process(seh, agp_ids, &iNumLoaded, &iNumSkipped, m_pAgpAsFastaFile.get() ); 897  temp_dir->WriteData( CTmpSeqVecStorage::eType_AGP, seh ); 902  LOG_POST(Error << "

Loaded

" << iNumLoaded << "

AGP sequence(s).

"); 903  if( iNumSkipped > 0 ) { 904  LOG_POST(Error << "

Skipped

" << iNumSkipped << "

AGP sequence(s).

"); 908 void CAgpFastaComparator::x_OutputDifferingSeqIds( 909  const TSeqIdSet & vSeqIdFASTAOnly, 910  const TSeqIdSet & vSeqIdAGPOnly, 911  TDiffsToHide diffs_to_hide, 912  TSeqIdSet & out_seqIdIntersection ) 914  // find the ones in both 916  vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(), 917  vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(), 918  inserter(out_seqIdIntersection, out_seqIdIntersection.begin()) ); 919  if( ! out_seqIdIntersection.empty() ) { 920  LOG_POST(Error << "

These

" << out_seqIdIntersection.size() 921  << "

differ between

object file

and AGP:

"); 922  ITERATE( TSeqIdSet, id_iter, out_seqIdIntersection ) { 923  LOG_POST(Error << " " << *id_iter); 927  // find the ones in FASTA only 928  TSeqIdSet vSeqIdTempSet; 930  vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(), 931  vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(), 932  inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) ); 933  if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_ObjfileOnly) ) { 934  LOG_POST(Error << "

These

" << vSeqIdTempSet.size() 935  << "

are

in

Object

file

only:

" << "\n" 936  << "

(

Check

above: were some AGP sequences skipped due

" 938  ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) { 939  LOG_POST(Error << " " << *id_iter); 943  // find the ones in AGP only 944  vSeqIdTempSet.clear(); 946  vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(), 947  vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(), 948  inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) ); 949  if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_AGPOnly) ) { 950  LOG_POST(Error << "

These

" << vSeqIdTempSet.size() 951  << "

are

in

AGP only:

" << "

\

n" 952  << "

(

Check

above: were some FASTA sequences skipped due

" 954  ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) { 955  LOG_POST(Error << " " << *id_iter); 960 void CAgpFastaComparator::x_CheckForDups( TUniqueSeqs & unique_ids, 961  const string & file_type ) 963  ITERATE( TUniqueSeqs, unique_id_iter, unique_ids ) { 964  const TSeqIdSet & id_set = unique_id_iter->second; 965  if( id_set.size() > 1 ) { 966  CNcbiOstrstream errmsg; 967  errmsg << "

WARNING: Identical sequences

in " << file_type << "

:

"; 968  ITERATE( TSeqIdSet, id_iter, id_set ) { 969  errmsg << " '" << *id_iter << "'"; 971  LOG_POST( Error << (string)CNcbiOstrstreamToString(errmsg) ); 976 void CAgpFastaComparator::x_OutputSeqDifferences( 978  const TSeqIdSet & seqIdIntersection, 979  CTmpSeqVecStorage & temp_dir ) 981  const static string kDiff = "

/usr/bin/diff

"; 982  if( ! CExec::IsExecutable(kDiff) ) { 983  cerr << "

No differences shown because cannot run

" << kDiff << endl; 987  const static string kAwk = "

/usr/bin/awk

"; 988  if( ! CExec::IsExecutable(kAwk) ) { 989  cerr << "

No differences shown because cannot run

" << kAwk << endl; 993  ITERATE( TSeqIdSet, id_iter, seqIdIntersection ) { 994  const CSeq_id_Handle & idh = *id_iter; 995  const string agp_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_AGP, idh ); 996  const string obj_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_Obj, idh ); 999  cout << "

##### Comparing

" << idh << " for

AGP (

'<'

) and Obj ('>'):" << endl;

1012

std::stringstream cmd_strm;

1013

cmd_strm << kDiff << " '" << agp_file << "' '" << obj_file << "' 2> /dev/

null

| " << kAwk << " '

BEGIN

{ max_lines =

" << diffs_to_find << "

; left_seen = 0; right_seen = 0; }

" 1014  << "

/^</ { left_seen += 1;

if

( left_seen <= max_lines ) { print } }

" 1015  << "

/^>/ { right_seen += 1;

if

( right_seen <= max_lines ) { print } }

" 1016  << "

/^[0-9]/ {

if

( left_seen > right_seen ) { right_seen = left_seen }

else

{ left_seen = right_seen }

if

( left_seen >= max_lines && right_seen >= max_lines) {

exit

} ; print }

" 1017  << "

/^-/ { print }

'"; 1018  CExec::System( cmd_strm.str().c_str() ); 1022 void CAgpFastaComparator::x_SetBinaryVsText( CNcbiIstream & file_istrm, 1023  CFormatGuess::EFormat guess_format ) 1025  // set binary vs. text 1026  switch( guess_format ) { 1027  case CFormatGuess::eBinaryASN: 1028  file_istrm >> MSerial_AsnBinary; 1030  case CFormatGuess::eTextASN: 1031  file_istrm >> MSerial_AsnText; 1035  // a format where binary vs. text is irrelevant 1039 CAgpFastaComparator::EFileType CAgpFastaComparator::x_GuessFileType( const string & filename ) 1041  // To prevent us from reading huge files 1042  int iterations_remaining = 100; 1044  ifstream file_strm(filename.c_str()); 1047  // find first non-blank line 1048  while( file_strm && line.empty() && 1049  iterations_remaining-- > 0 ) 1051  // get line and trim it 1052  NcbiGetline(file_strm, line, "\r\n"); 1053  NStr::TruncateSpacesInPlace( line ); 1056  if( line.empty() ) { 1057  return eFileType_Unknown; 1060  if( line[0] == '

>

' ) { 1061  return eFileType_FASTA; 1064  if( line.find("::=") != NPOS ) { 1065  return eFileType_ASN1; 1068  if( line[0] == '

#

' ) { 1069  return eFileType_AGP; 1073  // did not use std::count because Sun WorkShop compiler defines it in 1074  // a non-standard way and this is cleaner than preprocessor directives 1075  ITERATE( string, str_iter, line ) { 1076  if( *str_iter == '\t' ) { 1080  if( num_tabs >= 7 ) { 1081  return eFileType_AGP; 1084  return eFileType_Unknown;

User-defined methods of the data storage class.

User-defined methods of the data storage class.

User-defined methods of the data storage class.

User-defined methods of the data storage class.

User-defined methods of the data storage class.

Checksum and hash calculation classes.

void WriteData(EType type, objects::CSeq_entry_Handle seh)

string GetFileName(EType type, objects::CSeq_id_Handle idh)

EFileType x_GuessFileType(const string &filename)

bool x_GetCompAndObjSeqIds(TSeqIdSet &out_compSeqIds, TSeqIdSet &out_objSeqIds, const std::list< std::string > &agpFiles)

void x_CheckForDups(TUniqueSeqs &unique_ids, const string &file_type)

CAgpFastaComparator(void)

EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)

void x_Process(const objects::CSeq_entry_Handle seh, TUniqueSeqs &seqs, int *in_out_pUniqueBioseqsLoaded, int *in_out_pBioseqsSkipped, CNcbiOfstream *pDataOutFile)

unique_ptr< CNcbiOfstream > m_pLoadLogFile

void x_ProcessObjects(const list< string > &filenames, TUniqueSeqs &fasta_ids, CTmpSeqVecStorage *temp_dir)

void x_ProcessAgps(const list< string > &filenames, TUniqueSeqs &agp_ids, CTmpSeqVecStorage *temp_dir)

@ fDiffsToHide_ObjfileOnly

pair< string, TSeqPos > TKey

void x_OutputSeqDifferences(int diffs_to_find, const TSeqIdSet &seqIdIntersection, CTmpSeqVecStorage &temp_dir)

void x_PrintDetailsOfLengthIssue(objects::CBioseq_Handle bioseq_h)

bool x_IsLogFileOpen(void)

void x_OutputDifferingSeqIds(const TSeqIdSet &vSeqIdFASTAOnly, const TSeqIdSet &vSeqIdAGPOnly, TDiffsToHide diffs_to_hide, TSeqIdSet &out_seqIdIntersection)

void x_WriteDataAsFasta(CNcbiOfstream &dataOutFile, const objects::CSeq_id_Handle &idh, const std::string &data)

unique_ptr< CNcbiOfstream > m_pAgpAsFastaFile

This class is used to turn an AGP file into a vector of Seq-entry's.

static CRef< objects::CSeq_id > s_DefaultSeqIdFromStr(const std::string &str)

This is the default method used to turn strings into Seq-ids in AGP contexts.

virtual CRef< objects::CSeq_id > x_GetSeqIdFromStr(const std::string &str)

If you must change exactly how strings are turned into Seq-ids, you can override this in a subclass.

static CRef< objects::CSeq_id > s_LocalSeqIdFromStr(const std::string &str)

Turn a string into a local Seq-id (removing "lcl|" from the beginning if needed)

CChecksum – Checksum calculator.

Base class for reading FASTA sequences.

static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)

static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)

Argument-less loader - for compatibility only, unusable.

Class for managing LDS2 database and related data files.

CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:

SeqVector related exceptions.

container_type::const_iterator const_iterator

const_iterator begin() const

const_iterator end() const

iterator_bool insert(const value_type &val)

const_iterator begin() const

const_iterator find(const key_type &key) const

const_iterator end() const

Operators to edit gaps in sequences.

static const char * str(char *buf, int n)

static void md5(const char *src, const char *out)

#define ITERATE(Type, Var, Cont)

ITERATE macro to sequence through container elements.

void AddLine(const char *line, size_t len)

void GetMD5Digest(unsigned char digest[16]) const

Return calculated MD5 digest.

#define LOG_POST(message)

This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...

void Error(CExceptionArgs_Base &args)

virtual const char * what(void) const noexcept

Standard report (includes full backlog).

static string GetTmpDir(void)

Get temporary directory.

virtual bool Exists(void) const

Check if directory "dirname" exists.

static char GetPathSeparator(void)

Get path separator symbol specific for the current platform.

bool Create(TCreateFlags flags=fCreate_Default) const

Create the directory using "dirname" passed in the constructor.

const string & GetPath(void) const

Get entry path.

long TFlags

binary OR of EFlags

@ fAddMods

Parse defline mods and add to SeqEntry.

@ fNoSeqData

Parse the deflines but skip the data.

@ fDisableParseRange

No ranges in seq-ids. Ranges part of seq-id instead.

static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)

Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...

EAccessionInfo

For IdentifyAccession (below)

static CSeq_id_Handle GetHandle(const CSeq_id &id)

Normal way of getting a handle, works for any seq-id.

string AsString(void) const

const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)

If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...

@ eGetId_Best

return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)

static CRef< CObjectManager > GetInstance(void)

Return the existing object manager or create one.

CBioseq_Handle GetBioseqHandle(const CSeq_id &id)

Get bioseq handle by seq-id.

@ eCoding_Iupac

Set coding to printable coding (Iupacna or Iupacaa)

bool CanGetRange(TSeqPos start, TSeqPos stop) const

Check if the sequence data is available for the interval [start, stop).

void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const

Fill the buffer string with the sequence data for the interval [start, stop).

const_iterator begin(void) const

const_iterator end(void) const

void Reset(void)

Reset reference object.

static TPid GetPid(void)

Get process identifier (pid) for the current process.

IO_PREFIX::ofstream CNcbiOfstream

Portable alias for ofstream.

CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)

Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)

const char * Endl(void)

Platform-specific EndOfLine.

NCBI_NS_STD::string::size_type SIZE_TYPE

static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)

Check if a string starts with a specified prefix value.

string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const

Transform time to string.

@ eCurrent

Use current time. See also CCurrentTime.

bool IsLocal(void) const

Check if variant Local is selected.

@ eMol_na

just a nucleic acid

unsigned int

A callback function used to compare two keys in a database.

The blob sat and sat key. Both must be positive integers. Non-empty string. The interpretation of the blob id depends on a processor. Cassandra processor expects the following format

static void hex(unsigned char c)

const struct ncbi::grid::netcache::search::fields::KEY key

Defines the CNcbiApplication and CAppException classes for creating NCBI applications.

Defines command line argument related classes.

Defines unified interface to application:

Defines a portable execute class.

std::istream & in(std::istream &in_, double &x_)

double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)

void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)

CRef< objects::CObjectManager > om


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4