RetroSearch Browse

);

315 unsigned int

main_nuc_content = 0, ambig_content = 0, bad_nuc_content = 0,

316

amino_acid_content = 0, exotic_aa_content = 0, bad_aa_content = 0;

318 for

(

unsigned i

= 0;

< length; ++

) {

319 unsigned char

c =

[

];

330

++amino_acid_content;

338 switch

(strictness) {

341 double

dna_content = (double)main_nuc_content / (

double

)length;

342 double

prot_content = (double)amino_acid_content / (

double

)length;

344 if

(dna_content > 0.7) {

347 if

(prot_content > 0.7) {

353 if

(bad_nuc_content + ambig_content <= main_nuc_content / 9

354

|| (bad_nuc_content + ambig_content <= main_nuc_content / 3 &&

355

bad_nuc_content <= (main_nuc_content + ambig_content) / 19)) {

358

}

else if

(bad_aa_content + exotic_aa_content

359

<= amino_acid_content / 9) {

365 if

(bad_nuc_content == 0 && ambig_content <= main_nuc_content / 3) {

367

}

else if

(bad_aa_content == 0

368

&& exotic_aa_content <= amino_acid_content / 9) {

399

, m_bOwnsStream(

true

)

400

, m_iTestBufferSize(0)

407 const string

& FileName )

408

: m_Stream( * new

CNcbiIfstream

( FileName.c_str(), ios::binary ) )

409

, m_bOwnsStream(

true

)

418

, m_bOwnsStream(

false

)

602 "CFormatGuess::x_TestFormat(): Unsupported format ID ("

612 "sm_FormatNames does not list all possible formats"

);

643 const

streamsize k_TestBufferGranularity = 8096;

662 if

(Multiplier >= 1024) {

714 while

( ! TestBuffer.fail() ) {

718 if

(!strLine.empty()) {

721 size_t size

= strLine.size();

722 bool

is_header =

> 0 && strLine[0] ==

'>'

;

723 for

(

size_t i

=0;

; ++

) {

724 unsigned char

c = strLine[

];

730 else if

(c ==

'{'

|| c ==

'}'

) {

778 if

(line.size()<minLength) {

783 for

(

auto

c : line) {

785 auto

index =

static_cast<int>

(c);

797 return

(nucCount/line.size() > 0.9);

814 bool

foundId =

false

;

837 unsigned int

uGtfLineCount = 0;

845 if

( it->empty() || (*it)[0] ==

'#'

) {

859 return

(uGtfLineCount != 0);

871 unsigned int

uGvfLineCount = 0;

879 if

( it->empty() || (*it)[0] ==

'#'

) {

896 return

(uGvfLineCount != 0);

909 unsigned int

uGffLineCount = 0;

920 if

( it->empty() || (*it)[0] ==

'#'

) {

934 return

(uGffLineCount != 0);

947 unsigned int

uGffLineCount = 0;

958 if

( it->empty() || (*it)[0] ==

'#'

) {

972 return

(uGffLineCount != 0);

985 unsigned int

uGffLineCount = 0;

993 if

( it->empty() || (*it)[0] ==

'#'

) {

1007 return

(uGffLineCount != 0);

1022 if

(it->empty() || (*it)[0] !=

'>'

) {

1068 const int

BUFFSIZE = 8096;

1084 bool

is_nexus =

false

;

1085 bool

has_trees =

false

;

1086 const size_t

check_size = 12;

1089 if

(

!= it->find(

"#NEXUS"

) ) {

1103 char

test_buf[

read_size

+ check_size + 1];

1104

memset(test_buf,

' '

, check_size);

1106 size_t

max_reads = 32768;

1107 for

(

size_t i

= 0;

< max_reads; ++

) {

1109 size_t

num_read =

m_Stream

.gcount();

1111

test_buf[num_read + check_size] = 0;

1119

strncpy(test_buf, test_buf + num_read, check_size);

1140 const size_t

maxSampleSize = 8*1024-1;

1141 size_t

sampleSize = 0;

1142 char

* pSample =

new char

[maxSampleSize+1];

1145 m_Stream

.read(pSample, maxSampleSize);

1146

sampleSize = (size_t)

m_Stream

.gcount();

1149 if

(0 == sampleSize) {

1153

pSample[sampleSize] = 0;

1184 return

(conf ==

eYes

);

1200

list<string>::const_iterator iter =

m_TestLines

.begin();

1205 if

(toks.size() != 1 ||

1206

toks.front().find_first_not_of(

"0123456789"

) != string::npos) {

1213 for

(

size_t i

= 1; iter !=

m_TestLines

.end(); ++

, ++iter) {

1216 if

(toks.size() !=

) {

1218

list<string>::const_iterator it = iter;

1225

list<string>::const_iterator it = toks.begin();

1226 for

(++it; it != toks.end(); ++it) {

1267 if

(it->find(

">Feature "

) != 0 && it->find(

">Features "

) != 0) {

1303 static const char

* known_types[] = {

1306 for

(

size_t i

=0;

ArraySize

(known_types); ++

) {

1340 if

(

!= it->find(

"#NEXUS"

) ) {

1351 for

(

auto

c : line) {

1368

vector<string> toks;

1370 const size_t

num_toks = toks.size();

1372 if

(num_toks != 2 &&

1377 const string

& seqdata = toks[1];

1380 unsigned int

cumulated_res = 0;

1381 if

(num_toks == 3) {

1383 if

(cumulated_res == 0) {

1396 if

(num_toks == 3) {

1397 size_t

num_gaps =

count

(seqdata.begin(), seqdata.end(),

'-'

);

1398 if

(((seqdata.size() - num_gaps) > cumulated_res)) {

1405

seg_length = seqdata.size();

1415 struct

SClustalBlockInfo

1418 unsigned int

m_Size;

1422 void

Reset(

void

) {

1428

SClustalBlockInfo() { Reset(); }

1446

SClustalBlockInfo block_info;

1448 bool

has_valid_block =

false

;

1449 size_t

seg_length = 0;

1450 size_t

seg_length_prev = 0;

1455 while

( !TestBuffer.eof() ) {

1463 if

(TestBuffer.fail()) {

1472 if

(block_info.m_InBlock) {

1473 if

(block_info.m_Size < 2) {

1482 if

(! block_info.m_InBlock || block_info.m_Size<2) {

1494 if

(seg_length > 60) {

1497 if

(block_info.m_InBlock) {

1498 if

(seg_length != seg_length_prev) {

1501

has_valid_block =

true

;

1504 if

(block_info.m_Ids.find(seq_id) != block_info.m_Ids.end()) {

1507

block_info.m_Ids.insert(seq_id);

1509

seg_length_prev = seg_length;

1510

block_info.m_InBlock =

true

;

1511

++(block_info.m_Size);

1514 return

has_valid_block;

1522

list<string>::const_iterator iter =

m_TestLines

.begin();

1528 for

(

size_t i

=5;

<7; ++

)

1535 if

(iter->empty() || (*iter)[0] ==

'#'

|| (*iter)[0] ==

';'

) {

1541

ncols = toks.size();

1552 if

(iter->empty() || (*iter)[0] ==

'#'

|| (*iter)[0] ==

';'

) {

1558 if

(toks.size() != ncols) {

1559

list<string>::const_iterator it = iter;

1568 for

(

const auto

& token : toks) {

1569 auto

it = find_if(token.begin(), token.end(),

1570

[](

unsigned char

c){ return !isprint(c); });

1571 if

(it != token.end()) {

1576 return

( nlines >= 3 );

1665 if

( dAlNumFraction < 0.8 ) {

1671 if

( dDnaFraction > 0.91 || dAaFraction > 0.91 ) {

1698 if

( dAlNumFraction < 0.80 ) {

1706 while

( ! TestBuffer.fail() ) {

1707

vector<string> Fields;

1713 return

( Fields.size() >= 2 && Fields[1] ==

"::="

isalpha

(Fields[0][0]));

1736 int

rsid, chr, pos, numMatched;

1737

numMatched = sscanf( it->c_str(),

"rs%d\t%d\t%d"

, &rsid, &chr, &pos);

1738 if

( numMatched == 3) {

1755 bool

bTrackLineFound(

false

);

1756 bool

bHasStartAndStop (

false

);

1757 size_t

columncount = 0;

1760 if

(

.empty() ) {

1765 if

(

.find(

"chr "

) == 0 ||

1766 str

.find(

"Chr "

) == 0 ||

1767 str

.find(

"CHR "

) == 0)

1775

bTrackLineFound =

true

;

1790 if

(

columns

.size() != columncount ) {

1791 if

( columncount == 0 ) {

1792

columncount =

columns

.size();

1801

bHasStartAndStop =

true

;

1806 return

(bHasStartAndStop || bTrackLineFound);

1818 bool

LineFound =

false

;

1819 size_t

columncount = 15;

1840 if

(

columns

.size() != columncount ) {

1851 if

(strand !=

"+"

&& strand !=

"-"

)

1898 const int

BUFFSIZE = 1024;

1911 unsigned int

uHgvsLineCount = 0;

1915 if

( it->empty() || (*it)[0] ==

'#'

) {

1923 return

(uHgvsLineCount != 0);

2098 bool

ignoreFirstColumn =

false

;

2099 unsigned int

uPslLineCount = 0;

2107 if

IsLinePsl

(*it, ignoreFirstColumn)) {

2108

ignoreFirstColumn =

true

;

2109 if

IsLinePsl

(*it, ignoreFirstColumn)) {

2116 if

( !

IsLinePsl

(*it, ignoreFirstColumn) ) {

2121 return

(uPslLineCount != 0);

2127

list<string>::iterator& lineIt,

2128

list<string>::iterator endIt,

2133 if

(lineIt == endIt) {

2136 if

(lineIt->size() > 79) {

2140

vector<int> validIndents = {0, 2, 3, 5, 12, 21};

2141 auto

firstNotBlank = lineIt->find_first_not_of(

" "

);

2142 while

(firstNotBlank != 0) {

2143 if

(std::find(validIndents.begin(), validIndents.end(), firstNotBlank) ==

2144

validIndents.end()) {

2145 auto

firstNotBlankOrDigit = lineIt->find_first_not_of(

" 1234567890"

);

2146 if

(firstNotBlankOrDigit != 10) {

2151 if

(lineIt == endIt) {

2154

firstNotBlank = lineIt->find_first_not_of(

" "

);

2184 string

keyword,

, lookingFor;

2190

lookingFor =

"LOCUS"

;

2191 if

(keyword != lookingFor) {

2199

lookingFor =

"DEFINITION"

;

2200 if

(keyword != lookingFor) {

2203 while

(keyword == lookingFor) {

2209

lookingFor =

"ACCESSION"

;

2210 if

(keyword != lookingFor) {

2213 while

(keyword == lookingFor) {

2219 bool

nidSeen =

false

;

2220

lookingFor =

"NID"

;

2221 if

(keyword == lookingFor) {

2228

lookingFor =

"VERSION"

;

2229 if

(keyword != lookingFor) {

2237

lookingFor =

"NID"

;

2238 if

(keyword == lookingFor) {

2245

lookingFor =

"PROJECT"

;

2246 while

(keyword == lookingFor) {

2252

lookingFor =

"DBLINK"

;

2253 while

(keyword == lookingFor) {

2259

lookingFor =

"KEYWORDS"

;

2260 if

(keyword != lookingFor) {

2272

list<string>::iterator& lineIt,

2273

list<string>::iterator endIt,

2281 if

(lineIt == endIt) {

2313 string

lineCode, lineData, lookingFor;

2320 if

(lineCode != lookingFor) {

2329 if

(lineCode != lookingFor) {

2332 while

(lineCode == lookingFor) {

2339 while

(lineCode == lookingFor) {

2346 for

(

int i

= 0;

< 2; ++

) {

2347 if

(lineCode != lookingFor) {

2356 if

(lineCode != lookingFor) {

2359 while

(lineCode == lookingFor) {

2366 if

(lineCode != lookingFor) {

2369 while

(lineCode == lookingFor) {

2376 if

(lineCode != lookingFor) {

2379 while

(lineCode == lookingFor) {

2386 if

(lineCode != lookingFor) {

2389 while

(lineCode == lookingFor) {

2402

list<string>::iterator& lineIt,

2403

list<string>::iterator endIt,

2408 if

(lineIt == endIt) {

2444 string

lineCode, lineData, lookingFor;

2451 if

(lineCode != lookingFor) {

2460 if

(lineCode != lookingFor) {

2463 while

(lineCode == lookingFor) {

2470 for

(

int i

= 0;

< 3; ++

) {

2471 if

(lineCode != lookingFor) {

2481 if

(lineCode != lookingFor) {

2484 while

(lineCode == lookingFor) {

2491 if

(lineCode !=

"GN"

&& lineCode !=

"OS"

) {

2531 if

(

limits

.size()%2 == 1) {

2534

testString +=

"\""

;

2535 limits

.push_back(testString.size()-1);

2541 string

complement =

""

;

2543 auto

it =

limits

.begin();

2544 size_t

comp_interval_start = 0;

2545 while

(it !=

limits

.end()) {

2546 const size_t

string_start = *it++;

2547 if

(string_start > comp_interval_start) {

2548 const size_t

comp_interval_length = string_start-comp_interval_start;

2549

complement += testString.substr(comp_interval_start, comp_interval_length);

2552 const size_t

string_stop = *it++;

2553

comp_interval_start = string_stop+1;

2556 if

(comp_interval_start < testString.size()) {

2557

complement += testString.substr(comp_interval_start);

2560

testString = complement;

2570 const string

& double_quotes = R

"(")"; 2572 bool

is_start =

true

;

2575 while

( pos !=

) {

2582

is_start = !is_start;

2589 size_t

s_GetPrecedingFslashCount(

const string

const size_t

pos)

2592

pos >=

.size() ||

2598 int

current_pos =

static_cast<int>

(pos)-1;

2599 size_t

num_fslash = 0;

2600 while

( current_pos >= 0 &&

[current_pos] ==

'\\'

) {

2612 const string

& double_quotes = R

"(")"; 2617 while

(pos !=

) {

2618 const size_t

num_fslash = s_GetPrecedingFslashCount(

, pos);

2621 if

(num_fslash%2 == 0) {

2638

list<string> subStrings;

2642 for

(

auto

it = subStrings.cbegin(); it != subStrings.cend(); ++it) {

2643 const string

subString = *it;

2647 if

(it == subStrings.cend()) {

2648

testString = subString;

2673 const string

extendedString = testString +

"0"

;

2697 const size_t

stringSize = testString.size();

2699 if

(stringSize > 4) {

2703 const string

nullString(

"null"

);

2704 const string

trueString(

"true"

);

2705 const string

falseString(

"false"

);

2707 if

(testString == nullString.substr(0, stringSize) ||

2708

testString == trueString.substr(0, stringSize) ||

2709

testString == falseString.substr(0, stringSize)) {

2725

list<string> numStrings;

2729 for

(

auto

numString : numStrings) {

2744 if

(testString.find_first_of(

"()"

) != string::npos) {

2748 const size_t

punctuation_threshold = 4;

2764 size_t

initial_len = testString.size();

2773 return

testString.size() - initial_len;

2793 const auto

next_pos = testString.find_first_not_of(

"( \t\r\n"

,1);

2794 if

(next_pos !=

&& testString[next_pos] ==

'\"'

) {

2860 string

labels_1st_line[] = {

"SW"

"perc"

"query"

"position"

"matching"

""

};

2861 string

labels_2nd_line[] = {

"score"

"div."

"del."

"ins."

"sequence"

""

};

2881 size_t

current_offset = 0;

2882 for

(

size_t i

=0; labels_1st_line[

] !=

""

; ++

) {

2883

current_offset =

NStr::FindCase

( *it, labels_1st_line[

], current_offset );

2884 if

( current_offset ==

) {

2897 for

(

size_t

j=0; labels_2nd_line[j] !=

""

; ++j ) {

2898

current_offset =

NStr::FindCase

( *it, labels_2nd_line[j], current_offset );

2899 if

( current_offset ==

) {

2948 const string

& cline )

2965 if

( line.empty() || line[0] !=

'('

) {

2971 bool

in_comment =

false

;

2972 for

(

size_t

ii=0; line.c_str()[ii] != 0; ++ii ) {

2973 if

( ! in_comment ) {

2974 if

( line.c_str()[ii] !=

'['

) {

2975

trimmed += line.c_str()[ii];

2982 if

( line.c_str()[ii] ==

']'

) {

2983

in_comment =

false

;

2992 bool

in_quote =

false

;

2993 for

(

size_t

ii=0; line.c_str()[ii] != 0; ++ii ) {

2995 if

( line.c_str()[ii] !=

'\''

) {

2996

trimmed += line.c_str()[ii];

3004 if

( line.c_str()[ii] ==

'\''

) {

3015 while

( line.c_str()[ii] != 0 ) {

3016 if

( line.c_str()[ii] !=

':'

) {

3017

trimmed += line.c_str()[ii++];

3021 if

( line.c_str()[ii] ==

'-'

|| line.c_str()[ii] ==

'+'

) {

3024 while

(

'0'

<= line.c_str()[ii] && line.c_str()[ii] <=

'9'

) {

3027 if

( line.c_str()[ii] ==

'.'

) {

3029 while

(

'0'

<= line.c_str()[ii] && line.c_str()[ii] <=

'9'

) {

3039 if

(line.empty() || line[0] !=

'('

) {

3042 size_t

paren_count = 1;

3043 for

(

size_t

ii=1; line.c_str()[ii] != 0; ++ii ) {

3044 switch

( line.c_str()[ii] ) {

3051 if

( paren_count == 0 ) {

3057 if

( paren_count == 0 ) {

3075 const string

& line )

3080 SIZE_TYPE

pos = line.find_first_not_of(

"0123456789 \t"

);

3081 if

(pos ==

|| pos + 45 >= line.size()) {

3086 char

c = line[pos +

];

3087 if

(

% 11 == 10) {

3092 if

( !

isalpha

'-'

&& c !=

'*'

) {

3104 const string

)

3109 if

(

.find_first_of(

"[]"

) ) {

3112 size_t

colon =

.find(

':'

);

3113 if

(

== colon ) {

3116 size_t

dot =

.find_first_not_of(

"0123456789"

, colon + 1 );

3117 if

(

== dot ) {

3120 if

(

[ dot ] !=

'.'

) {

3123 size_t

end =

.find_first_not_of(

"0123456789"

, dot + 1 );

3124 return

(

== end );

3130 const string

& strLine )

3136 string

line( strLine );

3137 size_t

uCommentStart =

NStr::Find

( line,

"#"

);

3139 if

(

!= uCommentStart ) {

3140

line = line.substr( 0, uCommentStart );

3143 if

( line.empty() ) {

3147

vector<string> tokens;

3152 if

( tokens[1].

() > 1 && tokens[1][0] ==

'-'

) {

3153

tokens[1][0] =

'1'

;

3159 if

( tokens[2].

() > 1 && tokens[2][0] ==

'-'

) {

3160

tokens[2][0] =

'1'

;

3166 if

( tokens[3].

() > 1 && tokens[3][0] ==

'-'

) {

3167

tokens[3][0] =

'1'

;

3173 if

( tokens[4].

() != 1 ||

== tokens[4].find_first_of(

"ADFGPNOW"

) ) {

3176 if

( tokens[4] ==

"N"

) {

3188 if

( tokens.size() != 9 ) {

3191 if

( tokens[8].

() != 1 ||

== tokens[8].find_first_of(

"+-"

) ) {

3202 const string

& line )

3206 if

(toks.size() != 5) {

3210

list<string>::iterator

= toks.begin();

3228 if

(frame < -3 || frame > 3) {

3243 const string

& line )

3245

vector<string> tokens;

3258 if

( tokens[6].

() != 1 ||

== tokens[6].find_first_of(

".+-"

) ) {

3261 if

( tokens[7].

() != 1 ||

== tokens[7].find_first_of(

".0123"

) ) {

3264 if

( tokens.size() < 9 ||

3265

(

== tokens[8].find(

"gene_id"

) &&

== tokens[8].find(

"transcript_id"

) ) ) {

3274 const string

& line )

3277

vector<string> tokens;

3290 bool

typeOk =

false

;

3292

terms.push_back(

"snv"

);

3293

terms.push_back(

"cnv"

);

3294

terms.push_back(

"copy_number_variation"

);

3295

terms.push_back(

"gain"

);

3296

terms.push_back(

"copy_number_gain"

);

3297

terms.push_back(

"loss"

);

3298

terms.push_back(

"copy_number_loss"

);

3299

terms.push_back(

"loss_of_heterozygosity"

);

3300

terms.push_back(

"complex"

);

3301

terms.push_back(

"complex_substitution"

);

3302

terms.push_back(

"complex_sequence_alteration"

);

3303

terms.push_back(

"indel"

);

3304

terms.push_back(

"insertion"

);

3305

terms.push_back(

"inversion"

);

3306

terms.push_back(

"substitution"

);

3307

terms.push_back(

"deletion"

);

3308

terms.push_back(

"duplication"

);

3309

terms.push_back(

"translocation"

);

3310

terms.push_back(

"upd"

);

3311

terms.push_back(

"uniparental_disomy"

);

3312

terms.push_back(

"maternal_uniparental_disomy"

);

3313

terms.push_back(

"paternal_uniparental_disomy"

);

3314

terms.push_back(

"tandom_duplication"

);

3315

terms.push_back(

"structural_variation"

);

3316

terms.push_back(

"sequence_alteration"

);

3317 ITERATE

(list<string>, termiter, terms) {

3330 if

( tokens[6].

() != 1 ||

== tokens[6].find_first_of(

".+-"

) ) {

3333 if

( tokens[7].

() != 1 ||

== tokens[7].find_first_of(

".0123"

) ) {

3338 string

attrs = tokens[8];

3339 if

(string::npos == attrs.find(

"ID="

))

3341 if

(string::npos == attrs.find(

"Variant_seq="

)) {

3350 const string

& line )

3352

vector<string> tokens;

3365 if

( tokens[6].

() != 1 ||

== tokens[6].find_first_of(

".+-?"

) ) {

3368 if

( tokens[7].

() != 1 ||

== tokens[7].find_first_of(

".0123"

) ) {

3371 if

( tokens.size() < 9 || tokens[8].empty()) {

3374 if

( tokens.size() >= 9 && tokens[8].size() > 1) {

3375 const string

& col9 = tokens[8];

3394 const string

& line )

3396

vector<string> tokens;

3397 string

remaining(line),

, tail;

3416 string

featureType =

;

3437 const string

legalStrands{

"+-.?"

};

3439

string::npos == legalStrands.find(

)) {

3445 const string

legalPhases{

".0123"

};

3447

string::npos == legalPhases.find(

)) {

3453 if

(remaining.empty()) {

3457 if

(featureType ==

"gene"

) {

3466 if

(featureType ==

"transcript"

) {

3487 const string

& line )

3489

vector<string> tokens;

3491 if

( num_cols < 8 ) {

3503 if

( tokens[6].

() != 1 ||

== tokens[6].find_first_of(

".+-"

) ) {

3506 if

( tokens[7].

() != 1 ||

== tokens[7].find_first_of(

".0123"

) ) {

3515 const string

& line )

3517

vector<string> values;

3525 if

( values[0] ==

"DNA"

) {

3532 if

( values[0] ==

"AS"

) {

3543 const string

& line )

3545 const size_t

MIN_VALUES_PER_RECORD = 14;

3550

list<string> values;

3560

list<string>::iterator it = values.begin();

3603 if

( *it !=

"+"

&& *it !=

"C"

) {

3617 const string

& line,

3618 bool

ignoreFirstLine)

3621

vector<string> tokens;

3622 int

firstColumn = (ignoreFirstLine ? 1 : 0);

3624 if

(tokens.size() - firstColumn != 21) {

3634 const string

& token = tokens[firstColumn + 8];

3635 if

(token.empty() || token.size() > 2) {

3638 if

(token.find_first_not_of(

"-+"

) != string::npos) {

3663

vector<string> hopefullyInts;

3665 if

(hopefullyInts.size() != blockCount) {

3668 for

(

auto

hopefulInt: hopefullyInts) {

3682 const

vector<string>& Fields )

3684 if

( Fields.size() == 0 ) {

3704 const size_t

MIN_HIGH_RATIO = 20;

3705 size_t

high_count = 0;

3711 if

( 0 < high_count &&

m_iTestDataSize

/ high_count < MIN_HIGH_RATIO ) {

3721 if

( string::npos !=

.find(

"\r\n"

) ) {

3724 else if

( string::npos !=

.find(

"\n"

) ) {

3727 else if

( string::npos !=