A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/compart__matching_8cpp_source.html below:

NCBI C++ ToolKit: src/algo/align/splign/compart_matching.cpp Source File

56  const size_t

kNrMersTotal (1 << (kNr * 2));

59  const double

kRepeatsPercentile (0.995);

68  const char

*kFileExt_Masked =

".rep"

;

71  const char

*kFileExt_Remap =

".idc"

;

72  const char

*kFileExt_Offsets =

".ofs"

;

73  const char

*kFileExt_Positions =

".pos"

;

75  const Uint8

kUI8_LoWord (0xFFFFFFFF);

76  const Uint8

kUI8_LoByte (kUI8_LoWord >> 24);

77  const Uint8

kUI8_HiWord (kUI8_LoWord << 32);

78  const Uint8

kUI8_LoFive (kUI8_LoWord | (kUI8_LoWord << 8));

80  const Uint8

kUI8_LoHalfWordEachByte ((

Uint8

(0x0F0F0F0F) << 32) | 0x0F0F0F0F);

82  const Uint4

kUI4_Lo28 (0xFFFFFFFF >> 4);

84  const Uint8

kUI8_SeqDb_lo (0x03030303);

85  const Uint8

kUI8_SeqDb ((kUI8_SeqDb_lo << 32) | kUI8_SeqDb_lo);

89  const size_t

kMapGran (512 * 1024 * 1024);

97  case

0: rv =

'A'

;

break

;

98  case

1: rv =

'C'

;

break

;

99  case

2: rv =

'G'

;

break

;

100  case

3: rv =

'T'

;

break

;

106 #ifdef ALGO_ALIGN_SPLIGN_QCOMP_DEBUG 108 template

<

typename

T>

109 string

GetBinString(

T

v)

113  const size_t

bitdim (

sizeof

(

T

) * 8);

114  for

(

size_t i

(0);

i

< bitdim; ++

i

) {

120  for

(

size_t i

(bitdim);

i

> 0; --

i

) {

121  if

(

i

< bitdim &&

i

% 8 == 0) {

135  Int8

reported_len (-1);

136  for

(

size_t

attempt(0); attempt < 1; ++attempt) {

138  if

(reported_len >= 0 &&

Uint8

(reported_len) == len_bytes) {

148  if

(reported_len < 0) {

149

ostr <<

"Cannot write "

<< filename

150

<<

" (error code = "

<< reported_len <<

"). "

;

153

ostr <<

"The size of "

<< filename <<

" ("

<< reported_len <<

')' 154

<<

" is different from the expected "

<< len_bytes <<

". "

;

156

ostr <<

"Please make sure there is enough disk space."

;

166 template

<

typename

T>

170  for

(

size_t i

(0), imax (4*

sizeof

(

T

)); i < imax; ++i, v >>= 2) {

171

rv = (rv << 2) | (v & 3);

182 template

<

typename

T>

191  for

(

Uint1 i

(1);

i

< 0xFF; ++

i

) {

202  for

(

size_t i

(0), imax(

sizeof

(

T

));

i

< imax; ++

i

) {

225  const Uint4

kMaxTwoBaseContent (14);

227

vector<Uint4> counts(4, 0);

229  for

(

Uint4

k = 0; k < 14; ++k) {

230

++counts[0x00000003 &

key

];

234  const Uint4

ag (counts[0] + counts[1]);

235  const Uint4

at (counts[0] + counts[2]);

236  const Uint4

ac (counts[0] + counts[3]);

237  const Uint4 gt

(counts[1] + counts[2]);

238  const Uint4

gc (counts[1] + counts[3]);

239  const Uint4

tc (counts[2] + counts[3]);

242

ag >= kMaxTwoBaseContent || at >= kMaxTwoBaseContent ||

243

ac >= kMaxTwoBaseContent ||

gt

>= kMaxTwoBaseContent ||

244

gc >= kMaxTwoBaseContent || tc >= kMaxTwoBaseContent;

250  string

dir, base, ext;

260 string ReplaceExt

(

const string

& extended_name,

const string

& new_ext)

262  string

dir, base, ext;

277 template

<

typename

VectorT>

283  const Uint8

len_bytes (v.size() *

sizeof

(TElem));

286  CNcbiOfstream

tempcgrfile (filename.c_str(), IOS_BASE::binary);

287

tempcgrfile.write((

const char

* ) & v.front(), len_bytes);

297 template

<

typename

VectorT>

306  CNcbiIfstream

tempcgrfile (filename.c_str(), IOS_BASE::binary);

307

tempcgrfile.read((

char

* ) & v.front(), dim *

sizeof

(TElem));

320  const string

sfx (

string

(strand?

".p"

:

".m"

) +

".v*"

);

321  const string

mask_ofs_q (

m_lbn_q

+ sfx + kFileExt_Offsets);

325  const string

filename ((*ii)->GetPath());

329  for

(

const Uint8

* p8 (

reinterpret_cast<const Uint8

*

>

(mf.

Map

())),

330

* p8e (p8 + vol_length / 8); p8 != p8e;

351

ostr <<

"Sequence volumes with total length exceeding " 353

<<

" are not yet supported. Please split your FASTA file and re-run " 359  Uint4

current_offset (0);

361

unique_ptr<vector<Uint4> > pNrCounts (

new

vector<Uint4> (kNrMersTotal, 0));

362

vector<Uint4> & NrCounts (*pNrCounts);

364

cerr <<

" Scanning "

<< subjdb->

GetNumSeqs

() <<

" genomic sequences ... "

;

367  const char

* pcb (0);

369  const char

* pcbe (pcb + bases / 4);

372  if

(npcb < npcb0) npcb += 8;

373  const Uint8

gcbase (4*(npcb - npcb0));

374  const Uint8

* pui8 (

reinterpret_cast<const Uint8

*

>

(npcb));

375  const Uint8

* pui8_e (

reinterpret_cast<const Uint8

*

>

(pcbe));

377  for

(

size_t

gccur (current_offset + gcbase); pui8 < pui8_e; ++pui8) {

383 #define QCOMP_COUNT_NrMERS {{ \ 384  if(gccur + 16 >= current_offset + bases) { \ 387  const Uint4 mer4 (ui8 & kUI8_LoWord); \ 388  ++NrCounts[mer4 >> 4]; \ 398  if

(pui8 + 1 < pui8_e) {

400

ui8 |= (*(pui8 + 1) << 32);

408 #undef QCOMP_COUNT_NMERS 413

current_offset += bases;

418

cerr <<

"Ok"

<< endl;

419

cerr <<

" Constructing FV ... "

;

421  string

filename_temp_01;

422

unique_ptr<vector<Uint4> > pNrCounts2 (

new

vector<Uint4>);

423

vector<Uint4> & NrCounts2 (* pNrCounts2);

425

NrCounts2 = NrCounts;

433  size_t

total_mers (0);

434  ITERATE

(vector<Uint4>, ii, NrCounts) {

435  if

(*ii > 0) ++total_mers;

437  const size_t

percentile ((

size_t

)(kNrMersTotal -

438

total_mers * (1 - kRepeatsPercentile)));

439

nth_element(NrCounts.begin(), NrCounts.begin() + percentile, NrCounts.end());

440  const Uint4

max_repeat_count (NrCounts[percentile]);

442  if

(filename_temp_01.empty()) {

443

NrCounts = NrCounts2;

451  for

(

size_t i

(0);

i

< kNrMersTotal; ++

i

) {

461  const Uint8

len_bytes (kNrMersTotal / 8);

470  copy

(src, src + kNrMersTotal / 64, dest);

477

cerr <<

" Reading/transforming FV ... "

;

489  for

(

size_t i

(0);

i

< kNrMersTotal; ++

i

) {

490  if

(nrmers_plus.

get_at

(

i

)) {

492  const size_t

irc (g_RC(

Uint4

(

i

) << 4) & kUI4_Lo28);

498

cerr <<

"Ok"

<< endl;

509  const size_t

num_seqs (blastdb.

GetNumSeqs

());

510

seq_infos.reserve(num_seqs);

512  Uint4

current_offset (0);

516  if

(seqlen <= 0 ||

size_t

(seqlen) >= 0xFFFFFFFF) {

518

ostr <<

"Cannot create remap data for:\t"

<<

519

blastdb.

GetSeqIDs

(oid).front()->GetSeqIdString(

true

);

524  const Uint4

bases (seqlen);

525

seq_infos.push_back(

SSeqInfo

(current_offset, bases, oid));

526

current_offset += bases;

534  CNcbiOfstream

ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);

535  const Uint8

len_bytes (seq_infos.size() *

sizeof

(

SSeqInfo

));

536

ofstr_remap.write((

const char

*) &seq_infos.front(), len_bytes);

541

cerr <<

" Remap data created for "

<< db <<

"; max offset = " 542

<< current_offset << endl;

551

seq_infos.reserve(num_seqs);

553  Uint4

current_offset (0);

557  if

(seqlen <= 0 ||

size_t

(seqlen) >= 0xFFFFFFFF) {

559

ostr <<

"Cannot create remap data for:\t"

<<

565  const Uint4

bases (seqlen);

567

current_offset += bases;

575  CNcbiOfstream

ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);

576  const Uint8

len_bytes (seq_infos.size() *

sizeof

(

SSeqInfo

));

577

ofstr_remap.write((

const char

*) &seq_infos.front(), len_bytes);

582

cerr <<

" Remap data created for sequences; max offset = " 583

<< current_offset << endl;

592

cerr <<

" Scanning "

<< db <<

" for N-mers and their positions."

<< endl;

599

vector<Uint8> MersAndCoords (mcidx_max, 0);

601  size_t

current_offset (0);

610

ostr <<

"Sequence volumes with total length exceeding " 612

<<

" are not yet supported. Please split your FASTA file and re-run " 621  const char

* pcb (0);

623  const char

* pce (pcb + bases/4);

626  if

(npcb < npcb0) npcb += 8;

627  const Uint8

gcbase (4*(npcb - npcb0));

628  const Uint8

* pui8_start (

reinterpret_cast<const Uint8

*

>

(npcb));

629  const Uint8

* pui8 (pui8_start);

637  if

(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {

638

MersAndCoords.resize(mcidx);

640

MersAndCoords.assign(mcidx_max, 0);

647  for

(

size_t

gccur (current_offset + gcbase);

648

gccur + 16 < current_offset + bases && mcidx < mcidx_max;

653 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \ 654  size_t gccur2 (gccur + 2); \ 655  const Uint8 ui8_2930 (w8 >> 60); \ 656  Uint8 ui8_ls4 (w8 << 4); \ 657  const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \ 658  ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \ 659  ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48); 663 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \ 665  if(gccur + 16 >= current_offset + bases) { \ 668  const Uint8 mer (w8 & kUI8_LoWord); \ 670  if(m_Mers.get_at(mer)) { \ 671  MersAndCoords[mcidx++] = (mer << 32) | gccur; \ 675  const Uint4 rc (g_RC(Uint4(mer))); \ 676  if(m_Mers.get_at(rc)) { \ 677  MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \ 678  ((current_offset + bases - gccur - 16) \ 696  if

(gccur + 32 >= current_offset + bases) {

701

w8 |= ((*(pui8 + 1)) << 32);

715 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX 716 #undef QCOMP_CREATE_GENOMIC_IDX 725  for

(

const char

* p (pcb),

726

*pche ( (

reinterpret_cast<const char

*

>

(pui8)) + 5 );

729  const Uint8

c8 (*p & kUI8_LoByte);

730  const Uint8

ui8curbyte (c8);

732

fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);

736  for

(

Uint4

k(0); k < 4; ++k) {

738  const Uint4

mer (fivebytes & kUI8_LoWord);

739  if

(

m_Mers

.

get_at

(strand? (mer >> 4): (mer & kUI4_Lo28))) {

740  const Uint8

ui8 (mer);

741  const Uint8

coord (current_offset +

742

4*(p - pcb - 5) + k);

743

MersAndCoords[mcidx++] = (ui8 << 32) | coord;

746

fivebytes &= kUI8_LoFive;

748  const Uint8

ui8m ((fivebytes & kUI8_SeqDb) >> 16);

749

fivebytes &= ~kUI8_SeqDb;

752

fivebytes |= ui8curbyte << 32;

758  Uint8

gccur (current_offset + gcbase);

759  for

(; gccur + 32 < current_offset + bases; ++pui8)

763  for

(

Uint8

gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {

765  const Uint8

loword = ui8 & kUI8_LoWord;

767

(loword & kUI4_Lo28)))

769

MersAndCoords[mcidx++] = (loword << 32) | gccur;

772  const Uint8

ui8hi2 ((ui8 >> 62) << 48);

774  const Uint8

ui8m ((ui8 & kUI8_SeqDb) >> 16);

776

ui8 |= (ui8m | ui8hi2);

779  if

(gccur + 32 < current_offset + bases) {

783  const Uint4

* pui4 (

reinterpret_cast<const Uint4

*

>

(

n

));

786

ui8 |= (ui8_4 << 32);

788  for

(

Uint8

gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {

790  const Uint8

loword = ui8 & kUI8_LoWord;

792

(loword & kUI4_Lo28)))

794

MersAndCoords[mcidx++] = (loword << 32) | gccur;

797  const Uint8

ui8hi2 ((ui8 >> 62) << 48);

799  const Uint8

ui8m ((ui8 & kUI8_SeqDb) >> 16);

801

ui8 |= (ui8m | ui8hi2);

807  if

(gccur + 16 <= current_offset + bases) {

809

fivebytes = (gccur == current_offset + gcbase)? fivebytes:

811  const char

* p (

reinterpret_cast<const char

*

>

(pui8_start) + 4

812

+ (gccur - current_offset - gcbase) / 4);

815  const Uint8

loword = fivebytes & kUI8_LoWord;

818

(loword & kUI4_Lo28)))

820

MersAndCoords[mcidx++] = (loword << 32) | gccur;

825  const Uint8

ui8 (*p++ & kUI8_LoByte);

826

fivebytes |= (ui8 << 32);

833

fivebytes &= kUI8_LoFive;

835  const Uint8

ui8m ((fivebytes & kUI8_SeqDb) >> 16);

836

fivebytes &= ~kUI8_SeqDb;

850

current_offset += bases;

852  if

(mcidx >= mcidx_max) {

854

ostr <<

"Selected max volume size is too small: " 855

<<

"it must be large enough to fit the index for the " 856

<<

"longest input sequence."

;

865

MersAndCoords.resize(mcidx);

878

cerr <<

" Scanning sequences for N-mers and their positions."

<< endl;

885

vector<Uint8> MersAndCoords (mcidx_max, 0);

887  size_t

current_offset (0);

895

ostr <<

"Sequence volumes with total length exceeding " 897

<<

" are not yet supported. Please split your FASTA file and re-run " 906  const char

* pcb (0);

908  const char

* pce (pcb + bases/4);

911  if

(npcb < npcb0) npcb += 8;

912  const Uint8

gcbase (4*(npcb - npcb0));

913  const Uint8

* pui8_start (

reinterpret_cast<const Uint8

*

>

(npcb));

914  const Uint8

* pui8 (pui8_start);

922  if

(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {

923

MersAndCoords.resize(mcidx);

925

MersAndCoords.assign(mcidx_max, 0);

932  for

(

size_t

gccur (current_offset + gcbase);

933

gccur + 16 < current_offset + bases && mcidx < mcidx_max;

938 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \ 939  size_t gccur2 (gccur + 2); \ 940  const Uint8 ui8_2930 (w8 >> 60); \ 941  Uint8 ui8_ls4 (w8 << 4); \ 942  const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \ 943  ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \ 944  ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48); 948 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \ 950  if(gccur + 16 >= current_offset + bases) { \ 953  const Uint8 mer (w8 & kUI8_LoWord); \ 955  if(m_Mers.get_at(mer)) { \ 956  MersAndCoords[mcidx++] = (mer << 32) | gccur; \ 960  const Uint4 rc (g_RC(Uint4(mer))); \ 961  if(m_Mers.get_at(rc)) { \ 962  MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \ 963  ((current_offset + bases - gccur - 16) \ 981  if

(gccur + 32 >= current_offset + bases) {

986

w8 |= ((*(pui8 + 1)) << 32);

1000 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX 1001 #undef QCOMP_CREATE_GENOMIC_IDX 1009  Uint8

fivebytes (0);

1010  for

(

const char

* p (pcb),

1011

*pche ( (

reinterpret_cast<const char

*

>

(pui8)) + 5 );

1014  const Uint8

c8 (*p & kUI8_LoByte);

1015  const Uint8

ui8curbyte (c8);

1017

fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);

1021  for

(

Uint4

k(0); k < 4; ++k) {

1023  const Uint4

mer (fivebytes & kUI8_LoWord);

1024  if

(

m_Mers

.

get_at

(strand? (mer >> 4): (mer & kUI4_Lo28))) {

1025  const Uint8

ui8 (mer);

1026  const Uint8

coord (current_offset +

1027

4*(p - pcb - 5) + k);

1028

MersAndCoords[mcidx++] = (ui8 << 32) | coord;

1031

fivebytes &= kUI8_LoFive;

1033  const Uint8

ui8m ((fivebytes & kUI8_SeqDb) >> 16);

1034

fivebytes &= ~kUI8_SeqDb;

1037

fivebytes |= ui8curbyte << 32;

1043  Uint8

gccur (current_offset + gcbase);

1044  for

(; gccur + 32 < current_offset + bases; ++pui8)

1048  for

(

Uint8

gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {

1050  const Uint8

loword = ui8 & kUI8_LoWord;

1052

(loword & kUI4_Lo28)))

1054

MersAndCoords[mcidx++] = (loword << 32) | gccur;

1057  const Uint8

ui8hi2 ((ui8 >> 62) << 48);

1059  const Uint8

ui8m ((ui8 & kUI8_SeqDb) >> 16);

1061

ui8 |= (ui8m | ui8hi2);

1064  if

(gccur + 32 < current_offset + bases) {

1068  const Uint4

* pui4 (

reinterpret_cast<const Uint4

*

>

(

n

));

1071

ui8 |= (ui8_4 << 32);

1073  for

(

Uint8

gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {

1075  const Uint8

loword = ui8 & kUI8_LoWord;

1077

(loword & kUI4_Lo28)))

1079

MersAndCoords[mcidx++] = (loword << 32) | gccur;

1082  const Uint8

ui8hi2 ((ui8 >> 62) << 48);

1084  const Uint8

ui8m ((ui8 & kUI8_SeqDb) >> 16);

1086

ui8 |= (ui8m | ui8hi2);

1092  if

(gccur + 16 <= current_offset + bases) {

1094

fivebytes = (gccur == current_offset + gcbase)? fivebytes:

1095

(ui8 & kUI8_LoWord);

1096  const char

* p (

reinterpret_cast<const char

*

>

(pui8_start) + 4

1097

+ (gccur - current_offset - gcbase) / 4);

1100  const Uint8

loword = fivebytes & kUI8_LoWord;

1103

(loword & kUI4_Lo28)))

1105

MersAndCoords[mcidx++] = (loword << 32) | gccur;

1110  const Uint8

ui8 (*p++ & kUI8_LoByte);

1111

fivebytes |= (ui8 << 32);

1118

fivebytes &= kUI8_LoFive;

1120  const Uint8

ui8m ((fivebytes & kUI8_SeqDb) >> 16);

1121

fivebytes &= ~kUI8_SeqDb;

1135

current_offset += bases;

1137  if

(mcidx >= mcidx_max) {

1139

ostr <<

"Selected max volume size is too small: " 1140

<<

"it must be large enough to fit the index for the " 1141

<<

"longest input sequence."

;

1148

MersAndCoords.resize(mcidx);

1161

vector<Uint8>& MersAndCoords)

1163  const size_t

t0 (time(0));

1175  CNcbiOfstream

ofstr_offs (filename_offs.c_str(), IOS_BASE::binary);

1176  Uint8

bytes_offs (0);

1179  CNcbiOfstream

ofstr_positions (filename_positions.c_str(), IOS_BASE::binary);

1180  Uint8

bytes_positions (0);

1182

cerr <<

" Generating index volume: "

<<

basename

<<

" ... "

;

1187  sort

(MersAndCoords.begin(), MersAndCoords.end());

1189  ITERATE

(vector<Uint8>, ii, MersAndCoords) {

1191  const Uint4

mer ((*ii & kUI8_HiWord) >> 32);

1192  if

(mer != curmer) {

1194

ofstr_offs.write((

const char

*) &mer,

sizeof

mer);

1195

ofstr_offs.write((

const char

*) &curofs,

sizeof

curofs);

1197

bytes_offs +=

sizeof

(mer) +

sizeof

(curofs);

1199  const Uint4

pos (*ii & kUI8_LoWord);

1201

ofstr_positions.write((

const char

*) &pos,

sizeof

pos);

1202

bytes_positions +=

sizeof

(pos);

1207  const Uint4

mer (0);

1208

ofstr_offs.write((

const char

*)&mer,

sizeof

mer);

1209

ofstr_offs.write((

const char

*)&curofs,

sizeof

curofs);

1210

bytes_offs +=

sizeof

(mer) +

sizeof

(curofs);

1213

ofstr_positions.close();

1218

cerr <<

"Ok"

<< endl;

1220  return

time(0) - t0;

1223 #define CHECK_MEMMAP(mm,ofs,len) \ 1225  const size_t ofs1 (mm.GetOffset()); \ 1227  cerr << "Real offset "

<< ofs1 << " different from " << ofs << endl; \

1229

const size_t len1 (mm.GetSize()); \

1231

cerr << "Real length " << len1 << " different from " << len << endl; \

1240

cerr <<

" Matching (strand = "

<< (strand?

"plus"

:

"minus"

) <<

") ... "

;

1245  const string

sfx (

string

(strand?

".p"

:

".m"

) +

".v*"

);

1247  const string

mask_ofs_q (

m_lbn_q

+ sfx + kFileExt_Offsets);

1248  const string

mask_ofs_s (

m_lbn_s

+ sfx + kFileExt_Offsets);

1252  Uint8

elem_hits_total (0);

1256  const string

filename_ofs_s ((*ii_s)->GetPath());

1257  const string

filename_pos_s (

ReplaceExt

(filename_ofs_s, kFileExt_Positions));

1258  const CFile

file_subj (filename_ofs_s);

1259  const size_t

dim_ofs_s (file_subj.

GetLength

() / 8);

1263  const string

filename_ofs_q ((*ii_q)->GetPath());

1264  const string

filename_pos_q (

ReplaceExt

(filename_ofs_q,

1265

kFileExt_Positions));

1267  Uint8

hit_index_dim (0), elem_hits_this_pair (0);

1268  string

filename_hit_index;

1274  const Uint8

* ofs_s (

reinterpret_cast<const Uint8

*

> 1275

(mf_ofs_s.

Map

()));

1277  const CFile

file_query ((*ii_q)->GetPath());

1278  const size_t

dim_ofs_q (file_query.

GetLength

() / 8);

1281  const Uint8

* ofs_q (

reinterpret_cast<const Uint8

*

> 1282

(mf_ofs_q.

Map

()));

1286

hit_index.reserve(

min

(dim_ofs_q, dim_ofs_s));

1288  while

((*ofs_q & kUI8_LoWord) && (*ofs_s & kUI8_LoWord)) {

1290  while

( (*ofs_q & kUI8_LoWord)

1291

&& ((*ofs_q & kUI8_LoWord) < (*ofs_s & kUI8_LoWord)))

1295  if

((*ofs_q & kUI8_LoWord) == 0)

break

;

1297  while

( (*ofs_s & kUI8_LoWord)

1298

&& ((*ofs_s & kUI8_LoWord) < (*ofs_q & kUI8_LoWord)))

1302  if

((*ofs_s & kUI8_LoWord) == 0)

break

;

1304  if

((*ofs_s & kUI8_LoWord) == (*ofs_q & kUI8_LoWord)) {

1326

hit_index_dim = (hit_index.size());

1327

elem_hits_total += elem_hits_this_pair;

1332  const CFile

file_pos_s (filename_pos_s);

1333  const size_t

dim_pos_s (file_pos_s.

GetLength

() / 4);

1336  size_t

map_offset_s (0);

1337  size_t

map_length_s (

min

(kMapGran, 4*dim_pos_s - map_offset_s));

1338  const Uint4

* pos_s (

reinterpret_cast<const Uint4

*

>

(

1339

mf_pos_s.

Map

(map_offset_s, map_length_s)));

1341  const CFile

file_pos_q (filename_pos_q);

1342  const size_t

dim_pos_q (file_pos_q.

GetLength

() / 4);

1345  size_t

map_offset_q (0);

1346  size_t

map_length_q (

min

(kMapGran, 4*dim_pos_q - map_offset_q));

1347  const Uint4

* pos_q (

reinterpret_cast<const Uint4

*

>

(

1348

mf_pos_q.

Map

(map_offset_q, map_length_q)));

1351

vector<Uint8> hits (elem_hits_this_pair);

1354  CNcbiIfstream

ifstr (filename_hit_index.c_str(), IOS_BASE::binary);

1355  for

(

size_t cnt

(0);

cnt

< hit_index_dim; ++

cnt

) {

1358

ifstr.read((

char

*) &hie,

sizeof

(hie));

1362  if

(idx_s_max > dim_pos_s || idx_q_max > dim_pos_q) {

1366  if

(4*idx_s_max >= map_offset_s + map_length_s) {

1369

map_length_s =

min

(kMapGran, 4*dim_pos_s - map_offset_s);

1371

pos_s = (

reinterpret_cast<const Uint4

*

> 1372

(mf_pos_s.

Map

(map_offset_s, map_length_s)));

1375  if

(4*idx_q_max >= map_offset_q + map_length_q) {

1378

map_length_q =

min

(kMapGran, 4*dim_pos_q - map_offset_q);

1380

pos_q = (

reinterpret_cast<const Uint4

*

> 1381

(mf_pos_q.

Map

(map_offset_q, map_length_q)));

1384  for

(

size_t

idx_s (hie.

m_SubjOfs

); idx_s < idx_s_max; ++idx_s) {

1386  Uint8

hiword = pos_s[idx_s - map_offset_s/4];

1388  for

(

size_t

idx_q (hie.

m_QueryOfs

); idx_q < idx_q_max; ++idx_q) {

1390  const Uint8

loword = pos_q[idx_q - map_offset_q/4];

1391

hits[hidx++] = (hiword | loword);

1396  if

(hidx != elem_hits_this_pair) {

1398

ostr <<

"The number of hits found ("

<< hidx

1399

<<

") does not match the expected "

<< elem_hits_this_pair;

1414

cerr <<

"Ok"

<< endl;

1422  return

(lhs & kUI8_LoWord) < (rhs & kUI8_LoWord);

1428  const double

lhs_q (

double

(lhs & kUI8_LoWord) / 2);

1429  const double

lhs_s (

double

(lhs >> 32) / 2);

1430  const double

rhs_q (

double

(rhs & kUI8_LoWord) / 2);

1431  const double

rhs_s (

double

(rhs >> 32) / 2);

1433  const double

lhs_q1 (lhs_q + lhs_s);

1434  const double

lhs_s1 (lhs_s - lhs_q);

1435  const double

rhs_q1 (rhs_q + rhs_s);

1436  const double

rhs_s1 (rhs_s - rhs_q);

1438  if

(lhs_s1 == rhs_s1) {

1439  return

lhs_q1 < rhs_q1;

1442  return

lhs_s1 < rhs_s1;

1459  const string mask

(lbn +

"*"

+ *ii);

1462

fdl.

Add

((*jj)->GetPath());

1470  if

(phits->size() == 0) {

1475  sort

(phits->begin(), phits->end());

1479

TSeqInfos::const_iterator ii_genomic (ii_genomic_b);

1487  size_t

idx_compacted (0);

1488  for

(

size_t

idx (0), idx_hi (phits->size()); idx < idx_hi; ) {

1490  Uint4

gc_s (((*phits)[idx]) >> 32);

1493

ii_genomic = lower_bound(ii_genomic + 1, ii_genomic_e,

SSeqInfo

(gc_s, 0, 0));

1494  if

(ii_genomic == ii_genomic_e || ii_genomic->m_Start > gc_s) {

1498  if

(gc_s < ii_genomic->m_Start ||

1499

ii_genomic->m_Start + ii_genomic->m_Length <= gc_s)

1502

ostr <<

"Global genomic coordinate " 1503

<< gc_s <<

" out of range: [" 1504

<< ii_genomic->m_Start <<

", " 1505

<< (ii_genomic->m_Start + ii_genomic->m_Length) <<

"), " 1511  const Uint4

gc_s_max (ii_genomic->m_Start + ii_genomic->m_Length);

1518  while

(idx < idx_hi && (((*phits)[idx]) >> 32) < gc_s_max) {

1523  sort

(phits->begin() + idx0, phits->begin() + idx,

PLoWord

);

1526

TSeqInfos::const_iterator ii_cdna (ii_cdna_b);

1527  Uint4

gc_q_min (ii_cdna->m_Start);

1528  Uint4

gc_q_max (ii_cdna->m_Start + ii_cdna->m_Length);

1529  size_t

jdx (idx0), jdx0 (jdx);

1530  for

(; jdx < idx; ++jdx) {

1532  Uint4

gc_q (((*phits)[jdx]) & kUI8_LoWord);

1534  if

(gc_q < gc_q_min || gc_q >= gc_q_max) {

1544  if

(

kN

* (jdx - jdx0) >= min_matches / 2) {

1549

jdx0, jdx, &idx_compacted);

1554

ii_cdna = lower_bound(ii_cdna + 1, ii_cdna_e,

SSeqInfo

(gc_q, 0, 0));

1556  if

(ii_cdna == ii_cdna_e || ii_cdna->m_Start > gc_q) {

1560  if

(gc_q < ii_cdna->m_Start ||

1561

ii_cdna->m_Start + ii_cdna->m_Length <= gc_q)

1564

ostr <<

"Global cDNA coordinate " 1565

<< gc_q <<

" out of range: [" 1566

<< ii_cdna->m_Start <<

", " 1567

<< (ii_cdna->m_Start + ii_cdna->m_Length) <<

"), " 1568

<< (

m_cDNASeqIds

[ii_cdna->m_Oid]->GetSeqIdString(

true

));

1573

gc_q_min = ii_cdna->m_Start;

1574

gc_q_max = ii_cdna->m_Start + ii_cdna->m_Length;

1587  if

(

kN

* (jdx - jdx0) >= min_matches / 2) {

1592

jdx0, jdx, &idx_compacted);

1613  const Int8

& right_limit0,

1616  const int

Wm (1), Wms (-2);

1617  int

score (

int

(hitref->GetLength()) * Wm

1618

+

int

(hitref->GetMismatches()) * (Wms - Wm));

1619  int

score_max (score);

1621  const int

overrun (6);

1622  const Int8

left_limit (left_limit0 >= overrun? left_limit0 - overrun: 0);

1623  const Int8

right_limit (right_limit0 >= kDiagMax - overrun?

1624

kDiagMax: right_limit0 + overrun);

1627  Int4

q0 (hitref->GetQueryStart()), s0 (hitref->GetSubjStart());

1628  Uint4

mm (0), mm0 (0);

1629  bool

no_overrun_yet (

true

);

1630  for

(

Int4

q (q0 - 1), s (s0 - 1);

1631

(q + s > left_limit && score +

m_XDropOff

>= score_max

1636  if

(q + s == left_limit0) {

1637

no_overrun_yet =

false

;

1650  if

(score > score_max) {

1654  if

(no_overrun_yet) {

1666  while

(q0 + s0 <= left_limit0) {++q0; ++s0;}

1668  bool

extended_left (

false

);

1669  if

(q0 < hitref->GetQueryStart()) {

1670

hitref->SetQueryStart(q0);

1671

hitref->SetSubjStart(s0);

1672

extended_left =

true

;

1676

q0 = (hitref->GetQueryStop());

1677

s0 = (hitref->GetSubjStop());

1681

no_overrun_yet =

true

;

1683  for

(

Int4

q (q0 + 1), s (s0 + 1);

1684

(q + s < right_limit && score + m_XDropOff >= score_max

1689  if

(q + s == right_limit0) {

1690

no_overrun_yet =

false

;

1703  if

(score > score_max) {

1707  if

(no_overrun_yet) {

1719  while

(q0 + s0 >= right_limit0) {--q0; --s0; }

1721  bool

extended_right (

false

);

1722  if

(q0 > hitref->GetQueryStop()) {

1723

hitref->SetQueryStop(q0);

1724

hitref->SetSubjStop(s0);

1725

extended_right =

true

;

1728  if

(extended_left || extended_right) {

1730

hitref->SetMismatches(mm0);

1731  const THit::TCoord len

(hitref->GetQueryStop() - hitref->GetQueryStart() + 1);

1732

hitref->SetLength(

len

);

1733

hitref->SetIdentity((

len

- mm0) /

double

(

len

));

1734

hitref->SetScore(2 *

len

);

1742

TSeqInfos::const_iterator ii_cdna,

1743

TSeqInfos::const_iterator ii_genomic,

1746  size_t

* pidx_compacted)

1748  if

(idx_start >= idx_stop) {

1757  sort

(pvol->begin() + idx_start, pvol->begin() + idx_stop,

PDiag

);

1759  const size_t

idx_compacted_start (*pidx_compacted);

1762

lens.reserve(pvol->size() / 2);

1764  Uint8

qs0 ((*pvol)[idx_start]);

1765  Uint4

q0 (qs0 & kUI8_LoWord);

1766  Uint4

s0 (qs0 >> 32);

1767

(*pvol)[(*pidx_compacted)++] = qs0;

1770  for

(

size_t

idx (idx_start + 1); idx < idx_stop; ++idx) {

1772  const Uint8

qs ((*pvol)[idx]);

1773  const Uint4

q (qs & kUI8_LoWord);

1774  const Uint4

s (qs >> 32);

1776  if

((q0 >= s0 && q0 - s0 == q - s && q <= q0 + 16) ||

1777

(q0 < s0 && s0 - q0 == s - q && q <= q0 + 16) )

1779

lens.back() += q - q0;

1782

(*pvol)[(*pidx_compacted)++] = qs;

1792  for

(

size_t

idx (idx_compacted_start); idx < *pidx_compacted; ++idx) {

1794  const Uint8

qs ((*pvol)[idx]);

1795  const Uint4

q (qs & kUI8_LoWord);

1796  const Uint4

s (qs >> 32);

1799

hitref->SetQueryStart(q);

1800

hitref->SetSubjStart(s);

1801  const Uint4 len

(lens[idx - idx_compacted_start]);

1802

hitref->SetQueryStop(q +

len

- 1);

1803

hitref->SetSubjStop(s +

len

- 1);

1804

hitref->SetMismatches(0);

1806

hitref->SetEValue(0);

1807

hitref->SetIdentity(1.0);

1808

hitref->SetLength(

len

);

1809

hitref->SetScore(2*

len

);

1810

hitrefs[idx - idx_compacted_start] = hitref;

1813 #define EXTEND_USING_SEQUENCE_CHARS 1814 #ifdef EXTEND_USING_SEQUENCE_CHARS 1818  Int8

left_limit (0);

1819  Int8

right_limit (kDiagMax);

1820  Int8

s_prime_cur (0);

1822  const size_t

kn (hitrefs.size());

1823  for

(

size_t

k(0); k < kn; ++k) {

1828  if

(

Int8

(box[2]) -

Int8

(box[0]) != s_prime_cur) {

1829

left_limit = box[0];

1830

s_prime_cur =

Int8

(box[2]) -

Int8

(box[0]);

1833

right_limit = kDiagMax;

1836  if

(

Int8

(boX[2]) -

Int8

(boX[0]) == s_prime_cur) {

1837

right_limit = boX[0] + boX[2];

1841

left_limit =

x_ExtendHit

(left_limit, right_limit, hitrefs[k]);

1846  Int8

rlimit (0), spc (0);

1847  for

(

size_t

k (0); k < kn; ++k) {

1849  const THit::TCoord

cur_diag_stop (hitrefs[k]->GetQueryStop()

1850

+ hitrefs[k]->GetSubjStop());

1852  const Int8

s_prime (-

Int8

(hitrefs[k]->GetQueryStart())

1853

+

Int8

(hitrefs[k]->GetSubjStart()));

1854  if

(k == 0 || spc != s_prime) {

1856

hitrefs[++d] = hitrefs[k];

1857

rlimit = cur_diag_stop;

1862  const THit::TCoord

cur_diag_start (hitrefs[k]->GetQueryStart()

1863

+ hitrefs[k]->GetSubjStart());

1867  if

(rlimit + 2 == cur_diag_start) {

1869

rlimit = cur_diag_stop;

1870

hrd->SetQueryStop(hitrefs[k]->GetQueryStop());

1871

hrd->SetSubjStop(hitrefs[k]->GetSubjStop());

1872

hrd->SetMismatches(hrd->GetMismatches()

1873

+ hitrefs[k]->GetMismatches());

1875

- hrd->GetQueryStart() + 1);

1876

hrd->SetLength(

len

);

1877

hrd->SetIdentity(

double

(

len

- hrd->GetMismatches()) /

len

);

1878

hrd->SetScore(2 *

len

);

1880  else if

(rlimit + 1 < cur_diag_start) {

1882

hitrefs[++d] = hitrefs[k];

1883

rlimit = cur_diag_stop;

1887  "x_CompartPair(): Unexpected alignment overlap"

);

1891

hitrefs.resize(

min

(

size_t

(d + 1), kn));

1893 #undef EXTEND_USING_SEQUENCE_CHARS 1899  if

((*ii)->GetQueryStop() >= qmax) {

1900

(*ii)->Modify(1, qmax - 1);

1929  const THit

& h (**ii);

1951

ca.

Run

(hitrefs.begin(), hitrefs.end());

1958  const THit

& h (**ii);

1970

pair_results->Set().begin(),

1971

pair_results->Set().end());

1983  CNcbiIfstream

ifstr_genomic (filename_genomic.c_str(), IOS_BASE::binary);

1985

elems_genomic * sizeof (

SSeqInfo

));

1986

ifstr_genomic.close();

1991  CNcbiIfstream

ifstr_cdna (filename_cdna.c_str(), IOS_BASE::binary);

2024

rv = (rv * 3 + (*ii)) % 3571;

2027  return t

- 5000 + rv;

2084

vol_exts.push_back(kFileExt_Offsets);

2085

vol_exts.push_back(kFileExt_Positions);

2106

vol_exts.push_back(kFileExt_Offsets);

2107

vol_exts.push_back(kFileExt_Positions);

2108

vol_exts.push_back(kFileExt_Masked);

2109

vol_exts.push_back(kFileExt_Remap);

ncbi::TMaskedQueryRegions mask

const TCoord * GetBox(void) const

void SetSubjId(const TId &id)

void SetBox(const TCoord box[4])

void SetQueryId(const TId &id)

static CSafeStatic< vector< CSeq_id_Handle > > s_ids

TCoord GetLength(void) const

void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)

Execute: identify compartments.

void SetMaxIntron(TCoord mi)

Assign the maximum intron length, in base pairs.

bool GetFirst(THitRefs &compartment)

Initialize iteration over the results.

CRef< objects::CSeq_align_set > AsSeqAlignSet(void) const

Retrieve all valid compartments in a seq-align-set.

bool GetNext(THitRefs &compartment)

Proceed with iteration over the results.

static TCoord s_GetDefaultMaxIntron(void)

Retrieve the default maximum length of an intron.

void reset_at(const Uint8 idx)

void set_at(const Uint8 idx)

bool get_at(const Uint8 idx) const

void Init(Uint8 dim_bits, bool init_value)

const Uint8 * GetBuffer(void) const

const char * m_CurSeq_cDNA

double m_MinCompartmentIdty

vector< SSeqInfo > TSeqInfos

void x_LoadRemapData(ISequenceSource *m_qsrc, const string &sdb)

void x_InitParticipationVector(bool strand)

void x_CleanVolumes(const string &lbn, const TStrings &vol_extensions)

void x_Search(bool strand)

vector< THitRef > THitRefs

void x_CreateRemapData(const string &db, EIndexMode mode)

vector< string > TStrings

TSeqInfos m_SeqInfos_Genomic

TSeqInfos::const_iterator m_ii_cdna

size_t x_WriteIndexFile(size_t volume, EIndexMode mode, bool strand, vector< Uint8 > &MersAndCoords)

static bool s_IsLowComplexity(size_t key)

double m_MinSingletonIdty

Int8 x_ExtendHit(const Int8 &left_limit, const Int8 &right_limit, THitRef hitref)

TSeqInfos m_SeqInfos_cdna

bool x_IsMatch(Uint8 q, Uint8 s) const

vector< SHitIndexEntry > THitIndex

const char * m_CurSeq_Genomic

void x_CreateIndex(const string &db, EIndexMode index_more, bool strand)

void x_CompartPair(vector< Uint8 > *pvol, TSeqInfos::const_iterator ii_cdna, TSeqInfos::const_iterator ii_genomic, size_t idx_start, size_t idx_stop, size_t *pidx_compacted)

void x_InitFilteringVector(const string &sdb, bool strand)

void x_CompartVolume(vector< Uint8 > *phits)

TSeqInfos::const_iterator m_ii_genomic

CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:

CReverseAndComplement(void)

Uint8 GetTotalLength() const

Returns the sum of the lengths of all available sequences.

list< CRef< CSeq_id > > GetSeqIDs(int oid) const

Gets a list of sequence identifiers.

int GetSeqLength(int oid) const

Returns the sequence length in base pairs or residues.

void RetSequence(const char **buffer) const

Returns any resources associated with the sequence.

int GetNumSeqs() const

Returns the number of sequences available.

int GetSequence(int oid, const char **buffer) const

Get a pointer to raw sequence data.

bool CheckOrFindOID(int &next_oid) const

Find an included OID, incrementing next_oid if necessary.

virtual CConstRef< CSeq_id > GetSeqID(int idx)

virtual Uint8 GetTotalLength(void)

virtual int GetSeqLength(int idx)

virtual bool GetNext(void)

virtual int GetNumSeqs(void)

virtual void ResetIndex(void)

virtual int GetSeq(const char **buffer)

virtual int GetCurrentIndex()

virtual void RetSequence(const char **)

void g_RestoreFromTemp(const string &filename, VectorT *pvd)

string GetLocalBaseName(const string &extended_name, const string &sfx)

#define QCOMP_COUNT_NrMERS

bool PLoWord(const Uint8 &lhs, const Uint8 &rhs)

void CheckWrittenFile(const string &filename, const Uint8 &len_bytes)

char DecodeSeqDbChar(Uint1 c)

CRandom::TValue GenerateSeed(const string &str)

string ReplaceExt(const string &extended_name, const string &new_ext)

string g_SaveToTemp(const VectorT &v, const string &path)

#define QCOMP_CREATE_GENOMIC_IDX(w8, gccur)

T ReverseAndComplement(T v)

#define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX

bool PDiag(const Uint8 &lhs, const Uint8 &rhs)

static const char * str(char *buf, int n)

#define ITERATE(Type, Var, Cont)

ITERATE macro to sequence through container elements.

#define NON_CONST_ITERATE(Type, Var, Cont)

Non constant version of ITERATE macro.

#define NCBI_THROW(exception_class, err_code, message)

Generic macro to throw an exception, given the exception class, error code and message string.

TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const

Get directory entries based on the specified "mask".

static string GetTmpNameEx(const string &dir=kEmptyStr, const string &prefix=kEmptyStr, ETmpFileCreationMode mode=eTmpFileGetName)

Get temporary file name.

void * Map(TOffsetType offset=0, size_t length=0)

Map file.

Int8 GetLength(void) const

Get size of file.

virtual bool Remove(TRemoveFlags flags=eRecursive) const

Remove a directory entry.

bool Unmap(void)

Unmap file if mapped.

static char GetPathSeparator(void)

Get path separator symbol specific for the current platform.

void Add(const string &path)

Add a path for later deletion.

static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)

Split a path string into its basic components.

@ eCreate

Create new file or rewrite existent with zeros.

@ eMMS_Shared

Changes are shared.

@ eMMP_Write

Data can be written.

string GetSeqIdString(bool with_version=false) const

Return seqid string with optional version for text seqid type.

TSeqPos GetLength(const CSeq_id &id, CScope *scope)

Get sequence length if scope not null, else return max possible TSeqPos.

void Reset(void)

Reset reference object.

uint8_t Uint1

1-byte (8-bit) unsigned integer

int32_t Int4

4-byte (32-bit) signed integer

uint32_t Uint4

4-byte (32-bit) unsigned integer

int64_t Int8

8-byte (64-bit) signed integer

uint64_t Uint8

8-byte (64-bit) unsigned integer

Uint4 TValue

Type of the generated integer value and/or the seed value.

TValue GetRand(void)

Get the next random number in the interval [0..GetMax()] (inclusive)

#define END_NCBI_SCOPE

End previously defined NCBI scope.

#define BEGIN_NCBI_SCOPE

Define ncbi namespace.

IO_PREFIX::ofstream CNcbiOfstream

Portable alias for ofstream.

IO_PREFIX::ifstream CNcbiIfstream

Portable alias for ifstream.

static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)

Convert numeric value to string.

constexpr auto sort(_Init &&init)

double value_type

The numeric datatype used by the parser.

const struct ncbi::grid::netcache::search::fields::KEY key

void SleepSec(unsigned long sec, EInterruptOnSignal onsignal=eRestartOnSignal)

Sleep.

Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...

void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)

Defines BLAST database access classes.

const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4