kNrMersTotal (1 << (kNr * 2));
59 const doublekRepeatsPercentile (0.995);
68 const char*kFileExt_Masked =
".rep";
71 const char*kFileExt_Remap =
".idc";
72 const char*kFileExt_Offsets =
".ofs";
73 const char*kFileExt_Positions =
".pos";
75 const Uint8kUI8_LoWord (0xFFFFFFFF);
76 const Uint8kUI8_LoByte (kUI8_LoWord >> 24);
77 const Uint8kUI8_HiWord (kUI8_LoWord << 32);
78 const Uint8kUI8_LoFive (kUI8_LoWord | (kUI8_LoWord << 8));
80 const Uint8kUI8_LoHalfWordEachByte ((
Uint8(0x0F0F0F0F) << 32) | 0x0F0F0F0F);
82 const Uint4kUI4_Lo28 (0xFFFFFFFF >> 4);
84 const Uint8kUI8_SeqDb_lo (0x03030303);
85 const Uint8kUI8_SeqDb ((kUI8_SeqDb_lo << 32) | kUI8_SeqDb_lo);
89 const size_tkMapGran (512 * 1024 * 1024);
97 case0: rv =
'A';
break;
98 case1: rv =
'C';
break;
99 case2: rv =
'G';
break;
100 case3: rv =
'T';
break;
106 #ifdef ALGO_ALIGN_SPLIGN_QCOMP_DEBUG 108 template<
typenameT>
109 stringGetBinString(
Tv)
113 const size_tbitdim (
sizeof(
T) * 8);
114 for(
size_t i(0);
i< bitdim; ++
i) {
120 for(
size_t i(bitdim);
i> 0; --
i) {
121 if(
i< bitdim &&
i% 8 == 0) {
135 Int8reported_len (-1);
136 for(
size_tattempt(0); attempt < 1; ++attempt) {
138 if(reported_len >= 0 &&
Uint8(reported_len) == len_bytes) {
148 if(reported_len < 0) {
149ostr <<
"Cannot write "<< filename
150<<
" (error code = "<< reported_len <<
"). ";
153ostr <<
"The size of "<< filename <<
" ("<< reported_len <<
')' 154<<
" is different from the expected "<< len_bytes <<
". ";
156ostr <<
"Please make sure there is enough disk space.";
166 template<
typenameT>
170 for(
size_t i(0), imax (4*
sizeof(
T)); i < imax; ++i, v >>= 2) {
171rv = (rv << 2) | (v & 3);
182 template<
typenameT>
191 for(
Uint1 i(1);
i< 0xFF; ++
i) {
202 for(
size_t i(0), imax(
sizeof(
T));
i< imax; ++
i) {
225 const Uint4kMaxTwoBaseContent (14);
227vector<Uint4> counts(4, 0);
229 for(
Uint4k = 0; k < 14; ++k) {
230++counts[0x00000003 &
key];
234 const Uint4ag (counts[0] + counts[1]);
235 const Uint4at (counts[0] + counts[2]);
236 const Uint4ac (counts[0] + counts[3]);
237 const Uint4 gt(counts[1] + counts[2]);
238 const Uint4gc (counts[1] + counts[3]);
239 const Uint4tc (counts[2] + counts[3]);
242ag >= kMaxTwoBaseContent || at >= kMaxTwoBaseContent ||
243ac >= kMaxTwoBaseContent ||
gt>= kMaxTwoBaseContent ||
244gc >= kMaxTwoBaseContent || tc >= kMaxTwoBaseContent;
250 stringdir, base, ext;
260 string ReplaceExt(
const string& extended_name,
const string& new_ext)
262 stringdir, base, ext;
277 template<
typenameVectorT>
283 const Uint8len_bytes (v.size() *
sizeof(TElem));
286 CNcbiOfstreamtempcgrfile (filename.c_str(), IOS_BASE::binary);
287tempcgrfile.write((
const char* ) & v.front(), len_bytes);
297 template<
typenameVectorT>
306 CNcbiIfstreamtempcgrfile (filename.c_str(), IOS_BASE::binary);
307tempcgrfile.read((
char* ) & v.front(), dim *
sizeof(TElem));
320 const stringsfx (
string(strand?
".p":
".m") +
".v*");
321 const stringmask_ofs_q (
m_lbn_q+ sfx + kFileExt_Offsets);
325 const stringfilename ((*ii)->GetPath());
329 for(
const Uint8* p8 (
reinterpret_cast<const Uint8*
>(mf.
Map())),
330* p8e (p8 + vol_length / 8); p8 != p8e;
351ostr <<
"Sequence volumes with total length exceeding " 353<<
" are not yet supported. Please split your FASTA file and re-run " 359 Uint4current_offset (0);
361unique_ptr<vector<Uint4> > pNrCounts (
newvector<Uint4> (kNrMersTotal, 0));
362vector<Uint4> & NrCounts (*pNrCounts);
364cerr <<
" Scanning "<< subjdb->
GetNumSeqs() <<
" genomic sequences ... ";
367 const char* pcb (0);
369 const char* pcbe (pcb + bases / 4);
372 if(npcb < npcb0) npcb += 8;
373 const Uint8gcbase (4*(npcb - npcb0));
374 const Uint8* pui8 (
reinterpret_cast<const Uint8*
>(npcb));
375 const Uint8* pui8_e (
reinterpret_cast<const Uint8*
>(pcbe));
377 for(
size_tgccur (current_offset + gcbase); pui8 < pui8_e; ++pui8) {
383 #define QCOMP_COUNT_NrMERS {{ \ 384 if(gccur + 16 >= current_offset + bases) { \ 387 const Uint4 mer4 (ui8 & kUI8_LoWord); \ 388 ++NrCounts[mer4 >> 4]; \ 398 if(pui8 + 1 < pui8_e) {
400ui8 |= (*(pui8 + 1) << 32);
408 #undef QCOMP_COUNT_NMERS 413current_offset += bases;
418cerr <<
"Ok"<< endl;
419cerr <<
" Constructing FV ... ";
421 stringfilename_temp_01;
422unique_ptr<vector<Uint4> > pNrCounts2 (
newvector<Uint4>);
423vector<Uint4> & NrCounts2 (* pNrCounts2);
425NrCounts2 = NrCounts;
433 size_ttotal_mers (0);
434 ITERATE(vector<Uint4>, ii, NrCounts) {
435 if(*ii > 0) ++total_mers;
437 const size_tpercentile ((
size_t)(kNrMersTotal -
438total_mers * (1 - kRepeatsPercentile)));
439nth_element(NrCounts.begin(), NrCounts.begin() + percentile, NrCounts.end());
440 const Uint4max_repeat_count (NrCounts[percentile]);
442 if(filename_temp_01.empty()) {
443NrCounts = NrCounts2;
451 for(
size_t i(0);
i< kNrMersTotal; ++
i) {
461 const Uint8len_bytes (kNrMersTotal / 8);
470 copy(src, src + kNrMersTotal / 64, dest);
477cerr <<
" Reading/transforming FV ... ";
489 for(
size_t i(0);
i< kNrMersTotal; ++
i) {
490 if(nrmers_plus.
get_at(
i)) {
492 const size_tirc (g_RC(
Uint4(
i) << 4) & kUI4_Lo28);
498cerr <<
"Ok"<< endl;
509 const size_tnum_seqs (blastdb.
GetNumSeqs());
510seq_infos.reserve(num_seqs);
512 Uint4current_offset (0);
516 if(seqlen <= 0 ||
size_t(seqlen) >= 0xFFFFFFFF) {
518ostr <<
"Cannot create remap data for:\t"<<
519blastdb.
GetSeqIDs(oid).front()->GetSeqIdString(
true);
524 const Uint4bases (seqlen);
525seq_infos.push_back(
SSeqInfo(current_offset, bases, oid));
526current_offset += bases;
534 CNcbiOfstreamofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
535 const Uint8len_bytes (seq_infos.size() *
sizeof(
SSeqInfo));
536ofstr_remap.write((
const char*) &seq_infos.front(), len_bytes);
541cerr <<
" Remap data created for "<< db <<
"; max offset = " 542<< current_offset << endl;
551seq_infos.reserve(num_seqs);
553 Uint4current_offset (0);
557 if(seqlen <= 0 ||
size_t(seqlen) >= 0xFFFFFFFF) {
559ostr <<
"Cannot create remap data for:\t"<<
565 const Uint4bases (seqlen);
567current_offset += bases;
575 CNcbiOfstreamofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
576 const Uint8len_bytes (seq_infos.size() *
sizeof(
SSeqInfo));
577ofstr_remap.write((
const char*) &seq_infos.front(), len_bytes);
582cerr <<
" Remap data created for sequences; max offset = " 583<< current_offset << endl;
592cerr <<
" Scanning "<< db <<
" for N-mers and their positions."<< endl;
599vector<Uint8> MersAndCoords (mcidx_max, 0);
601 size_tcurrent_offset (0);
610ostr <<
"Sequence volumes with total length exceeding " 612<<
" are not yet supported. Please split your FASTA file and re-run " 621 const char* pcb (0);
623 const char* pce (pcb + bases/4);
626 if(npcb < npcb0) npcb += 8;
627 const Uint8gcbase (4*(npcb - npcb0));
628 const Uint8* pui8_start (
reinterpret_cast<const Uint8*
>(npcb));
629 const Uint8* pui8 (pui8_start);
637 if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
638MersAndCoords.resize(mcidx);
640MersAndCoords.assign(mcidx_max, 0);
647 for(
size_tgccur (current_offset + gcbase);
648gccur + 16 < current_offset + bases && mcidx < mcidx_max;
653 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \ 654 size_t gccur2 (gccur + 2); \ 655 const Uint8 ui8_2930 (w8 >> 60); \ 656 Uint8 ui8_ls4 (w8 << 4); \ 657 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \ 658 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \ 659 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48); 663 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \ 665 if(gccur + 16 >= current_offset + bases) { \ 668 const Uint8 mer (w8 & kUI8_LoWord); \ 670 if(m_Mers.get_at(mer)) { \ 671 MersAndCoords[mcidx++] = (mer << 32) | gccur; \ 675 const Uint4 rc (g_RC(Uint4(mer))); \ 676 if(m_Mers.get_at(rc)) { \ 677 MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \ 678 ((current_offset + bases - gccur - 16) \ 696 if(gccur + 32 >= current_offset + bases) {
701w8 |= ((*(pui8 + 1)) << 32);
715 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX 716 #undef QCOMP_CREATE_GENOMIC_IDX 725 for(
const char* p (pcb),
726*pche ( (
reinterpret_cast<const char*
>(pui8)) + 5 );
729 const Uint8c8 (*p & kUI8_LoByte);
730 const Uint8ui8curbyte (c8);
732fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
736 for(
Uint4k(0); k < 4; ++k) {
738 const Uint4mer (fivebytes & kUI8_LoWord);
739 if(
m_Mers.
get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
740 const Uint8ui8 (mer);
741 const Uint8coord (current_offset +
7424*(p - pcb - 5) + k);
743MersAndCoords[mcidx++] = (ui8 << 32) | coord;
746fivebytes &= kUI8_LoFive;
748 const Uint8ui8m ((fivebytes & kUI8_SeqDb) >> 16);
749fivebytes &= ~kUI8_SeqDb;
752fivebytes |= ui8curbyte << 32;
758 Uint8gccur (current_offset + gcbase);
759 for(; gccur + 32 < current_offset + bases; ++pui8)
763 for(
Uint8gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
765 const Uint8loword = ui8 & kUI8_LoWord;
767(loword & kUI4_Lo28)))
769MersAndCoords[mcidx++] = (loword << 32) | gccur;
772 const Uint8ui8hi2 ((ui8 >> 62) << 48);
774 const Uint8ui8m ((ui8 & kUI8_SeqDb) >> 16);
776ui8 |= (ui8m | ui8hi2);
779 if(gccur + 32 < current_offset + bases) {
783 const Uint4* pui4 (
reinterpret_cast<const Uint4*
>(
n));
786ui8 |= (ui8_4 << 32);
788 for(
Uint8gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
790 const Uint8loword = ui8 & kUI8_LoWord;
792(loword & kUI4_Lo28)))
794MersAndCoords[mcidx++] = (loword << 32) | gccur;
797 const Uint8ui8hi2 ((ui8 >> 62) << 48);
799 const Uint8ui8m ((ui8 & kUI8_SeqDb) >> 16);
801ui8 |= (ui8m | ui8hi2);
807 if(gccur + 16 <= current_offset + bases) {
809fivebytes = (gccur == current_offset + gcbase)? fivebytes:
811 const char* p (
reinterpret_cast<const char*
>(pui8_start) + 4
812+ (gccur - current_offset - gcbase) / 4);
815 const Uint8loword = fivebytes & kUI8_LoWord;
818(loword & kUI4_Lo28)))
820MersAndCoords[mcidx++] = (loword << 32) | gccur;
825 const Uint8ui8 (*p++ & kUI8_LoByte);
826fivebytes |= (ui8 << 32);
833fivebytes &= kUI8_LoFive;
835 const Uint8ui8m ((fivebytes & kUI8_SeqDb) >> 16);
836fivebytes &= ~kUI8_SeqDb;
850current_offset += bases;
852 if(mcidx >= mcidx_max) {
854ostr <<
"Selected max volume size is too small: " 855<<
"it must be large enough to fit the index for the " 856<<
"longest input sequence.";
865MersAndCoords.resize(mcidx);
878cerr <<
" Scanning sequences for N-mers and their positions."<< endl;
885vector<Uint8> MersAndCoords (mcidx_max, 0);
887 size_tcurrent_offset (0);
895ostr <<
"Sequence volumes with total length exceeding " 897<<
" are not yet supported. Please split your FASTA file and re-run " 906 const char* pcb (0);
908 const char* pce (pcb + bases/4);
911 if(npcb < npcb0) npcb += 8;
912 const Uint8gcbase (4*(npcb - npcb0));
913 const Uint8* pui8_start (
reinterpret_cast<const Uint8*
>(npcb));
914 const Uint8* pui8 (pui8_start);
922 if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
923MersAndCoords.resize(mcidx);
925MersAndCoords.assign(mcidx_max, 0);
932 for(
size_tgccur (current_offset + gcbase);
933gccur + 16 < current_offset + bases && mcidx < mcidx_max;
938 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \ 939 size_t gccur2 (gccur + 2); \ 940 const Uint8 ui8_2930 (w8 >> 60); \ 941 Uint8 ui8_ls4 (w8 << 4); \ 942 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \ 943 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \ 944 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48); 948 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \ 950 if(gccur + 16 >= current_offset + bases) { \ 953 const Uint8 mer (w8 & kUI8_LoWord); \ 955 if(m_Mers.get_at(mer)) { \ 956 MersAndCoords[mcidx++] = (mer << 32) | gccur; \ 960 const Uint4 rc (g_RC(Uint4(mer))); \ 961 if(m_Mers.get_at(rc)) { \ 962 MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \ 963 ((current_offset + bases - gccur - 16) \ 981 if(gccur + 32 >= current_offset + bases) {
986w8 |= ((*(pui8 + 1)) << 32);
1000 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX 1001 #undef QCOMP_CREATE_GENOMIC_IDX 1009 Uint8fivebytes (0);
1010 for(
const char* p (pcb),
1011*pche ( (
reinterpret_cast<const char*
>(pui8)) + 5 );
1014 const Uint8c8 (*p & kUI8_LoByte);
1015 const Uint8ui8curbyte (c8);
1017fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
1021 for(
Uint4k(0); k < 4; ++k) {
1023 const Uint4mer (fivebytes & kUI8_LoWord);
1024 if(
m_Mers.
get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
1025 const Uint8ui8 (mer);
1026 const Uint8coord (current_offset +
10274*(p - pcb - 5) + k);
1028MersAndCoords[mcidx++] = (ui8 << 32) | coord;
1031fivebytes &= kUI8_LoFive;
1033 const Uint8ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1034fivebytes &= ~kUI8_SeqDb;
1037fivebytes |= ui8curbyte << 32;
1043 Uint8gccur (current_offset + gcbase);
1044 for(; gccur + 32 < current_offset + bases; ++pui8)
1048 for(
Uint8gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1050 const Uint8loword = ui8 & kUI8_LoWord;
1052(loword & kUI4_Lo28)))
1054MersAndCoords[mcidx++] = (loword << 32) | gccur;
1057 const Uint8ui8hi2 ((ui8 >> 62) << 48);
1059 const Uint8ui8m ((ui8 & kUI8_SeqDb) >> 16);
1061ui8 |= (ui8m | ui8hi2);
1064 if(gccur + 32 < current_offset + bases) {
1068 const Uint4* pui4 (
reinterpret_cast<const Uint4*
>(
n));
1071ui8 |= (ui8_4 << 32);
1073 for(
Uint8gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1075 const Uint8loword = ui8 & kUI8_LoWord;
1077(loword & kUI4_Lo28)))
1079MersAndCoords[mcidx++] = (loword << 32) | gccur;
1082 const Uint8ui8hi2 ((ui8 >> 62) << 48);
1084 const Uint8ui8m ((ui8 & kUI8_SeqDb) >> 16);
1086ui8 |= (ui8m | ui8hi2);
1092 if(gccur + 16 <= current_offset + bases) {
1094fivebytes = (gccur == current_offset + gcbase)? fivebytes:
1095(ui8 & kUI8_LoWord);
1096 const char* p (
reinterpret_cast<const char*
>(pui8_start) + 4
1097+ (gccur - current_offset - gcbase) / 4);
1100 const Uint8loword = fivebytes & kUI8_LoWord;
1103(loword & kUI4_Lo28)))
1105MersAndCoords[mcidx++] = (loword << 32) | gccur;
1110 const Uint8ui8 (*p++ & kUI8_LoByte);
1111fivebytes |= (ui8 << 32);
1118fivebytes &= kUI8_LoFive;
1120 const Uint8ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1121fivebytes &= ~kUI8_SeqDb;
1135current_offset += bases;
1137 if(mcidx >= mcidx_max) {
1139ostr <<
"Selected max volume size is too small: " 1140<<
"it must be large enough to fit the index for the " 1141<<
"longest input sequence.";
1148MersAndCoords.resize(mcidx);
1161vector<Uint8>& MersAndCoords)
1163 const size_tt0 (time(0));
1175 CNcbiOfstreamofstr_offs (filename_offs.c_str(), IOS_BASE::binary);
1176 Uint8bytes_offs (0);
1179 CNcbiOfstreamofstr_positions (filename_positions.c_str(), IOS_BASE::binary);
1180 Uint8bytes_positions (0);
1182cerr <<
" Generating index volume: "<<
basename<<
" ... ";
1187 sort(MersAndCoords.begin(), MersAndCoords.end());
1189 ITERATE(vector<Uint8>, ii, MersAndCoords) {
1191 const Uint4mer ((*ii & kUI8_HiWord) >> 32);
1192 if(mer != curmer) {
1194ofstr_offs.write((
const char*) &mer,
sizeofmer);
1195ofstr_offs.write((
const char*) &curofs,
sizeofcurofs);
1197bytes_offs +=
sizeof(mer) +
sizeof(curofs);
1199 const Uint4pos (*ii & kUI8_LoWord);
1201ofstr_positions.write((
const char*) &pos,
sizeofpos);
1202bytes_positions +=
sizeof(pos);
1207 const Uint4mer (0);
1208ofstr_offs.write((
const char*)&mer,
sizeofmer);
1209ofstr_offs.write((
const char*)&curofs,
sizeofcurofs);
1210bytes_offs +=
sizeof(mer) +
sizeof(curofs);
1213ofstr_positions.close();
1218cerr <<
"Ok"<< endl;
1220 returntime(0) - t0;
1223 #define CHECK_MEMMAP(mm,ofs,len) \ 1225 const size_t ofs1 (mm.GetOffset()); \ 1227 cerr << "Real offset "<< ofs1 << " different from " << ofs << endl; \
1229const size_t len1 (mm.GetSize()); \
1231cerr << "Real length " << len1 << " different from " << len << endl; \
1240cerr <<
" Matching (strand = "<< (strand?
"plus":
"minus") <<
") ... ";
1245 const stringsfx (
string(strand?
".p":
".m") +
".v*");
1247 const stringmask_ofs_q (
m_lbn_q+ sfx + kFileExt_Offsets);
1248 const stringmask_ofs_s (
m_lbn_s+ sfx + kFileExt_Offsets);
1252 Uint8elem_hits_total (0);
1256 const stringfilename_ofs_s ((*ii_s)->GetPath());
1257 const stringfilename_pos_s (
ReplaceExt(filename_ofs_s, kFileExt_Positions));
1258 const CFilefile_subj (filename_ofs_s);
1259 const size_tdim_ofs_s (file_subj.
GetLength() / 8);
1263 const stringfilename_ofs_q ((*ii_q)->GetPath());
1264 const stringfilename_pos_q (
ReplaceExt(filename_ofs_q,
1265kFileExt_Positions));
1267 Uint8hit_index_dim (0), elem_hits_this_pair (0);
1268 stringfilename_hit_index;
1274 const Uint8* ofs_s (
reinterpret_cast<const Uint8*
> 1275(mf_ofs_s.
Map()));
1277 const CFilefile_query ((*ii_q)->GetPath());
1278 const size_tdim_ofs_q (file_query.
GetLength() / 8);
1281 const Uint8* ofs_q (
reinterpret_cast<const Uint8*
> 1282(mf_ofs_q.
Map()));
1286hit_index.reserve(
min(dim_ofs_q, dim_ofs_s));
1288 while((*ofs_q & kUI8_LoWord) && (*ofs_s & kUI8_LoWord)) {
1290 while( (*ofs_q & kUI8_LoWord)
1291&& ((*ofs_q & kUI8_LoWord) < (*ofs_s & kUI8_LoWord)))
1295 if((*ofs_q & kUI8_LoWord) == 0)
break;
1297 while( (*ofs_s & kUI8_LoWord)
1298&& ((*ofs_s & kUI8_LoWord) < (*ofs_q & kUI8_LoWord)))
1302 if((*ofs_s & kUI8_LoWord) == 0)
break;
1304 if((*ofs_s & kUI8_LoWord) == (*ofs_q & kUI8_LoWord)) {
1326hit_index_dim = (hit_index.size());
1327elem_hits_total += elem_hits_this_pair;
1332 const CFilefile_pos_s (filename_pos_s);
1333 const size_tdim_pos_s (file_pos_s.
GetLength() / 4);
1336 size_tmap_offset_s (0);
1337 size_tmap_length_s (
min(kMapGran, 4*dim_pos_s - map_offset_s));
1338 const Uint4* pos_s (
reinterpret_cast<const Uint4*
>(
1339mf_pos_s.
Map(map_offset_s, map_length_s)));
1341 const CFilefile_pos_q (filename_pos_q);
1342 const size_tdim_pos_q (file_pos_q.
GetLength() / 4);
1345 size_tmap_offset_q (0);
1346 size_tmap_length_q (
min(kMapGran, 4*dim_pos_q - map_offset_q));
1347 const Uint4* pos_q (
reinterpret_cast<const Uint4*
>(
1348mf_pos_q.
Map(map_offset_q, map_length_q)));
1351vector<Uint8> hits (elem_hits_this_pair);
1354 CNcbiIfstreamifstr (filename_hit_index.c_str(), IOS_BASE::binary);
1355 for(
size_t cnt(0);
cnt< hit_index_dim; ++
cnt) {
1358ifstr.read((
char*) &hie,
sizeof(hie));
1362 if(idx_s_max > dim_pos_s || idx_q_max > dim_pos_q) {
1366 if(4*idx_s_max >= map_offset_s + map_length_s) {
1369map_length_s =
min(kMapGran, 4*dim_pos_s - map_offset_s);
1371pos_s = (
reinterpret_cast<const Uint4*
> 1372(mf_pos_s.
Map(map_offset_s, map_length_s)));
1375 if(4*idx_q_max >= map_offset_q + map_length_q) {
1378map_length_q =
min(kMapGran, 4*dim_pos_q - map_offset_q);
1380pos_q = (
reinterpret_cast<const Uint4*
> 1381(mf_pos_q.
Map(map_offset_q, map_length_q)));
1384 for(
size_tidx_s (hie.
m_SubjOfs); idx_s < idx_s_max; ++idx_s) {
1386 Uint8hiword = pos_s[idx_s - map_offset_s/4];
1388 for(
size_tidx_q (hie.
m_QueryOfs); idx_q < idx_q_max; ++idx_q) {
1390 const Uint8loword = pos_q[idx_q - map_offset_q/4];
1391hits[hidx++] = (hiword | loword);
1396 if(hidx != elem_hits_this_pair) {
1398ostr <<
"The number of hits found ("<< hidx
1399<<
") does not match the expected "<< elem_hits_this_pair;
1414cerr <<
"Ok"<< endl;
1422 return(lhs & kUI8_LoWord) < (rhs & kUI8_LoWord);
1428 const doublelhs_q (
double(lhs & kUI8_LoWord) / 2);
1429 const doublelhs_s (
double(lhs >> 32) / 2);
1430 const doublerhs_q (
double(rhs & kUI8_LoWord) / 2);
1431 const doublerhs_s (
double(rhs >> 32) / 2);
1433 const doublelhs_q1 (lhs_q + lhs_s);
1434 const doublelhs_s1 (lhs_s - lhs_q);
1435 const doublerhs_q1 (rhs_q + rhs_s);
1436 const doublerhs_s1 (rhs_s - rhs_q);
1438 if(lhs_s1 == rhs_s1) {
1439 returnlhs_q1 < rhs_q1;
1442 returnlhs_s1 < rhs_s1;
1459 const string mask(lbn +
"*"+ *ii);
1462fdl.
Add((*jj)->GetPath());
1470 if(phits->size() == 0) {
1475 sort(phits->begin(), phits->end());
1479TSeqInfos::const_iterator ii_genomic (ii_genomic_b);
1487 size_tidx_compacted (0);
1488 for(
size_tidx (0), idx_hi (phits->size()); idx < idx_hi; ) {
1490 Uint4gc_s (((*phits)[idx]) >> 32);
1493ii_genomic = lower_bound(ii_genomic + 1, ii_genomic_e,
SSeqInfo(gc_s, 0, 0));
1494 if(ii_genomic == ii_genomic_e || ii_genomic->m_Start > gc_s) {
1498 if(gc_s < ii_genomic->m_Start ||
1499ii_genomic->m_Start + ii_genomic->m_Length <= gc_s)
1502ostr <<
"Global genomic coordinate " 1503<< gc_s <<
" out of range: [" 1504<< ii_genomic->m_Start <<
", " 1505<< (ii_genomic->m_Start + ii_genomic->m_Length) <<
"), " 1511 const Uint4gc_s_max (ii_genomic->m_Start + ii_genomic->m_Length);
1518 while(idx < idx_hi && (((*phits)[idx]) >> 32) < gc_s_max) {
1523 sort(phits->begin() + idx0, phits->begin() + idx,
PLoWord);
1526TSeqInfos::const_iterator ii_cdna (ii_cdna_b);
1527 Uint4gc_q_min (ii_cdna->m_Start);
1528 Uint4gc_q_max (ii_cdna->m_Start + ii_cdna->m_Length);
1529 size_tjdx (idx0), jdx0 (jdx);
1530 for(; jdx < idx; ++jdx) {
1532 Uint4gc_q (((*phits)[jdx]) & kUI8_LoWord);
1534 if(gc_q < gc_q_min || gc_q >= gc_q_max) {
1544 if(
kN* (jdx - jdx0) >= min_matches / 2) {
1549jdx0, jdx, &idx_compacted);
1554ii_cdna = lower_bound(ii_cdna + 1, ii_cdna_e,
SSeqInfo(gc_q, 0, 0));
1556 if(ii_cdna == ii_cdna_e || ii_cdna->m_Start > gc_q) {
1560 if(gc_q < ii_cdna->m_Start ||
1561ii_cdna->m_Start + ii_cdna->m_Length <= gc_q)
1564ostr <<
"Global cDNA coordinate " 1565<< gc_q <<
" out of range: [" 1566<< ii_cdna->m_Start <<
", " 1567<< (ii_cdna->m_Start + ii_cdna->m_Length) <<
"), " 1568<< (
m_cDNASeqIds[ii_cdna->m_Oid]->GetSeqIdString(
true));
1573gc_q_min = ii_cdna->m_Start;
1574gc_q_max = ii_cdna->m_Start + ii_cdna->m_Length;
1587 if(
kN* (jdx - jdx0) >= min_matches / 2) {
1592jdx0, jdx, &idx_compacted);
1613 const Int8& right_limit0,
1616 const intWm (1), Wms (-2);
1617 intscore (
int(hitref->GetLength()) * Wm
1618+
int(hitref->GetMismatches()) * (Wms - Wm));
1619 intscore_max (score);
1621 const intoverrun (6);
1622 const Int8left_limit (left_limit0 >= overrun? left_limit0 - overrun: 0);
1623 const Int8right_limit (right_limit0 >= kDiagMax - overrun?
1624kDiagMax: right_limit0 + overrun);
1627 Int4q0 (hitref->GetQueryStart()), s0 (hitref->GetSubjStart());
1628 Uint4mm (0), mm0 (0);
1629 boolno_overrun_yet (
true);
1630 for(
Int4q (q0 - 1), s (s0 - 1);
1631(q + s > left_limit && score +
m_XDropOff>= score_max
1636 if(q + s == left_limit0) {
1637no_overrun_yet =
false;
1650 if(score > score_max) {
1654 if(no_overrun_yet) {
1666 while(q0 + s0 <= left_limit0) {++q0; ++s0;}
1668 boolextended_left (
false);
1669 if(q0 < hitref->GetQueryStart()) {
1670hitref->SetQueryStart(q0);
1671hitref->SetSubjStart(s0);
1672extended_left =
true;
1676q0 = (hitref->GetQueryStop());
1677s0 = (hitref->GetSubjStop());
1681no_overrun_yet =
true;
1683 for(
Int4q (q0 + 1), s (s0 + 1);
1684(q + s < right_limit && score + m_XDropOff >= score_max
1689 if(q + s == right_limit0) {
1690no_overrun_yet =
false;
1703 if(score > score_max) {
1707 if(no_overrun_yet) {
1719 while(q0 + s0 >= right_limit0) {--q0; --s0; }
1721 boolextended_right (
false);
1722 if(q0 > hitref->GetQueryStop()) {
1723hitref->SetQueryStop(q0);
1724hitref->SetSubjStop(s0);
1725extended_right =
true;
1728 if(extended_left || extended_right) {
1730hitref->SetMismatches(mm0);
1731 const THit::TCoord len(hitref->GetQueryStop() - hitref->GetQueryStart() + 1);
1732hitref->SetLength(
len);
1733hitref->SetIdentity((
len- mm0) /
double(
len));
1734hitref->SetScore(2 *
len);
1742TSeqInfos::const_iterator ii_cdna,
1743TSeqInfos::const_iterator ii_genomic,
1746 size_t* pidx_compacted)
1748 if(idx_start >= idx_stop) {
1757 sort(pvol->begin() + idx_start, pvol->begin() + idx_stop,
PDiag);
1759 const size_tidx_compacted_start (*pidx_compacted);
1762lens.reserve(pvol->size() / 2);
1764 Uint8qs0 ((*pvol)[idx_start]);
1765 Uint4q0 (qs0 & kUI8_LoWord);
1766 Uint4s0 (qs0 >> 32);
1767(*pvol)[(*pidx_compacted)++] = qs0;
1770 for(
size_tidx (idx_start + 1); idx < idx_stop; ++idx) {
1772 const Uint8qs ((*pvol)[idx]);
1773 const Uint4q (qs & kUI8_LoWord);
1774 const Uint4s (qs >> 32);
1776 if((q0 >= s0 && q0 - s0 == q - s && q <= q0 + 16) ||
1777(q0 < s0 && s0 - q0 == s - q && q <= q0 + 16) )
1779lens.back() += q - q0;
1782(*pvol)[(*pidx_compacted)++] = qs;
1792 for(
size_tidx (idx_compacted_start); idx < *pidx_compacted; ++idx) {
1794 const Uint8qs ((*pvol)[idx]);
1795 const Uint4q (qs & kUI8_LoWord);
1796 const Uint4s (qs >> 32);
1799hitref->SetQueryStart(q);
1800hitref->SetSubjStart(s);
1801 const Uint4 len(lens[idx - idx_compacted_start]);
1802hitref->SetQueryStop(q +
len- 1);
1803hitref->SetSubjStop(s +
len- 1);
1804hitref->SetMismatches(0);
1806hitref->SetEValue(0);
1807hitref->SetIdentity(1.0);
1808hitref->SetLength(
len);
1809hitref->SetScore(2*
len);
1810hitrefs[idx - idx_compacted_start] = hitref;
1813 #define EXTEND_USING_SEQUENCE_CHARS 1814 #ifdef EXTEND_USING_SEQUENCE_CHARS 1818 Int8left_limit (0);
1819 Int8right_limit (kDiagMax);
1820 Int8s_prime_cur (0);
1822 const size_tkn (hitrefs.size());
1823 for(
size_tk(0); k < kn; ++k) {
1828 if(
Int8(box[2]) -
Int8(box[0]) != s_prime_cur) {
1829left_limit = box[0];
1830s_prime_cur =
Int8(box[2]) -
Int8(box[0]);
1833right_limit = kDiagMax;
1836 if(
Int8(boX[2]) -
Int8(boX[0]) == s_prime_cur) {
1837right_limit = boX[0] + boX[2];
1841left_limit =
x_ExtendHit(left_limit, right_limit, hitrefs[k]);
1846 Int8rlimit (0), spc (0);
1847 for(
size_tk (0); k < kn; ++k) {
1849 const THit::TCoordcur_diag_stop (hitrefs[k]->GetQueryStop()
1850+ hitrefs[k]->GetSubjStop());
1852 const Int8s_prime (-
Int8(hitrefs[k]->GetQueryStart())
1853+
Int8(hitrefs[k]->GetSubjStart()));
1854 if(k == 0 || spc != s_prime) {
1856hitrefs[++d] = hitrefs[k];
1857rlimit = cur_diag_stop;
1862 const THit::TCoordcur_diag_start (hitrefs[k]->GetQueryStart()
1863+ hitrefs[k]->GetSubjStart());
1867 if(rlimit + 2 == cur_diag_start) {
1869rlimit = cur_diag_stop;
1870hrd->SetQueryStop(hitrefs[k]->GetQueryStop());
1871hrd->SetSubjStop(hitrefs[k]->GetSubjStop());
1872hrd->SetMismatches(hrd->GetMismatches()
1873+ hitrefs[k]->GetMismatches());
1875- hrd->GetQueryStart() + 1);
1876hrd->SetLength(
len);
1877hrd->SetIdentity(
double(
len- hrd->GetMismatches()) /
len);
1878hrd->SetScore(2 *
len);
1880 else if(rlimit + 1 < cur_diag_start) {
1882hitrefs[++d] = hitrefs[k];
1883rlimit = cur_diag_stop;
1887 "x_CompartPair(): Unexpected alignment overlap");
1891hitrefs.resize(
min(
size_t(d + 1), kn));
1893 #undef EXTEND_USING_SEQUENCE_CHARS 1899 if((*ii)->GetQueryStop() >= qmax) {
1900(*ii)->Modify(1, qmax - 1);
1929 const THit& h (**ii);
1951ca.
Run(hitrefs.begin(), hitrefs.end());
1958 const THit& h (**ii);
1970pair_results->Set().begin(),
1971pair_results->Set().end());
1983 CNcbiIfstreamifstr_genomic (filename_genomic.c_str(), IOS_BASE::binary);
1985elems_genomic * sizeof (
SSeqInfo));
1986ifstr_genomic.close();
1991 CNcbiIfstreamifstr_cdna (filename_cdna.c_str(), IOS_BASE::binary);
2024rv = (rv * 3 + (*ii)) % 3571;
2027 return t- 5000 + rv;
2084vol_exts.push_back(kFileExt_Offsets);
2085vol_exts.push_back(kFileExt_Positions);
2106vol_exts.push_back(kFileExt_Offsets);
2107vol_exts.push_back(kFileExt_Positions);
2108vol_exts.push_back(kFileExt_Masked);
2109vol_exts.push_back(kFileExt_Remap);
ncbi::TMaskedQueryRegions mask
const TCoord * GetBox(void) const
void SetSubjId(const TId &id)
void SetBox(const TCoord box[4])
void SetQueryId(const TId &id)
static CSafeStatic< vector< CSeq_id_Handle > > s_ids
TCoord GetLength(void) const
void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)
Execute: identify compartments.
void SetMaxIntron(TCoord mi)
Assign the maximum intron length, in base pairs.
bool GetFirst(THitRefs &compartment)
Initialize iteration over the results.
CRef< objects::CSeq_align_set > AsSeqAlignSet(void) const
Retrieve all valid compartments in a seq-align-set.
bool GetNext(THitRefs &compartment)
Proceed with iteration over the results.
static TCoord s_GetDefaultMaxIntron(void)
Retrieve the default maximum length of an intron.
void reset_at(const Uint8 idx)
void set_at(const Uint8 idx)
bool get_at(const Uint8 idx) const
void Init(Uint8 dim_bits, bool init_value)
const Uint8 * GetBuffer(void) const
const char * m_CurSeq_cDNA
double m_MinCompartmentIdty
vector< SSeqInfo > TSeqInfos
void x_LoadRemapData(ISequenceSource *m_qsrc, const string &sdb)
void x_InitParticipationVector(bool strand)
void x_CleanVolumes(const string &lbn, const TStrings &vol_extensions)
void x_Search(bool strand)
vector< THitRef > THitRefs
void x_CreateRemapData(const string &db, EIndexMode mode)
vector< string > TStrings
TSeqInfos m_SeqInfos_Genomic
TSeqInfos::const_iterator m_ii_cdna
size_t x_WriteIndexFile(size_t volume, EIndexMode mode, bool strand, vector< Uint8 > &MersAndCoords)
static bool s_IsLowComplexity(size_t key)
double m_MinSingletonIdty
Int8 x_ExtendHit(const Int8 &left_limit, const Int8 &right_limit, THitRef hitref)
TSeqInfos m_SeqInfos_cdna
bool x_IsMatch(Uint8 q, Uint8 s) const
vector< SHitIndexEntry > THitIndex
const char * m_CurSeq_Genomic
void x_CreateIndex(const string &db, EIndexMode index_more, bool strand)
void x_CompartPair(vector< Uint8 > *pvol, TSeqInfos::const_iterator ii_cdna, TSeqInfos::const_iterator ii_genomic, size_t idx_start, size_t idx_stop, size_t *pidx_compacted)
void x_InitFilteringVector(const string &sdb, bool strand)
void x_CompartVolume(vector< Uint8 > *phits)
TSeqInfos::const_iterator m_ii_genomic
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
CReverseAndComplement(void)
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
int GetNumSeqs() const
Returns the number of sequences available.
int GetSequence(int oid, const char **buffer) const
Get a pointer to raw sequence data.
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
virtual CConstRef< CSeq_id > GetSeqID(int idx)
virtual Uint8 GetTotalLength(void)
virtual int GetSeqLength(int idx)
virtual bool GetNext(void)
virtual int GetNumSeqs(void)
virtual void ResetIndex(void)
virtual int GetSeq(const char **buffer)
virtual int GetCurrentIndex()
virtual void RetSequence(const char **)
void g_RestoreFromTemp(const string &filename, VectorT *pvd)
string GetLocalBaseName(const string &extended_name, const string &sfx)
#define QCOMP_COUNT_NrMERS
bool PLoWord(const Uint8 &lhs, const Uint8 &rhs)
void CheckWrittenFile(const string &filename, const Uint8 &len_bytes)
char DecodeSeqDbChar(Uint1 c)
CRandom::TValue GenerateSeed(const string &str)
string ReplaceExt(const string &extended_name, const string &new_ext)
string g_SaveToTemp(const VectorT &v, const string &path)
#define QCOMP_CREATE_GENOMIC_IDX(w8, gccur)
T ReverseAndComplement(T v)
#define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
bool PDiag(const Uint8 &lhs, const Uint8 &rhs)
static const char * str(char *buf, int n)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
static string GetTmpNameEx(const string &dir=kEmptyStr, const string &prefix=kEmptyStr, ETmpFileCreationMode mode=eTmpFileGetName)
Get temporary file name.
void * Map(TOffsetType offset=0, size_t length=0)
Map file.
Int8 GetLength(void) const
Get size of file.
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
bool Unmap(void)
Unmap file if mapped.
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
void Add(const string &path)
Add a path for later deletion.
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
@ eCreate
Create new file or rewrite existent with zeros.
@ eMMS_Shared
Changes are shared.
@ eMMP_Write
Data can be written.
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
void Reset(void)
Reset reference object.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
Uint4 TValue
Type of the generated integer value and/or the seed value.
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
constexpr auto sort(_Init &&init)
double value_type
The numeric datatype used by the parser.
const struct ncbi::grid::netcache::search::fields::KEY key
void SleepSec(unsigned long sec, EInterruptOnSignal onsignal=eRestartOnSignal)
Sleep.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Defines BLAST database access classes.
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4