// Debug helpers: print a 256-bit vector as unsigned 32-bit / 16-bit lanes
inline
void avx2_print256_u32(const char* prefix, const __m256i & value)
{
    const size_t n = sizeof(__m256i) / sizeof(unsigned);
    unsigned buffer[n];
    _mm256_storeu_si256((__m256i*)buffer, value);
    std::cout << prefix << " [ ";
    for (int i = n-1; 1; --i)
    {
        std::cout << buffer[i] << " ";
        if (i == 0)
            break;
    }
    std::cout << "]" << std::endl;
}

inline
void avx2_print256_u16(const char* prefix, const __m256i & value)
{
    const size_t n = sizeof(__m256i) / sizeof(unsigned short);
    unsigned short buffer[n];
    _mm256_storeu_si256((__m256i*)buffer, value);
    std::cout << prefix << " [ ";
    for (int i = n-1; 1; --i)
    {
        std::cout << buffer[i] << " ";
        if (i == 0)
            break;
    }
    std::cout << "]" << std::endl;
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

// carry-save adder: h = majority(a, b, c), l = a XOR b XOR c
#define BM_CSA256(h, l, a, b, c) \
{ \
    __m256i u = _mm256_xor_si256(a, b); \
    h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
    l = _mm256_xor_si256(u, c); \
}

// nibble-lookup (PSHUFB) population count of one 256-bit vector
#define BM_AVX2_BIT_COUNT(ret, v) \
{ \
    __m256i lo = _mm256_and_si256(v, low_mask); \
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
    __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
    __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
    ret = _mm256_sad_epu8(cnt1, cnt2); \
}

#define BM_AVX2_DECL_LOOKUP1 \
  __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
                                     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
#define BM_AVX2_DECL_LOOKUP2 \
  __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
                                     4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);

#define BM_AVX2_POPCNT_PROLOG \
  BM_AVX2_DECL_LOOKUP1 \
  BM_AVX2_DECL_LOOKUP2 \
  __m256i low_mask = _mm256_set1_epi8(0x0f); \
  __m256i bc;

// Harley-Seal (carry-save adder) population count over [block, block_end)
inline
unsigned avx2_bit_count(const __m256i* BMRESTRICT block,
                        const __m256i* BMRESTRICT block_end)
{
    __m256i cnt = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
    __m256i b, c;
    BM_AVX2_POPCNT_PROLOG

    do
    {
        b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursA, twos, twos, twosA, twosB);
        b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursB, twos, twos, twosA, twosB);
        BM_CSA256(eightsA, fours, fours, foursA, foursB);
        b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursA, twos, twos, twosA, twosB);
        b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursB, twos, twos, twosA, twosB);
        BM_CSA256(eightsB, fours, fours, foursA, foursB);
        BM_CSA256(sixteens, eights, eights, eightsA, eightsB);

        BM_AVX2_BIT_COUNT(bc, sixteens)
        cnt = _mm256_add_epi64(cnt, bc);

        block += 16;
    } while (block < block_end);

    cnt = _mm256_slli_epi64(cnt, 4);
    BM_AVX2_BIT_COUNT(bc, eights)
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
    BM_AVX2_BIT_COUNT(bc, fours)
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
    BM_AVX2_BIT_COUNT(bc, twos)
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
    BM_AVX2_BIT_COUNT(bc, ones)
    cnt = _mm256_add_epi64(cnt, bc);

    const bm::id64_t* cnt64 = (const bm::id64_t*) &cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
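// Usage sketch (illustrative only, names and sizes are assumptions): counting bits in
// one 32-byte aligned bit-block whose size is a multiple of 16 __m256i words, with a
// scalar POPCNT loop as a cross-check.
#if 0
inline unsigned example_block_popcount(const bm::word_t* block, unsigned size_in_words)
{
    unsigned cnt = bm::avx2_bit_count((const __m256i*) block,
                                      (const __m256i*) (block + size_in_words));
    unsigned ref = 0; // scalar reference count
    for (unsigned i = 0; i < size_in_words; i += 2)
        ref += (unsigned) _mm_popcnt_u64(*(const bm::id64_t*)(block + i));
    BM_ASSERT(cnt == ref);
    return cnt;
}
#endif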
    __m256i cnt = _mm256_setzero_si256();
    BM_AVX2_POPCNT_PROLOG

    const __m256i* BMRESTRICT wave_src = (__m256i*) &block[off];

    __m256i m1A, m1B, m1C, m1D;
    m1A = _mm256_load_si256(wave_src);
    m1B = _mm256_load_si256(wave_src+1);
    if (!_mm256_testz_si256(m1A, m1A))
    {
        BM_AVX2_BIT_COUNT(bc, m1A)
        cnt = _mm256_add_epi64(cnt, bc);
    }
    if (!_mm256_testz_si256(m1B, m1B))
    {
        BM_AVX2_BIT_COUNT(bc, m1B)
        cnt = _mm256_add_epi64(cnt, bc);
    }
    m1C = _mm256_load_si256(wave_src+2);
    m1D = _mm256_load_si256(wave_src+3);
    if (!_mm256_testz_si256(m1C, m1C))
    {
        BM_AVX2_BIT_COUNT(bc, m1C)
        cnt = _mm256_add_epi64(cnt, bc);
    }
    if (!_mm256_testz_si256(m1D, m1D))
    {
        BM_AVX2_BIT_COUNT(bc, m1D)
        cnt = _mm256_add_epi64(cnt, bc);
    }

    count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
    __m256i cnt = _mm256_setzero_si256();
    __m256i ymm0, ymm1;
    BM_AVX2_POPCNT_PROLOG
    do
    {
        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);

        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);

        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);

        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);
    } while (block < block_end);

    const bm::id64_t* cnt64 = (const bm::id64_t*) &cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
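// Usage sketch (illustrative): AND-popcount of two aligned bit-blocks via the
// VECT_BITCOUNT_AND mapping defined at the end of this header; bm::set_block_size
// is the block length in 32-bit words.
#if 0
unsigned and_count = VECT_BITCOUNT_AND(block, block + bm::set_block_size, mask_block);
#endif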
    __m256i cnt = _mm256_setzero_si256();
    BM_AVX2_POPCNT_PROLOG
    do
    {
        __m256i tmp0 = _mm256_load_si256(block);
        __m256i tmp1 = _mm256_load_si256(mask_block);

        tmp0 = _mm256_or_si256(tmp0, tmp1);
        BM_AVX2_BIT_COUNT(bc, tmp0)
        cnt = _mm256_add_epi64(cnt, bc);

        ++block; ++mask_block;
    } while (block < block_end);

    const bm::id64_t* cnt64 = (const bm::id64_t*) &cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
    __m256i cnt = _mm256_setzero_si256();
    __m256i mA, mB, mC, mD;
    BM_AVX2_POPCNT_PROLOG
    do
    {
        mA = _mm256_xor_si256(_mm256_load_si256(block+0),
                              _mm256_load_si256(mask_block+0));
        BM_AVX2_BIT_COUNT(bc, mA)
        cnt = _mm256_add_epi64(cnt, bc);

        mB = _mm256_xor_si256(_mm256_load_si256(block+1),
                              _mm256_load_si256(mask_block+1));
        BM_AVX2_BIT_COUNT(bc, mB)
        cnt = _mm256_add_epi64(cnt, bc);

        mC = _mm256_xor_si256(_mm256_load_si256(block+2),
                              _mm256_load_si256(mask_block+2));
        BM_AVX2_BIT_COUNT(bc, mC)
        cnt = _mm256_add_epi64(cnt, bc);

        mD = _mm256_xor_si256(_mm256_load_si256(block+3),
                              _mm256_load_si256(mask_block+3));
        BM_AVX2_BIT_COUNT(bc, mD)
        cnt = _mm256_add_epi64(cnt, bc);

        block += 4; mask_block += 4;
    } while (block < block_end);

    const bm::id64_t* cnt64 = (const bm::id64_t*) &cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
    __m256i cnt = _mm256_setzero_si256();
    BM_AVX2_POPCNT_PROLOG
    do
    {
        __m256i tmp0 = _mm256_load_si256(block);
        __m256i tmp1 = _mm256_load_si256(mask_block);

        tmp0 = _mm256_andnot_si256(tmp1, tmp0); // tmp0 AND NOT tmp1
        BM_AVX2_BIT_COUNT(bc, tmp0)
        cnt = _mm256_add_epi64(cnt, bc);

        ++block; ++mask_block;
    } while (block < block_end);

    const bm::id64_t* cnt64 = (const bm::id64_t*) &cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
    __m256i yM = _mm256_set1_epi32(int(mask));
    do
    {
        _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM));
        _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
        _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
        _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
        dst += 4; src += 4;
    } while (src < src_end);
}

    __m256i yM = _mm256_set1_epi32(int(mask));
    do
    {
        _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM));
        _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
        _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
        _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
        dst += 4; src += 4;
    } while (src < src_end);
}
    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm256_setzero_si256();

    do
    {
        m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
        m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
        m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
        m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

        _mm256_store_si256(dst+0, m1A);
        _mm256_store_si256(dst+1, m1B);
        _mm256_store_si256(dst+2, m1C);
        _mm256_store_si256(dst+3, m1D);

        accA = _mm256_or_si256(accA, m1A);
        accB = _mm256_or_si256(accB, m1B);
        accC = _mm256_or_si256(accC, m1C);
        accD = _mm256_or_si256(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);

    return !_mm256_testz_si256(accA, accA);
}
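// Usage sketch (illustrative): avx2_and_block() ANDs 'src' into 'dst'; the OR-accumulator
// above reports whether any bits survived, so a caller can detect an emptied block.
#if 0
bool any_bits_left =
    bm::avx2_and_block((__m256i*) dst_block, (const __m256i*) src_block);
if (!any_bits_left)
{ /* dst_block became all-zero: candidate for block release */ }
#endif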
546__m256i m1A, m1B, m1C, m1D;
548m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
549m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
550m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
551m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
553_mm256_store_si256(dst+0, m1A);
554_mm256_store_si256(dst+1, m1B);
555_mm256_store_si256(dst+2, m1C);
556_mm256_store_si256(dst+3, m1D);
558m1A = _mm256_or_si256(m1A, m1B);
559m1C = _mm256_or_si256(m1C, m1D);
560m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
577__m256i m1A, m1B, m1C, m1D;
579m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
580m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
581m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
582m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
584_mm256_store_si256(dst+0, m1A);
585_mm256_store_si256(dst+1, m1B);
586_mm256_store_si256(dst+2, m1C);
587_mm256_store_si256(dst+3, m1D);
589m1A = _mm256_or_si256(m1A, m1B);
590m1C = _mm256_or_si256(m1C, m1D);
591m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
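// Digest note (sketch, stripe size assumed): each avx2_*_digest* routine processes one
// digest stripe (4 x __m256i = 1024 bits) and returns true when the resulting stripe is
// all zero, letting the caller clear the corresponding bit of the block digest.
#if 0
bool stripe_gone = bm::avx2_and_digest_2way((__m256i*) dst_stripe,
                                            (const __m256i*) src1_stripe,
                                            (const __m256i*) src2_stripe);
#endif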
    const __m256i maskF = _mm256_set1_epi32(~0u);
610__m256i m1A, m1B, m1C, m1D;
612__m256i mSA, mSB, mSC, mSD;
615mSA = _mm256_load_si256(dst+0);
616mSB = _mm256_load_si256(dst+1);
617mACC1 = _mm256_and_si256(mSA, mSB);
619mSC = _mm256_load_si256(dst+2);
620mSD = _mm256_load_si256(dst+3);
622mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));
624mACC1 = _mm256_xor_si256(mACC1, maskF);
625 if(_mm256_testz_si256(mACC1, mACC1))
629m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
630m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
631m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
632m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
635_mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
    bool all_z = _mm256_testz_si256(mACC1, mACC1);
640m1A = _mm256_or_si256(mSA, m1A);
641m1B = _mm256_or_si256(mSB, m1B);
642m1C = _mm256_or_si256(mSC, m1C);
643m1D = _mm256_or_si256(mSD, m1D);
645_mm256_store_si256(dst+0, m1A);
646_mm256_store_si256(dst+1, m1B);
647_mm256_store_si256(dst+2, m1C);
648_mm256_store_si256(dst+3, m1D);
665__m256i m1A, m1B, m1C, m1D;
666__m256i m1E, m1F, m1G, m1H;
669__m256i s1_0, s2_0, s1_1, s2_1;
671s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
672s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
673m1A = _mm256_and_si256(s1_0, s2_0);
674m1B = _mm256_and_si256(s1_1, s2_1);
675s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
676s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
677m1C = _mm256_and_si256(s1_0, s2_0);
678m1D = _mm256_and_si256(s1_1, s2_1);
681__m256i s3_0, s4_0, s3_1, s4_1;
683s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
684s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
685m1E = _mm256_and_si256(s3_0, s4_0);
686m1F = _mm256_and_si256(s3_1, s4_1);
688m1A = _mm256_and_si256(m1A, m1E);
689m1B = _mm256_and_si256(m1B, m1F);
691s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
692s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
693m1G = _mm256_and_si256(s3_0, s4_0);
694m1H = _mm256_and_si256(s3_1, s4_1);
698dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
700m1C = _mm256_and_si256(m1C, m1G);
701m1D = _mm256_and_si256(m1D, m1H);
702m1A = _mm256_and_si256(m1A, dst0);
703m1B = _mm256_and_si256(m1B, dst1);
705dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
707m1C = _mm256_and_si256(m1C, dst0);
708m1D = _mm256_and_si256(m1D, dst1);
710_mm256_store_si256(dst + 0, m1A);
711_mm256_store_si256(dst + 1, m1B);
712_mm256_store_si256(dst + 2, m1C);
713_mm256_store_si256(dst + 3, m1D);
715m1A = _mm256_or_si256(m1A, m1B);
716m1C = _mm256_or_si256(m1C, m1D);
717m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
731__m256i m1A, m1B, m1C, m1D;
734__m256i s1_0, s2_0, s1_1, s2_1;
736s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
737s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
738m1A = _mm256_and_si256(s1_0, s2_0);
739m1B = _mm256_and_si256(s1_1, s2_1);
740s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
741s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
742m1C = _mm256_and_si256(s1_0, s2_0);
743m1D = _mm256_and_si256(s1_1, s2_1);
747dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
749m1A = _mm256_and_si256(m1A, dst0);
750m1B = _mm256_and_si256(m1B, dst1);
752dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
754m1C = _mm256_and_si256(m1C, dst0);
755m1D = _mm256_and_si256(m1D, dst1);
757_mm256_store_si256(dst + 0, m1A);
758_mm256_store_si256(dst + 1, m1B);
759_mm256_store_si256(dst + 2, m1C);
760_mm256_store_si256(dst + 3, m1D);
762m1A = _mm256_or_si256(m1A, m1B);
763m1C = _mm256_or_si256(m1C, m1D);
764m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
781__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
782__m256i accA, accB, accC, accD;
784accA = _mm256_setzero_si256();
785accB = _mm256_setzero_si256();
786accC = _mm256_setzero_si256();
787accD = _mm256_setzero_si256();
791m1A = _mm256_loadu_si256(src+0);
792m2A = _mm256_load_si256(dst+0);
793m1A = _mm256_and_si256(m1A, m2A);
794_mm256_store_si256(dst+0, m1A);
795accA = _mm256_or_si256(accA, m1A);
797m1B = _mm256_loadu_si256(src+1);
798m2B = _mm256_load_si256(dst+1);
799m1B = _mm256_and_si256(m1B, m2B);
800_mm256_store_si256(dst+1, m1B);
801accB = _mm256_or_si256(accB, m1B);
803m1C = _mm256_loadu_si256(src+2);
804m2C = _mm256_load_si256(dst+2);
805m1C = _mm256_and_si256(m1C, m2C);
806_mm256_store_si256(dst+2, m1C);
807accC = _mm256_or_si256(accC, m1C);
809m1D = _mm256_loadu_si256(src+3);
810m2D = _mm256_load_si256(dst+3);
811m1D = _mm256_and_si256(m1D, m2D);
812_mm256_store_si256(dst+3, m1D);
813accD = _mm256_or_si256(accD, m1D);
817}
while(src < src_end);
819accA = _mm256_or_si256(accA, accB);
820accC = _mm256_or_si256(accC, accD);
821accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
838__m256i m1A, m1B, m1C, m1D;
840__m256i mAccF0 = _mm256_set1_epi32(~0u);
841__m256i mAccF1 = _mm256_set1_epi32(~0u);
851m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
852m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
853mAccF0 = _mm256_and_si256(mAccF0, m1A);
854mAccF0 = _mm256_and_si256(mAccF0, m1B);
856_mm256_stream_si256(dst, m1A);
857_mm256_stream_si256(dst+1, m1B);
861m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
862m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
863mAccF1 = _mm256_and_si256(mAccF1, m1C);
864mAccF1 = _mm256_and_si256(mAccF1, m1D);
866_mm256_stream_si256(dst2, m1C);
867_mm256_stream_si256(dst2+1, m1D);
869src2 += 2; dst2 += 2;
870}
while(src2 < src_end);
872__m256i maskF = _mm256_set1_epi32(~0u);
873mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
874__m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
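// "All ones" detection idiom used above (sketch): AND-accumulate the OR results,
// then compare every byte against 0xFF and require a full 32-bit movemask.
#if 0
inline bool example_is_all_one256(__m256i acc)
{
    __m256i maskF = _mm256_set1_epi32(~0u);
    unsigned m = unsigned(_mm256_movemask_epi8(_mm256_cmpeq_epi8(acc, maskF)));
    return (m == ~0u); // all 32 bytes matched
}
#endif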
892__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
893__m256i mAccF0 = _mm256_set1_epi32(~0u);
894__m256i mAccF1 = _mm256_set1_epi32(~0u);
897m1A = _mm256_loadu_si256(src+0);
898m2A = _mm256_load_si256(dst+0);
899m1A = _mm256_or_si256(m1A, m2A);
900_mm256_store_si256(dst+0, m1A);
902m1B = _mm256_loadu_si256(src+1);
903m2B = _mm256_load_si256(dst+1);
904m1B = _mm256_or_si256(m1B, m2B);
905_mm256_store_si256(dst+1, m1B);
907m1C = _mm256_loadu_si256(src+2);
908m2C = _mm256_load_si256(dst+2);
909m1C = _mm256_or_si256(m1C, m2C);
910_mm256_store_si256(dst+2, m1C);
912m1D = _mm256_loadu_si256(src+3);
913m2D = _mm256_load_si256(dst+3);
914m1D = _mm256_or_si256(m1D, m2D);
915_mm256_store_si256(dst+3, m1D);
917mAccF1 = _mm256_and_si256(mAccF1, m1C);
918mAccF1 = _mm256_and_si256(mAccF1, m1D);
919mAccF0 = _mm256_and_si256(mAccF0, m1A);
920mAccF0 = _mm256_and_si256(mAccF0, m1B);
924}
while(src < src_end);
926__m256i maskF = _mm256_set1_epi32(~0u);
927mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
928__m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
945__m256i m1A, m1B, m1C, m1D;
946__m256i mAccF0 = _mm256_set1_epi32(~0u);
947__m256i mAccF1 = _mm256_set1_epi32(~0u);
953m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
954m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
955m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
956m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
958_mm256_store_si256(dst+0, m1A);
959_mm256_store_si256(dst+1, m1B);
960_mm256_store_si256(dst+2, m1C);
961_mm256_store_si256(dst+3, m1D);
963mAccF1 = _mm256_and_si256(mAccF1, m1C);
964mAccF1 = _mm256_and_si256(mAccF1, m1D);
965mAccF0 = _mm256_and_si256(mAccF0, m1A);
966mAccF0 = _mm256_and_si256(mAccF0, m1B);
968src1 += 4; src2 += 4; dst += 4;
970}
while(src1 < src_end1);
972__m256i maskF = _mm256_set1_epi32(~0u);
973mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
991__m256i m1A, m1B, m1C, m1D;
992__m256i mAccF0 = _mm256_set1_epi32(~0u);
993__m256i mAccF1 = _mm256_set1_epi32(~0u);
999m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1000m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1001m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1002m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1004m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1005m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1006m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1007m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1009_mm256_store_si256(dst+0, m1A);
1010_mm256_store_si256(dst+1, m1B);
1011_mm256_store_si256(dst+2, m1C);
1012_mm256_store_si256(dst+3, m1D);
1014mAccF1 = _mm256_and_si256(mAccF1, m1C);
1015mAccF1 = _mm256_and_si256(mAccF1, m1D);
1016mAccF0 = _mm256_and_si256(mAccF0, m1A);
1017mAccF0 = _mm256_and_si256(mAccF0, m1B);
1019src1 += 4; src2 += 4; dst += 4;
1021}
while(src1 < src_end1);
1023__m256i maskF = _mm256_set1_epi32(~0u);
1024mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
1045__m256i m1A, m1B, m1C, m1D;
1046__m256i mAccF0 = _mm256_set1_epi32(~0u);
1047__m256i mAccF1 = _mm256_set1_epi32(~0u);
1054m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1055m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1056m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1057m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1059m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1060m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1061m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1062m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1064m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
1065m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
1066m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
1067m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));
1069m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
1070m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
1071m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
1072m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));
1074_mm256_stream_si256(dst+0, m1A);
1075_mm256_stream_si256(dst+1, m1B);
1076_mm256_stream_si256(dst+2, m1C);
1077_mm256_stream_si256(dst+3, m1D);
1079mAccF1 = _mm256_and_si256(mAccF1, m1C);
1080mAccF1 = _mm256_and_si256(mAccF1, m1D);
1081mAccF0 = _mm256_and_si256(mAccF0, m1A);
1082mAccF0 = _mm256_and_si256(mAccF0, m1B);
1084src1 += 4; src2 += 4;
1085src3 += 4; src4 += 4;
1091}
while(src1 < src_end1);
1093__m256i maskF = _mm256_set1_epi32(~0u);
1094mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
1111__m256i m1A, m1B, m1C, m1D;
1112__m256i accA, accB, accC, accD;
1117accA = accB = accC = accD = _mm256_setzero_si256();
1121m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1122m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1123m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1124m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1126_mm256_store_si256(dst+0, m1A);
1127_mm256_store_si256(dst+1, m1B);
1128_mm256_store_si256(dst+2, m1C);
1129_mm256_store_si256(dst+3, m1D);
1131accA = _mm256_or_si256(accA, m1A);
1132accB = _mm256_or_si256(accB, m1B);
1133accC = _mm256_or_si256(accC, m1C);
1134accD = _mm256_or_si256(accD, m1D);
1138}
while(src < src_end);
1140accA = _mm256_or_si256(accA, accB);
1141accC = _mm256_or_si256(accC, accD);
1142accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
1158__m256i m1A, m1B, m1C, m1D;
1159__m256i accA, accB, accC, accD;
1164accA = accB = accC = accD = _mm256_setzero_si256();
1168m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
1169m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
1170m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
1171m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));
1173_mm256_store_si256(dst + 0, m1A);
1174_mm256_store_si256(dst + 1, m1B);
1175_mm256_store_si256(dst + 2, m1C);
1176_mm256_store_si256(dst + 3, m1D);
1178accA = _mm256_or_si256(accA, m1A);
1179accB = _mm256_or_si256(accB, m1B);
1180accC = _mm256_or_si256(accC, m1C);
1181accD = _mm256_or_si256(accD, m1D);
1183src1 += 4; src2 += 4; dst += 4;
1185}
while(src1 < src1_end);
1187accA = _mm256_or_si256(accA, accB);
1188accC = _mm256_or_si256(accC, accD);
1189accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
1207__m256i m1A, m1B, m1C, m1D;
1208__m256i accA, accB, accC, accD;
1210accA = accB = accC = accD = _mm256_setzero_si256();
1217m1A = _mm256_andnot_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
1218m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1219m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1220m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1222_mm256_store_si256(dst+2, m1C);
1223_mm256_store_si256(dst+3, m1D);
1224_mm256_store_si256(dst+0, m1A);
1225_mm256_store_si256(dst+1, m1B);
1227accA = _mm256_or_si256(accA, m1A);
1228accB = _mm256_or_si256(accB, m1B);
1229accC = _mm256_or_si256(accC, m1C);
1230accD = _mm256_or_si256(accD, m1D);
1233}
while(src < src_end);
1235accA = _mm256_or_si256(accA, accB);
1236accC = _mm256_or_si256(accC, accD);
1237accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
1253__m256i m1A, m1B, m1C, m1D;
1255m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1256m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1257m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1258m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1260_mm256_store_si256(dst+0, m1A);
1261_mm256_store_si256(dst+1, m1B);
1262_mm256_store_si256(dst+2, m1C);
1263_mm256_store_si256(dst+3, m1D);
1265m1A = _mm256_or_si256(m1A, m1B);
1266m1C = _mm256_or_si256(m1C, m1D);
1267m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
1284__m256i m1A, m1B, m1C, m1D;
1286m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
1287m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
1288m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
1289m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));
1291_mm256_store_si256(dst+0, m1A);
1292_mm256_store_si256(dst+1, m1B);
1293_mm256_store_si256(dst+2, m1C);
1294_mm256_store_si256(dst+3, m1D);
1296m1A = _mm256_or_si256(m1A, m1B);
1297m1C = _mm256_or_si256(m1C, m1D);
1298m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
1316__m256i m1A, m1B, m1C, m1D;
1317__m256i m1E, m1F, m1G, m1H;
    const __m256i maskF = _mm256_set1_epi32(~0u);
1321__m256i s1_0, s2_0, s1_1, s2_1;
1323s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1324s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1325s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1326s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1328m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1330s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1331s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1332s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1333s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1335m1C = _mm256_and_si256(s1_0, s2_0);
1336m1D = _mm256_and_si256(s1_1, s2_1);
1339__m256i s3_0, s4_0, s3_1, s4_1;
1341s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
1342s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
1343s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1344s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1346m1E = _mm256_and_si256(s3_0, s4_0);
1347m1F = _mm256_and_si256(s3_1, s4_1);
1349m1A = _mm256_and_si256(m1A, m1E);
1350m1B = _mm256_and_si256(m1B, m1F);
1352s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
1353s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
1354s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1355s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1357m1G = _mm256_and_si256(s3_0, s4_0);
1358m1H = _mm256_and_si256(s3_1, s4_1);
1362dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1364m1C = _mm256_and_si256(m1C, m1G);
1365m1D = _mm256_and_si256(m1D, m1H);
1366m1A = _mm256_and_si256(m1A, dst0);
1367m1B = _mm256_and_si256(m1B, dst1);
1369dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1371m1C = _mm256_and_si256(m1C, dst0);
1372m1D = _mm256_and_si256(m1D, dst1);
1374_mm256_store_si256(dst + 0, m1A);
1375_mm256_store_si256(dst + 1, m1B);
1376_mm256_store_si256(dst + 2, m1C);
1377_mm256_store_si256(dst + 3, m1D);
1379m1A = _mm256_or_si256(m1A, m1B);
1380m1C = _mm256_or_si256(m1C, m1D);
1381m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
1396__m256i m1A, m1B, m1C, m1D;
    const __m256i maskF = _mm256_set1_epi32(~0u);
1401__m256i s1_0, s2_0, s1_1, s2_1;
1403s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1404s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1405s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1406s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1408m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1410s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1411s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1412s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1413s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1415m1C = _mm256_and_si256(s1_0, s2_0);
1416m1D = _mm256_and_si256(s1_1, s2_1);
1444dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1448m1A = _mm256_and_si256(m1A, dst0);
1449m1B = _mm256_and_si256(m1B, dst1);
1451dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1453m1C = _mm256_and_si256(m1C, dst0);
1454m1D = _mm256_and_si256(m1D, dst1);
1456_mm256_store_si256(dst + 0, m1A);
1457_mm256_store_si256(dst + 1, m1B);
1458_mm256_store_si256(dst + 2, m1C);
1459_mm256_store_si256(dst + 3, m1D);
1461m1A = _mm256_or_si256(m1A, m1B);
1462m1C = _mm256_or_si256(m1C, m1D);
1463m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
1482__m256i ymm0 = _mm256_set1_epi32(
int(
value));
1485_mm256_store_si256(dst, ymm0);
1486_mm256_store_si256(dst+1, ymm0);
1487_mm256_store_si256(dst+2, ymm0);
1488_mm256_store_si256(dst+3, ymm0);
1491}
while(dst < dst_end);
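// Fill sketch (illustrative): broadcast one 32-bit value and store it across an
// aligned block, 4 x __m256i (128 bytes) per iteration as above.
#if 0
inline void example_fill_block(bm::word_t* dst, unsigned value, unsigned size_in_words)
{
    __m256i v = _mm256_set1_epi32(int(value));
    for (unsigned i = 0; i < size_in_words; i += 8)
        _mm256_store_si256((__m256i*)(dst + i), v);
}
#endif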
1506__m256i ymm0, ymm1, ymm2, ymm3;
1513ymm0 = _mm256_load_si256(src+0);
1514ymm1 = _mm256_load_si256(src+1);
1515ymm2 = _mm256_load_si256(src+2);
1516ymm3 = _mm256_load_si256(src+3);
1518_mm256_store_si256(dst+0, ymm0);
1519_mm256_store_si256(dst+1, ymm1);
1520_mm256_store_si256(dst+2, ymm2);
1521_mm256_store_si256(dst+3, ymm3);
1523ymm0 = _mm256_load_si256(src+4);
1524ymm1 = _mm256_load_si256(src+5);
1525ymm2 = _mm256_load_si256(src+6);
1526ymm3 = _mm256_load_si256(src+7);
1528_mm256_store_si256(dst+4, ymm0);
1529_mm256_store_si256(dst+5, ymm1);
1530_mm256_store_si256(dst+6, ymm2);
1531_mm256_store_si256(dst+7, ymm3);
1535}
while(src < src_end);
1548__m256i ymm0, ymm1, ymm2, ymm3;
1555ymm0 = _mm256_loadu_si256(src+0);
1556ymm1 = _mm256_loadu_si256(src+1);
1557ymm2 = _mm256_loadu_si256(src+2);
1558ymm3 = _mm256_loadu_si256(src+3);
1560_mm256_store_si256(dst+0, ymm0);
1561_mm256_store_si256(dst+1, ymm1);
1562_mm256_store_si256(dst+2, ymm2);
1563_mm256_store_si256(dst+3, ymm3);
1565ymm0 = _mm256_loadu_si256(src+4);
1566ymm1 = _mm256_loadu_si256(src+5);
1567ymm2 = _mm256_loadu_si256(src+6);
1568ymm3 = _mm256_loadu_si256(src+7);
1570_mm256_store_si256(dst+4, ymm0);
1571_mm256_store_si256(dst+5, ymm1);
1572_mm256_store_si256(dst+6, ymm2);
1573_mm256_store_si256(dst+7, ymm3);
1577}
while(src < src_end);
1592__m256i ymm0, ymm1, ymm2, ymm3;
1599ymm0 = _mm256_load_si256(src+0);
1600ymm1 = _mm256_load_si256(src+1);
1601ymm2 = _mm256_load_si256(src+2);
1602ymm3 = _mm256_load_si256(src+3);
1604_mm256_stream_si256(dst+0, ymm0);
1605_mm256_stream_si256(dst+1, ymm1);
1606_mm256_stream_si256(dst+2, ymm2);
1607_mm256_stream_si256(dst+3, ymm3);
1609ymm0 = _mm256_load_si256(src+4);
1610ymm1 = _mm256_load_si256(src+5);
1611ymm2 = _mm256_load_si256(src+6);
1612ymm3 = _mm256_load_si256(src+7);
1614_mm256_stream_si256(dst+4, ymm0);
1615_mm256_stream_si256(dst+5, ymm1);
1616_mm256_stream_si256(dst+6, ymm2);
1617_mm256_stream_si256(dst+7, ymm3);
1621}
while(src < src_end);
1634__m256i ymm0, ymm1, ymm2, ymm3;
1641ymm0 = _mm256_loadu_si256(src+0);
1642ymm1 = _mm256_loadu_si256(src+1);
1643ymm2 = _mm256_loadu_si256(src+2);
1644ymm3 = _mm256_loadu_si256(src+3);
1646_mm256_stream_si256(dst+0, ymm0);
1647_mm256_stream_si256(dst+1, ymm1);
1648_mm256_stream_si256(dst+2, ymm2);
1649_mm256_stream_si256(dst+3, ymm3);
1651ymm0 = _mm256_loadu_si256(src+4);
1652ymm1 = _mm256_loadu_si256(src+5);
1653ymm2 = _mm256_loadu_si256(src+6);
1654ymm3 = _mm256_loadu_si256(src+7);
1656_mm256_stream_si256(dst+4, ymm0);
1657_mm256_stream_si256(dst+5, ymm1);
1658_mm256_stream_si256(dst+6, ymm2);
1659_mm256_stream_si256(dst+7, ymm3);
1663}
while(src < src_end);
1679__m256i maskFF = _mm256_set1_epi32(-1);
1686ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
1687ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);
1689_mm256_store_si256(dst+0, ymm0);
1690_mm256_store_si256(dst+1, ymm1);
1692ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
1693ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);
1695_mm256_store_si256(dst+2, ymm0);
1696_mm256_store_si256(dst+3, ymm1);
1700}
while(dst < dst_end);
1715__m256i w0 = _mm256_load_si256(block+0);
1716__m256i w1 = _mm256_load_si256(block+1);
1718__m256i wA = _mm256_or_si256(w0, w1);
1720__m256i w2 = _mm256_load_si256(block+2);
1721__m256i w3 = _mm256_load_si256(block+3);
1723__m256i wB = _mm256_or_si256(w2, w3);
1724wA = _mm256_or_si256(wA, wB);
1726 if(!_mm256_testz_si256(wA, wA))
1729}
while(block < block_end);
1740__m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
1741__m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
1742wA = _mm256_or_si256(wA, wB);
    return _mm256_testz_si256(wA, wA);
1754__m256i mV = _mm256_set1_epi32(
int(
value));
1755_mm256_store_si256(dst, mV);
1756_mm256_store_si256(dst + 1, mV);
1757_mm256_store_si256(dst + 2, mV);
1758_mm256_store_si256(dst + 3, mV);
    const __m256i maskF = _mm256_set1_epi32(~0u);
1774__m256i m1A = _mm256_load_si256(block+0);
1775__m256i m1B = _mm256_load_si256(block+1);
1776m1A = _mm256_xor_si256(m1A, maskF);
1777m1B = _mm256_xor_si256(m1B, maskF);
1778m1A = _mm256_or_si256(m1A, m1B);
1779 if(!_mm256_testz_si256(m1A, m1A))
1782}
while(block < block_end);
1793__m256i maskF = _mm256_set1_epi32(~0u);
1794__m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
1807__m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
    return _mm256_testz_si256(w0, w0);
1818__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1819__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1820w0 = _mm256_or_si256(w0, w1);
    return _mm256_testz_si256(w0, w0);
1831__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1832__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1833w0 = _mm256_xor_si256(w0, w1);
    return _mm256_testz_si256(w0, w0);
1844__m256i* block_end =
1847__m256i m1COshft, m2COshft;
1848__m256i mAcc = _mm256_set1_epi32(0);
1849__m256i mMask1 = _mm256_set1_epi32(1);
1850__m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
1853 for(--block_end; block_end >= block; block_end -= 2)
1855__m256i m1A = _mm256_load_si256(block_end);
1856__m256i m2A = _mm256_load_si256(block_end-1);
1858__m256i m1CO = _mm256_and_si256(m1A, mMask1);
1859__m256i m2CO = _mm256_and_si256(m2A, mMask1);
1861co2 = _mm256_extract_epi32(m1CO, 0);
1863m1A = _mm256_srli_epi32(m1A, 1);
1864m2A = _mm256_srli_epi32(m2A, 1);
1867m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1868m1COshft = _mm256_insert_epi32(m1COshft, co1, 7);
1872co2 = _mm256_extract_epi32(m2CO, 0);
1874m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1875m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);
1877m1COshft = _mm256_slli_epi32(m1COshft, 31);
1878m2COshft = _mm256_slli_epi32(m2COshft, 31);
1880m1A = _mm256_or_si256(m1A, m1COshft);
1881m2A = _mm256_or_si256(m2A, m2COshft);
1883_mm256_store_si256(block_end, m1A);
1884_mm256_store_si256(block_end-1, m2A);
1886mAcc = _mm256_or_si256(mAcc, m1A);
1887mAcc = _mm256_or_si256(mAcc, m2A);
1893*empty_acc = !_mm256_testz_si256(mAcc, mAcc);
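// Carry propagation sketch (illustrative): _mm256_permutevar8x32_epi32 with the index
// {0,7,6,5,4,3,2,1} moves each 32-bit lane's carry to its neighbour, and
// _mm256_insert_epi32 patches the external carry into the vacated lane, as above.
#if 0
inline __m256i example_rotate_carry(__m256i carry_bits, unsigned carry_in)
{
    __m256i idx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
    __m256i r = _mm256_permutevar8x32_epi32(carry_bits, idx);
    return _mm256_insert_epi32(r, int(carry_in), 7);
}
#endif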
1905 const__m256i* block_end =
1908__m256i m1COshft, m2COshft;
1909__m256i mAcc = _mm256_set1_epi32(0);
1910__m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1913 for(;block < block_end; block+=2)
1915__m256i m1A = _mm256_load_si256(block);
1916__m256i m2A = _mm256_load_si256(block+1);
1918__m256i m1CO = _mm256_srli_epi32(m1A, 31);
1919__m256i m2CO = _mm256_srli_epi32(m2A, 31);
1921co2 = _mm256_extract_epi32(m1CO, 7);
1923m1A = _mm256_slli_epi32(m1A, 1);
1924m2A = _mm256_slli_epi32(m2A, 1);
1927m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1928m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
1932co2 = _mm256_extract_epi32(m2CO, 7);
1933m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1934m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1936m1A = _mm256_or_si256(m1A, m1COshft);
1937m2A = _mm256_or_si256(m2A, m2COshft);
1939_mm256_store_si256(block, m1A);
1940_mm256_store_si256(block+1, m2A);
1942mAcc = _mm256_or_si256(mAcc, m1A);
1943mAcc = _mm256_or_si256(mAcc, m2A);
1948*empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1969__m256i m1COshft, m2COshft;
1970__m256i mAcc = _mm256_set1_epi32(0);
1971__m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
    unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d));
1977 for(; di < 64 ; ++di)
1983mAcc = _mm256_xor_si256(mAcc, mAcc);
1985mask_block = (__m256i*) &mblock[d_base];
1988block = (__m256i*) &wblock[d_base];
    for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
1992__m256i m1A = _mm256_load_si256(block);
1993__m256i m2A = _mm256_load_si256(block+1);
1995__m256i m1CO = _mm256_srli_epi32(m1A, 31);
1996__m256i m2CO = _mm256_srli_epi32(m2A, 31);
1998co2 = _mm256_extract_epi32(m1CO, 7);
2000m1A = _mm256_slli_epi32(m1A, 1);
2001m2A = _mm256_slli_epi32(m2A, 1);
2003__m256i m1M = _mm256_load_si256(mask_block);
2004__m256i m2M = _mm256_load_si256(mask_block+1);
        m1COshft = _mm256_insert_epi32(_mm256_permutevar8x32_epi32(m1CO, mCOidx), co1, 0);

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_insert_epi32(_mm256_permutevar8x32_epi32(m2CO, mCOidx), co1, 0);
2017m1A = _mm256_or_si256(m1A, m1COshft);
2018m2A = _mm256_or_si256(m2A, m2COshft);
2020m1A = _mm256_and_si256(m1A, m1M);
2021m2A = _mm256_and_si256(m2A, m2M);
2023_mm256_store_si256(block, m1A);
2024_mm256_store_si256(block+1, m2A);
2026mAcc = _mm256_or_si256(mAcc, m1A);
2027mAcc = _mm256_or_si256(mAcc, m2A);
2033 if(_mm256_testz_si256(mAcc, mAcc))
    bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
2046d |= (dmask & (w0 << di));
2088 const__m256i* block_end =
2091__m256i m1COshft, m2COshft;
2092__m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2093__m256i cntAcc = _mm256_setzero_si256();
    unsigned co2, co1 = 0;
2101 for(;block < block_end; block+=2)
2103__m256i m1A = _mm256_load_si256(block);
2104__m256i m2A = _mm256_load_si256(block+1);
2106__m256i m1CO = _mm256_srli_epi32(m1A, 31);
2107__m256i m2CO = _mm256_srli_epi32(m2A, 31);
2109co2 = _mm256_extract_epi32(m1CO, 7);
2111__m256i m1As = _mm256_slli_epi32(m1A, 1);
2112__m256i m2As = _mm256_slli_epi32(m2A, 1);
2115m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2116m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
2120co2 = _mm256_extract_epi32(m2CO, 7);
2121m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2122m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2124m1As = _mm256_or_si256(m1As, m1COshft);
2125m2As = _mm256_or_si256(m2As, m2COshft);
2130m1A = _mm256_xor_si256(m1A, m1As);
2131m2A = _mm256_xor_si256(m2A, m2As);
2135cntAcc = _mm256_add_epi64(cntAcc, bc);
2137cntAcc = _mm256_add_epi64(cntAcc, bc);
2142_mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
    count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2165__m256i m1COshft, m2COshft;
2166__m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2168__m256i cntAcc = _mm256_setzero_si256();
2169__m256i cntAcc2 = _mm256_setzero_si256();
    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
2178 for(;block < block_end; block+=2, xor_block+=2)
2180__m256i m1A = _mm256_load_si256(block);
2181__m256i m2A = _mm256_load_si256(block+1);
2182__m256i m1B = _mm256_load_si256(xor_block);
2183__m256i m2B = _mm256_load_si256(xor_block+1);
2185m1A = _mm256_xor_si256 (m1A, m1B);
2186m2A = _mm256_xor_si256 (m2A, m2B);
2190cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2192cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2195__m256i m1CO = _mm256_srli_epi32(m1A, 31);
2196__m256i m2CO = _mm256_srli_epi32(m2A, 31);
2198co2 = _mm256_extract_epi32(m1CO, 7);
2200__m256i m1As = _mm256_slli_epi32(m1A, 1);
2201__m256i m2As = _mm256_slli_epi32(m2A, 1);
2204m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2205m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
2209co2 = _mm256_extract_epi32(m2CO, 7);
2210m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2211m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2213m1As = _mm256_or_si256(m1As, m1COshft);
2214m2As = _mm256_or_si256(m2As, m2COshft);
2219m1A = _mm256_xor_si256(m1A, m1As);
2220m2A = _mm256_xor_si256(m2A, m2As);
2224cntAcc = _mm256_add_epi64(cntAcc, bc);
2226cntAcc = _mm256_add_epi64(cntAcc, bc);
2231_mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2232gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2233gap_count -= (w0 & 1u);
2237_mm256_store_si256 ((__m256i*)cnt_v, cntAcc2);
2238bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2240*gcount = gap_count;
2241*bcount = bit_count;
2252 unsigned* gcount,
unsigned* bcount)
2256 const__m256i* block_end =
2259__m256i m1COshft, m2COshft;
2260__m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2261__m256i cntAcc = _mm256_setzero_si256();
    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
2270 for(;block < block_end; block+=2)
2272__m256i m1A = _mm256_load_si256(block);
2273__m256i m2A = _mm256_load_si256(block+1);
2286__m256i m1CO = _mm256_srli_epi32(m1A, 31);
2287__m256i m2CO = _mm256_srli_epi32(m2A, 31);
2289co2 = _mm256_extract_epi32(m1CO, 7);
2291__m256i m1As = _mm256_slli_epi32(m1A, 1);
2292__m256i m2As = _mm256_slli_epi32(m2A, 1);
2295m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2296m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
2300co2 = _mm256_extract_epi32(m2CO, 7);
2301m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2302m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2304m1As = _mm256_or_si256(m1As, m1COshft);
2305m2As = _mm256_or_si256(m2As, m2COshft);
2310m1A = _mm256_xor_si256(m1A, m1As);
2311m2A = _mm256_xor_si256(m2A, m2As);
2315cntAcc = _mm256_add_epi64(cntAcc, bc);
2317cntAcc = _mm256_add_epi64(cntAcc, bc);
2322_mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2323gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2324gap_count -= (w0 & 1u);
2326*gcount = gap_count;
2327*bcount = bit_count;
2342 const__m256i* block1_end =
2344__m256i maskZ = _mm256_setzero_si256();
    unsigned simd_lane = 0;
2349mA = _mm256_xor_si256(_mm256_load_si256(block1),
2350_mm256_load_si256(block2));
2351mB = _mm256_xor_si256(_mm256_load_si256(block1+1),
2352_mm256_load_si256(block2+1));
2353__m256i mOR = _mm256_or_si256(mA, mB);
2354 if(!_mm256_testz_si256(mOR, mOR))
2356 if(!_mm256_testz_si256(mA, mA))
                unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
                int bsf = bm::bsf_asm32(mask);
                _mm256_store_si256((__m256i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = (simd_lane * 256) + (widx * 32) + bsf;

                unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
                int bsf = bm::bsf_asm32(mask);
                _mm256_store_si256((__m256i*)simd_buf, mB);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
2382block1+=2; block2+=2;
2384}
while(block1 < block1_end);
    block = (const __m256i*)((bm::word_t*)(block) + off);
2399 const__m256i* block_end =
2401__m256i maskZ = _mm256_setzero_si256();
    unsigned simd_lane = 0;
2406mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
2407__m256i mOR = _mm256_or_si256(mA, mB);
2408 if(!_mm256_testz_si256(mOR, mOR))
2410 if(!_mm256_testz_si256(mA, mA))
                unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
                int bsf = bm::bsf_asm32(mask);
                _mm256_store_si256((__m256i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = (off * 32) + (simd_lane * 256) + (widx * 32) + bsf;

                unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
                int bsf = bm::bsf_asm32(mask);
                _mm256_store_si256((__m256i*)simd_buf, mB);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = (off * 32) + ((++simd_lane) * 256) + (widx * 32) + bsf;
2438}
while(block < block_end);
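// Find-first idiom used above (sketch): build a byte mask of the non-zero 32-bit lanes,
// BSF into it to pick the lane, then BSF inside that lane (uses _tzcnt_u32 in place of
// bm::bsf_asm32).
#if 0
inline int example_find_first_bit256(__m256i v)
{
    unsigned mask = ~unsigned(_mm256_movemask_epi8(
                        _mm256_cmpeq_epi32(v, _mm256_setzero_si256())));
    if (!mask)
        return -1; // vector is all zero
    alignas(32) unsigned buf[8];
    _mm256_store_si256((__m256i*) buf, v);
    unsigned widx = _tzcnt_u32(mask) >> 2;          // 32-bit lane index
    return int(widx * 32 + _tzcnt_u32(buf[widx]));  // bit index inside the vector
}
#endif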
                            unsigned avx_vect_waves,
2457__m256i xcnt = _mm256_setzero_si256();
    for (unsigned i = 0; i < avx_vect_waves; ++i)
2464__m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
2465__m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
2466__m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
2467xcnt = _mm256_add_epi16(xcnt, ymm_s2);
2472xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);
2477xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
2478xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
    __m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1),
                                  _mm256_extracti128_si256(xcnt, 0));
                               unsigned nb,
                               unsigned start)
{
    const unsigned unroll_factor = 16;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m256i nbM = _mm256_set1_epi32(int(nb));
2504 for(k = 0; k < len_unr; k+=unroll_factor)
2506__m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2509__m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
        if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
            break;
        __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));

        __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
        if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
            break;
    }
    for (; k < len; ++k)
                               unsigned start,
                               unsigned stop)
{
    const unsigned unroll_factor = 8;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);
2545__m256i mask1 = _mm256_set1_epi32(1);
    unsigned k = 0, mask, w_idx;
2552 for(; k < len_unr; k+=unroll_factor)
2554__m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2555__m256i nbitA = _mm256_and_si256 (idxA, sb_mask);
2558nbitA = _mm256_and_si256 (nbitA, sw_mask);
2560__m256i maskA = _mm256_sllv_epi32(mask1, nbitA);
2562_mm256_store_si256 ((__m256i*)mword_v, nwordA);
2565mask_tmp = _mm256_shuffle_epi32 (nwordA,
_MM_SHUFFLE(1,1,1,1));
2566mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2567 mask= _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
2571mask_tmp = _mm256_xor_si256 (mask_tmp, mask_tmp);
2572mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2576__m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
2577__m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
2578mask_tmp = _mm256_or_si256 (mtmp0, mtmp1);
2579mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4);
2580mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2581mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8);
2582mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
            int u0 = _mm256_extract_epi32(mask_tmp, 0);
2589_mm256_store_si256 ((__m256i*)mask_v, maskA);
2597mask_tmp = _mm256_bsrli_epi128(maskA, 4);
2598mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2599__m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8);
2600mask_tmp = _mm256_or_si256 (m0, mask_tmp);
2602u0 = _mm256_extract_epi32(mask_tmp, 0);
2603u4 = _mm256_extract_epi32(mask_tmp, 4);
2608mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 0);
2609__m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2610 mask= _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2618block[mword_v[0]] |= mask_v[0];
2619block[mword_v[1]] |= mask_v[1];
2620block[mword_v[2]] |= mask_v[2];
2621block[mword_v[3]] |= mask_v[3];
2628mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 1);
2629__m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2630 mask= _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2638block[mword_v[4]] |= mask_v[4];
2639block[mword_v[5]] |= mask_v[5];
2640block[mword_v[6]] |= mask_v[6];
2641block[mword_v[7]] |= mask_v[7];
    for (; k < len; ++k)
    {
        unsigned n = idx[k];
2653block[nword] |= (1u << nbit);
2664__m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2665__m256i mask1 = _mm256_set1_epi32(1);
2667__m256i v0, v1, acc1, acc2;
2668v0 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(0));
2669v1 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(1));
2670v0 = _mm256_sub_epi32(v0, stride_idx);
2671v1 = _mm256_sub_epi32(v1, stride_idx);
2672v0 = _mm256_sllv_epi32(mask1, v0);
2673v1 = _mm256_sllv_epi32(mask1, v1);
2674acc1 = _mm256_or_si256(v1, v0);
2675v0 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(2));
2676v1 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(3));
2677v0 = _mm256_sub_epi32(v0, stride_idx);
2678v1 = _mm256_sub_epi32(v1, stride_idx);
2679v0 = _mm256_sllv_epi32(mask1, v0);
2680v1 = _mm256_sllv_epi32(mask1, v1);
2681acc2 = _mm256_or_si256(v1, v0);
2682target = _mm256_or_si256(target, acc1);
2683v0 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(4));
2684v1 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(5));
2685v0 = _mm256_sub_epi32(v0, stride_idx);
2686v1 = _mm256_sub_epi32(v1, stride_idx);
2687v0 = _mm256_sllv_epi32(mask1, v0);
2688v1 = _mm256_sllv_epi32(mask1, v1);
2689acc1 = _mm256_or_si256(v1, v0);
2690target = _mm256_or_si256(target, acc2);
2691v0 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(6));
2692v1 = _mm256_permutevar8x32_epi32(
source, _mm256_set1_epi32(7));
2693v0 = _mm256_sub_epi32(v0, stride_idx);
2694v1 = _mm256_sub_epi32(v1, stride_idx);
2695v0 = _mm256_sllv_epi32(mask1, v0);
2696v1 = _mm256_sllv_epi32(mask1, v1);
2697acc2 = _mm256_or_si256(v1, v0);
2699target = _mm256_or_si256(target, acc1);
2700target = _mm256_or_si256(target, acc2);
                              unsigned start,
                              unsigned stop)
{
2713__m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2714__m256i mask1 = _mm256_set1_epi32(1);
2715__m256i* block_avx = (__m256i*)block;
    unsigned stride = 0;
2718__m256i* avx_stride_p = block_avx + stride;
2719__m256i blkA = _mm256_load_si256(avx_stride_p);
    for (unsigned i = start; i < stop; ++i)
    {
        unsigned n = idx[i];

        unsigned new_stride = nbit >> 8;
        unsigned stride_bit = nbit & 0xFF;
2727 if(new_stride != stride)
2729_mm256_store_si256(avx_stride_p, blkA);
2730stride = new_stride;
2731avx_stride_p = block_avx + stride;
2732blkA = _mm256_load_si256(avx_stride_p);
2735__m256i v0 = _mm256_set1_epi32(stride_bit);
2736__m256i s0 = _mm256_sub_epi32(v0, stride_idx);
2737__m256i k0 = _mm256_sllv_epi32(mask1, s0);
2738blkA = _mm256_or_si256(blkA, k0);
2741_mm256_store_si256(avx_stride_p, blkA);
                              unsigned start,
                              unsigned stop)
{
    const unsigned unroll_factor = 8;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);
2758__m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2759__m256i mask1 = _mm256_set1_epi32(1);
2762__m256i stride_bit_mask = _mm256_set1_epi32(0xFF);
2770__m256i* block_avx = (__m256i*)block;
2771__m256i* avx_stride_p = block_avx + stride;
2773__m256i blkA = _mm256_load_si256(avx_stride_p);
    unsigned k = 0, mask;
2776 for(; k < len_unr; k+=unroll_factor)
2778__m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2779__m256i nbitA = _mm256_and_si256 (idxA, sb_mask);
2780__m256i strideA = _mm256_srli_epi32 (nbitA, 8);
2781__m256i strideBitA = _mm256_and_si256 (nbitA, stride_bit_mask);
2784__m256i mask_tmp = _mm256_shuffle_epi32 (strideA, 0x0);
2785mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2786 mask= _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));
            unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
2790 if(new_stride != stride)
2792_mm256_store_si256(avx_stride_p, blkA);
2793stride = new_stride;
2794avx_stride_p = block_avx + stride;
2795blkA = _mm256_load_si256(avx_stride_p);
2802_mm256_store_si256 ((__m256i*)mstride_bit_v, strideBitA);
2803_mm256_store_si256 ((__m256i*)mstride_v, strideA);
            for (unsigned j = 0; j < 8; ++j)
            {
                unsigned new_stride = mstride_v[j];
2807 if(new_stride != stride)
2809_mm256_store_si256(avx_stride_p, blkA);
2810stride = new_stride;
2811avx_stride_p = block_avx + stride;
2812blkA = _mm256_load_si256(avx_stride_p);
2815mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
2816mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
2817mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
2818blkA = _mm256_or_si256(blkA, mask_tmp);
2822_mm256_store_si256(avx_stride_p, blkA);
    for (; k < len; ++k)
    {
        unsigned n = idx[k];
2831block[nword] |= (1u << nbit);
2843__m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2844__m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
2845__m256i maskFF = _mm256_set1_epi32(-1);
2846__m256i maskZ = _mm256_setzero_si256();
2848__m256i v0 = _mm256_set1_epi32(
i);
2849__m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
2850__m256i k1 = _mm256_sllv_epi32(maskFF, s0);
2853__m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
2854cmp_eq = _mm256_xor_si256(maskFF, cmp_eq);
2855k1 = _mm256_xor_si256(k1, cmp_eq);
2858__m256i cmp_gt = _mm256_cmpgt_epi32 (stride_idx2, v0);
2859cmp_gt = _mm256_xor_si256(maskFF, cmp_gt);
    __m256i r = _mm256_xor_si256(k1, cmp_gt);
2880__m256i mask0x8 = _mm256_set1_epi32(0x80000000);
2881__m256i mm_val = _mm256_set1_epi32(
value);
2883__m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8);
2884__m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8);
2886__m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
2887__m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);
2889__m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
    int mask = _mm256_movemask_epi8(cmp_mask_ge);

    int bsf = bm::bsf_asm32(mask);
    __m256i mZ = _mm256_setzero_si256();
    __m256i mVal = _mm256_set1_epi16(value);

    __m256i mSub = _mm256_subs_epu16(mVal, vect16);
    __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
    unsigned mask = _mm256_movemask_epi8(mge_mask);

    int lz = _tzcnt_u32(mask);
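// Unsigned 16-bit ">= value" test used above (sketch): the saturated difference
// (value - lane) is zero exactly when lane >= value, so cmpeq against zero plus a
// byte movemask locates the first qualifying element.
#if 0
inline unsigned example_first_ge_u16(const unsigned short* arr16, unsigned short value)
{
    __m256i v   = _mm256_loadu_si256((const __m256i*) arr16); // 16 elements
    __m256i sub = _mm256_subs_epu16(_mm256_set1_epi16((short)value), v);
    __m256i ge  = _mm256_cmpeq_epi16(sub, _mm256_setzero_si256());
    unsigned m  = unsigned(_mm256_movemask_epi8(ge));
    return m ? (_tzcnt_u32(m) >> 1) : 16; // index of first element >= value
}
#endif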
template<bool RET_TEST=false>

    const unsigned linear_cutoff = 64;
    const unsigned unroll_factor = 16;

    unsigned end = ((*buf) >> 3);

    const unsigned arr_end = end + 1;
    if (end <= unroll_factor)
        for (; true; ++start)
            if (buf[start] >= pos)
                break;

    unsigned dsize = end - start;
    for (; dsize >= 64; dsize = end - start)
    {
        unsigned mid = (start + end) >> 1;
        if (buf[mid] < pos)

        if (buf[mid = (start + end) >> 1] < pos)

        if (buf[mid = (start + end) >> 1] < pos)

        if (buf[mid = (start + end) >> 1] < pos)

    }

    dsize = end - start + 1;
    if (dsize < linear_cutoff)
    {
        dsize = arr_end - start;

        __m256i mZ = _mm256_setzero_si256();
        __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
        __m256i vect16, mSub, mge_mask;

        for (unsigned len_unr = start + (dsize - (dsize % unroll_factor));
             start < len_unr; start += unroll_factor)
        {
            vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
            mSub = _mm256_subs_epu16(mPos, vect16);
            mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
            if (int mask = _mm256_movemask_epi8(mge_mask); mask)
            {
                int lz = _tzcnt_u32(mask);
            }
        }

        vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
        mSub = _mm256_subs_epu16(mPos, vect16);
        mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
        int mask = _mm256_movemask_epi8(mge_mask);

        int lz = _tzcnt_u32(mask);
    }

    for (; true; ++start)
        if (buf[start] >= pos)
            break;

    if (unsigned mid = (start + end) >> 1; buf[mid] < pos)

    if (unsigned mid = (start + end) >> 1; buf[mid] < pos)

    res = ((*buf) & 1) ^ ((start-1) & 1);
    if constexpr (RET_TEST)

    return bm::avx2_gap_bfind<true>(buf, pos, 0);
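// GAP search sketch (illustrative; function and parameter meanings assumed from the
// call above): avx2_gap_bfind() locates the GAP element covering bit 'pos'; with
// RET_TEST=true the same search returns the bit value at 'pos' directly.
#if 0
unsigned is_set;
unsigned gap_idx = bm::avx2_gap_bfind(gap_buf, pos, &is_set);
unsigned bit     = bm::avx2_gap_bfind<true>(gap_buf, pos, 0);
#endif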
    unsigned unroll_factor = 8;
    unsigned len = to - from + 1;
    unsigned len_unr = len - (len % unroll_factor);

    __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
    __m256i vect_target = _mm256_set1_epi32(target);
    __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8);

    __m256i vect80, norm_vect80, cmp_mask_ge;

    for (; k < len_unr; k += unroll_factor)
    {
        vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k]));
        norm_vect80 = _mm256_sub_epi32(vect80, mask0x8);

        cmp_mask_ge = _mm256_or_si256(
                            _mm256_cmpgt_epi32(norm_vect80, norm_target),
                            _mm256_cmpeq_epi32(vect80, vect_target));
        mask = _mm256_movemask_epi8(cmp_mask_ge);
        if (mask)
        {
            int bsf = bm::bsf_asm32(mask);
            return from + k + (bsf / 4);
        }
    }
    for (; k < len; ++k)
        if (arr_base[k] >= target)
            return from + k;
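// Unsigned compare idiom used above (sketch): AVX2 only provides signed 32-bit compares,
// so both operands are biased by 0x80000000 before _mm256_cmpgt_epi32 and equality is
// OR-ed in to obtain an unsigned ">=".
#if 0
inline __m256i example_cmp_ge_u32(__m256i a, __m256i b)
{
    __m256i bias = _mm256_set1_epi32(int(0x80000000u));
    __m256i gt = _mm256_cmpgt_epi32(_mm256_sub_epi32(a, bias),
                                    _mm256_sub_epi32(b, bias));
    return _mm256_or_si256(gt, _mm256_cmpeq_epi32(a, b));
}
#endif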
    const unsigned unroll_factor = 8;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);
3154__m256i maskFF = _mm256_set1_epi32(~0u);
3156__m256i mask_tmp, mask_0;
    unsigned k = 0, mask, w_idx;
3161 for(; k < len_unr; k+=unroll_factor)
3163__m256i nbitA, nwordA;
3164 const unsignedbase = start + k;
3165__m256i* idx_ptr = (__m256i*)(idx+base);
3167nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask);
3171mask_tmp = _mm256_shuffle_epi32 (nwordA,
_MM_SHUFFLE(1,1,1,1));
3172mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
3173 mask= _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
3174_mm256_store_si256((__m256i*)mword_v, nwordA);
3179mask_tmp = _mm256_set1_epi32(blk[w_idx]);
3183mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
3184blk[mword_v[5]], blk[mword_v[4]],
3185blk[mword_v[3]], blk[mword_v[2]],
3186blk[mword_v[1]], blk[mword_v[0]]);
3191__m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
3192__m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
3193mask_0 = _mm256_sllv_epi32(mask1, shiftA);
3195mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
3196 if(!_mm256_testz_si256(mask_tmp, mask_tmp))
3198__m256i* target_ptr = (__m256i*)(
arr+base);
3200__m256i maskZ = _mm256_xor_si256(maskFF, maskFF);
3201mask1 = _mm256_slli_epi32(mask1, bit_idx);
3202mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ);
3203mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF);
3204mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
3205_mm256_storeu_si256 (target_ptr,
3206_mm256_or_si256 (mask_tmp,
3207_mm256_loadu_si256(target_ptr)));
3212 for(; k <
len; ++k)
3214 const unsignedbase = start + k;
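Per element, the gather/scatter loop above does something simple: mask the index down to a bit position inside the block, gather the 32-bit word that holds it, test the bit, and if it is set OR a single bit at bit_idx into the output slot. A scalar sketch of that step (the 64K-bit block size and 32-bit word layout are assumptions inferred from the masks used above):

// Scalar sketch of one gather/test/scatter step; names and the 64K-bit
// block assumption are illustrative, not taken from the header.
static void gather_scatter_one(unsigned* arr, const unsigned* blk,
                               const unsigned* idx, unsigned i,
                               unsigned bit_idx)
{
    unsigned nbit  = idx[i] & (65536u - 1u); // bit position inside the block (assumed 64K bits)
    unsigned nword = nbit >> 5;              // containing 32-bit word
    unsigned shift = nbit & 31u;             // bit inside that word
    if ((blk[nword] >> shift) & 1u)
        arr[i] |= (1u << bit_idx);           // scatter: set bit_idx in the result slot
}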
unsigned avx2_bit_to_gap(gap_word_t* dest, const unsigned* block,
                         unsigned dest_len)
{
    // ... (dest / block_end / pcurr setup elided in this fragment)
    unsigned bitval = (*block) & 1u;
    // ...
    unsigned bit_idx = 0;

    const unsigned vCAP = 64; // bits processed per 64-bit quantum
    __m256i maskZ = _mm256_set1_epi32(0);

    for (; block < block_end; block += 8) // 8 x 32-bit words = 256 bits per pass
    {
        // locate the first non-zero byte region of the vector
        __m256i accA = _mm256_load_si256((__m256i*)block);
        __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
        unsigned mask = ~_mm256_movemask_epi8(cmpA);
        // ...
        unsigned w64_idx = _tzcnt_u32(mask);
        // ...
        bit_idx += k * vCAP;
        // ...
        if (!val || val == ~0ull) // all-zero / all-one word: the run just continues
        {
            // ...
            bitval ^= unsigned(cmp);
            unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
            // ...
            pcurr = reinterpret_cast<gap_word_t*>(pcu);
            // ...
        }

        unsigned bits_consumed = 0;
        // ...
        if (bitval != (val & tz))
        {
            // ...
            BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
        }
        // run-length extraction via trailing-zero count
        tz = (unsigned)_tzcnt_u64(bitval ? ~val : val);
        // ...
        bool cmp = ((bits_consumed += tz) < vCAP);
        // ...
        bitval ^= unsigned(cmp);
        bit_idx += tz & (vCAP - bits_consumed);
        unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
        // ...
        pcurr = reinterpret_cast<gap_word_t*>(pcu);
        // ...
        BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
    }
    // ...
    unsigned len = (unsigned)(pcurr - dest);
    // ... (GAP header / length finalization elided)
}
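The conversion walks 64-bit words and uses _tzcnt_u64(bitval ? ~val : val) to measure how far the current run of equal bits continues, emitting one GAP coordinate per run boundary instead of testing bits one at a time. A scalar sketch of that run-length step for a single word (an illustrative helper; the real function also handles the block header, dest_len overflow, and the SIMD fast path for all-zero / all-one words):

#include <cstdint>

// returns 64 for w == 0, mirroring _tzcnt_u64 semantics
static unsigned tz64(uint64_t w) { return w ? (unsigned)__builtin_ctzll(w) : 64u; }

// Illustrative sketch, not the header's function. Emits run-end positions
// for one 64-bit word into a GAP-style list. 'bitval' is the value (0/1) of
// the run currently open, 'bit_idx' is the absolute index of the word's
// first bit. Assumes the caller has already handled the block's very first
// run (bit_idx > 0). Returns the run value left open afterwards.
static unsigned word_to_runs(uint64_t val, unsigned bitval, unsigned bit_idx,
                             unsigned short* out, unsigned& out_len)
{
    unsigned consumed = 0;
    while (consumed < 64)
    {
        // bits that differ from the current run value, starting at 'consumed'
        uint64_t rest = (bitval ? ~val : val) >> consumed;
        unsigned tz = tz64(rest);           // how far the current run continues
        consumed += tz;
        if (consumed >= 64)
            break;                          // run continues into the next word
        out[out_len++] = (unsigned short)(bit_idx + consumed - 1); // run ends here
        bitval ^= 1u;                       // the next run has the opposite value
    }
    return bitval;
}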
void avx2_bit_block_xor(bm::word_t* target_block, const bm::word_t* block,
                        const bm::word_t* xor_block, bm::id64_t digest)
{
    // ... (digest-driven loop over waves; 'off' is the word offset of the wave)
    const __m256i* sub_block   = (const __m256i*)(block + off);
    __m256i*       t_sub_block = (__m256i*)(target_block + off);

    // digest bit set: XOR this stride against the xor_block
    const __m256i* xor_sub_block = (const __m256i*)(xor_block + off);
    __m256i mA, mB, mC, mD;
    mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
                          _mm256_load_si256(xor_sub_block));
    mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
                          _mm256_load_si256(xor_sub_block+1));
    mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
                          _mm256_load_si256(xor_sub_block+2));
    mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
                          _mm256_load_si256(xor_sub_block+3));

    _mm256_store_si256(t_sub_block,   mA);
    _mm256_store_si256(t_sub_block+1, mB);
    _mm256_store_si256(t_sub_block+2, mC);
    _mm256_store_si256(t_sub_block+3, mD);

    // digest bit clear: plain copy of the stride
    _mm256_store_si256(t_sub_block,   _mm256_load_si256(sub_block));
    _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
    _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
    _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
    // ...
}
void avx2_bit_block_xor_2way(bm::word_t* target_block,
                             const bm::word_t* xor_block,
                             bm::id64_t digest) noexcept
{
    // ... (digest-driven loop over waves; 'off' is the word offset of the wave)
    const __m256i* sub_block   = (const __m256i*)(xor_block + off);
    __m256i*       t_sub_block = (__m256i*)(target_block + off);

    __m256i mA, mB, mC, mD;
    mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
                          _mm256_load_si256(t_sub_block));
    mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
                          _mm256_load_si256(t_sub_block+1));
    mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
                          _mm256_load_si256(t_sub_block+2));
    mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
                          _mm256_load_si256(t_sub_block+3));

    _mm256_store_si256(t_sub_block,   mA);
    _mm256_store_si256(t_sub_block+1, mB);
    _mm256_store_si256(t_sub_block+2, mC);
    _mm256_store_si256(t_sub_block+3, mD);
    // ...
}
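Both XOR kernels are driven by a 64-bit digest: each set bit marks one "wave" (a fixed stride of words) that actually needs work, so cleared bits let whole strides be skipped or copied as-is. A hedged sketch of that iteration pattern (the wave size of 32 words is an assumption for illustration; digest &= digest - 1 is the BLSR step that bmi_bslr_u64 in this header wraps):

#include <cstdint>

// Sketch: visit only the waves whose bit is set in 'digest'.
// Assumes each wave covers wave_size 32-bit words of the bit-block.
static void for_each_digest_wave(uint64_t digest, unsigned wave_size /* e.g. 32 */)
{
    while (digest)
    {
        unsigned wave = (unsigned)__builtin_ctzll(digest); // index of the lowest set bit
        unsigned off  = wave * wave_size;                  // word offset of this wave
        // ... process words [off, off + wave_size) here ...
        (void)off;
        digest &= digest - 1;                              // clear the lowest set bit (BLSR)
    }
}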
#pragma GCC diagnostic pop

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    avx2_bit_count((__m256i*) (first), (__m256i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_OR(first, last, mask) \
    avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_XOR(first, last, mask) \
    avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_SUB(first, last, mask) \
    avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_INVERT_BLOCK(first) \
    avx2_invert_block((__m256i*)first);

#define VECT_AND_BLOCK(dst, src) \
    avx2_and_block((__m256i*) dst, (const __m256i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    avx2_and_digest((__m256i*) dst, (const __m256i*) (src))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))

#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
    avx2_and_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    avx2_or_block((__m256i*) dst, (__m256i*) (src))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    avx2_sub_block((__m256i*) dst, (__m256i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    avx2_sub_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))

#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
    avx2_sub_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    avx2_xor_block((__m256i*) dst, (__m256i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    avx2_copy_block((__m256i*) dst, (__m256i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    avx2_stream_block((__m256i*) dst, (__m256i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    avx2_set_block((__m256i*) dst, (value))

#define VECT_IS_ZERO_BLOCK(dst) \
    avx2_is_all_zero((__m256i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    avx2_is_all_one((__m256i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    avx2_is_digest_zero((__m256i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    avx2_block_set_digest((__m256i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    avx2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    avx2_shift_l1((__m256i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    avx2_shift_r1((__m256i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    avx2_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    avx2_set_block_bits3(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    avx2_bit_block_calc_change((__m256i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
    avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)

#define VECT_BIT_TO_GAP(dest, src, dest_len) \
    avx2_bit_to_gap(dest, src, dest_len)

#define VECT_BIT_FIND_FIRST(src1, off, pos) \
    avx2_bit_find_first((__m256i*) src1, off, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    avx2_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    avx2_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    avx2_gap_bfind(buf, pos, is_set)

#define VECT_GAP_TEST(buf, pos) \
    avx2_gap_test(buf, pos)

#define VECT_BIT_COUNT_DIGEST(blk, d) \
    avx2_bit_block_count(blk, d)
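These VECT_* macros form the dispatch layer: portable block algorithms call the generic macro names, and in an AVX2 build they resolve to the avx2_* kernels defined above. A usage sketch, assuming this header (bmavx2.h in BitMagic) and its prerequisites are included; the block buffers below are illustrative stand-ins for the library's own block allocation:

// Usage sketch only: assumes the VECT_* macros above are visible.
#include <cstdint>

// Illustrative stand-ins: a bit-block here is 2048 x 32-bit words, 32-byte aligned.
alignas(32) static uint32_t dst_block[2048];
alignas(32) static uint32_t src_block[2048];

void xor_in_place_example()
{
    // Expands to avx2_xor_block((__m256i*) dst_block, (__m256i*) (src_block)),
    // i.e. dst_block ^= src_block over the whole block.
    VECT_XOR_BLOCK(dst_block, src_block);
}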
#define BM_AVX2_POPCNT_PROLOG
#define BM_CSA256(h, l, a, b, c)
#define BM_AVX2_BIT_COUNT(ret, v)
Bit manipulation primitives (internal)
void avx2_copy_block(__m256i *dst, const __m256i *src)
AVX2 block copy dst = *src.
unsigned avx2_and_block(__m256i *dst, const __m256i *src)
AND array elements against another array dst &= *src.
void avx2_xor_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)
XOR array elements to specified mask dst = *src ^ mask.
bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if two waves of pointers are all NULL
void avx2_bit_block_calc_change_bc(const __m256i *block, unsigned *gcount, unsigned *bcount)
unsigned avx2_gap_test(const unsigned short *buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
bool avx2_is_all_one(const __m256i *block)
check if block is all one bits
bool avx2_and_or_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND-OR block digest stride 2 way dst |= *src1 & *src2.
bool avx2_or_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
OR array elements against another unaligned array dst |= *src.
bool avx2_or_block_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
OR array elements against 2 other arrays dst |= *src1 | *src2.
bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if two waves of pointers are all the same (NULL or FULL)
unsigned avx2_and_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
AND array elements against another array (unaligned) dst &= *src.
bool avx2_and_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND block digest stride 2 way dst = *src1 & *src2.
bool avx2_or_block(__m256i *dst, const __m256i *src)
OR array elements against another array dst |= *src.
unsigned avx2_xor_block(__m256i *dst, const __m256i *src)
XOR block against another dst ^= *src.
bool avx2_sub_digest(__m256i *dst, const __m256i *src)
SUB (AND NOT) block digest stride dst &= ~*src.
bm::id_t avx2_bit_count_sub(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND NOT bit count for two aligned bit-blocks.
unsigned avx2_gap_bfind(const unsigned short *buf, unsigned pos, unsigned *is_set)
Hybrid binary search, starts as binary, then switches to scan.
unsigned avx2_bit_to_gap(gap_word_t *dest, const unsigned *block, unsigned dest_len)
Convert bit block to GAP block.
bool avx2_sub_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
SUB block digest stride.
bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift right by 1
bool avx2_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
unsigned avx2_bit_block_calc_change(const __m256i *block, unsigned size)
bool avx2_sub_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
bool avx2_sub_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
SUB block digest stride.
void avx2_andnot_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)
Inverts array elements and NOT them to specified mask dst = ~*src & mask.
void avx2_block_set_digest(__m256i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
bm::id_t avx2_bit_count_xor(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
XOR bit count for two aligned bit-blocks.
bm::id_t avx2_bit_count(const __m256i *block, const __m256i *block_end)
AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 Instructions".
bm::id_t avx2_bit_count_and(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND bit count for two aligned bit-blocks.
void avx2_stream_block(__m256i *dst, const __m256i *src)
AVX2 block copy dst = *src.
void avx2_copy_block_unalign(__m256i *dst, const __m256i *src)
AVX2 block copy (unaligned SRC) dst = *src.
void avx2_invert_block(__m256i *dst)
Invert bit-block dst = ~*dst (implemented as dst ^= 0xFF..).
void avx2_bit_block_calc_xor_change(const __m256i *block, const __m256i *xor_block, unsigned size, unsigned *gcount, unsigned *bcount)
bool avx2_shift_r1_and(__m256i *block, bm::word_t co1, const __m256i *mask_block, bm::id64_t *digest)
fused block shift right by 1 plus AND
bool avx2_bit_find_first_diff(const __m256i *block1, const __m256i *block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)
Calculate population count based on digest.
bool avx2_and_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND block digest stride.
bool avx2_and_digest(__m256i *dst, const __m256i *src)
AND block digest stride dst &= *src.
unsigned avx2_sub_block(__m256i *dst, const __m256i *src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
void avx2_set_block(__m256i *dst, bm::word_t value)
AVX2 block memset dst = value.
bool avx2_test_all_one_wave(const void *ptr)
check if wave of pointers is all 0xFF..
void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
bool avx2_or_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
OR 2 arrays and copy to the destination dst = *src1 | *src2.
unsigned avx2_xor_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
3-operand XOR dst = *src1 ^ *src2
bool avx2_is_digest_zero(const __m256i *block)
check if digest stride is all zero bits
bool avx2_bit_find_first(const __m256i *block, unsigned off, unsigned *pos)
Find first bit set.
void avx2_stream_block_unalign(__m256i *dst, const __m256i *src)
AVX2 block copy (unaligned SRC) dst = *src.
bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift left by 1
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
Experimental (test) function to do SIMD vector search in sorted, growing array.
int avx2_cmpge_u32(__m256i vect8, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array.
bool avx2_or_block_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
OR array elements against 4 other arrays dst |= *src1 | *src2 | *src3 | *src4.
bool avx2_is_all_zero(const __m256i *block)
check if block is all zero bits
bool avx2_and_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
AND block digest stride.
unsigned avx2_lower_bound_scan_u32(const unsigned *arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
const unsigned set_block_digest_wave_size
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
__m256i avx2_setbit_to256(unsigned i)
Experimental.
const unsigned set_block_mask
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
bm::id_t avx2_bit_count_or(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
void avx2_set_block_bits2(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
void avx2_set_block_bits(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
void avx2_set_block_bits3(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
const unsigned set_word_shift
const unsigned set_block_size
unsigned long long int id64_t
const unsigned block_waves
__m256i avx2_setbit_256(__m256i target, __m256i source)
Set bits in an AVX target, by indexes (int4) from the source.
unsigned short gap_word_t
void avx2_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx)
const unsigned gap_max_bits
const unsigned set_block_shift
const unsigned set_word_mask
unsigned long long bmi_blsi_u64(unsigned long long w)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
MACRO for shuffle parameter for _mm_shuffle_ps().