__mmask16 m16F = __mmask16(~0u);
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(m, _mm512_set1_epi64(0ull));
return (eq_m == m16F);
const __mmask16 m16F = __mmask16(~0u);
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(m, _mm512_set1_epi64(-1));
return (eq_m == m16F);
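
// Editorial sketch (not part of this header): both helpers above reduce a 512-bit
// compare to a 16-bit lane mask; _mm512_cmpeq_epi32_mask sets one bit per equal
// 32-bit lane, so "all zero" / "all one" is simply mask == 0xFFFF. A scalar
// equivalent of that test, for reference:
#include <cstdint>

static bool all_lanes_equal(const uint32_t lanes[16], uint32_t value)
{
    uint16_t eq_m = 0;
    for (int i = 0; i < 16; ++i)
        eq_m |= uint16_t(lanes[i] == value) << i;  // mirrors _mm512_cmpeq_epi32_mask
    return eq_m == 0xFFFFu;                        // same check as (eq_m == m16F)
}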
#define BM_CSA256(h, l, a, b, c) \
{ \
    __m256i u = _mm256_xor_si256(a, b); \
    h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
    l = _mm256_xor_si256(u, c); \
}

#define BM_AVX2_BIT_COUNT(ret, v) \
{ \
    __m256i lo = _mm256_and_si256(v, low_mask); \
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
    __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
    __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
    ret = _mm256_sad_epu8(cnt1, cnt2); \
}

#define BM_AVX2_DECL_LOOKUP1 \
    __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
                                       4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);

#define BM_AVX2_DECL_LOOKUP2 \
    __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
                                       4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);

#define BM_AVX2_POPCNT_PROLOG \
    BM_AVX2_DECL_LOOKUP1 \
    BM_AVX2_DECL_LOOKUP2 \
    __m256i low_mask = _mm256_set1_epi8(0x0f);

__m256i cnt = _mm256_setzero_si256();
__m256i ones = _mm256_setzero_si256();
__m256i twos = _mm256_setzero_si256();
__m256i fours = _mm256_setzero_si256();
__m256i eights = _mm256_setzero_si256();
__m256i sixteens = _mm256_setzero_si256();
__m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
BM_CSA256(foursA, twos, twos, twosA, twosB);
b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
BM_CSA256(foursB, twos, twos, twosA, twosB);
BM_CSA256(eightsA, fours, fours, foursA, foursB);
b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
BM_CSA256(foursA, twos, twos, twosA, twosB);
b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
BM_CSA256(foursB, twos, twos, twosA, twosB);
BM_CSA256(eightsB, fours, fours, foursA, foursB);
BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
cnt = _mm256_add_epi64(cnt, bc);
}
while (block < block_end);

cnt = _mm256_slli_epi64(cnt, 4);
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
cnt = _mm256_add_epi64(cnt, bc);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
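
// Editorial sketch (not part of this header): the Harley-Seal loop above folds
// sixteen 256-bit vectors per iteration through carry-save adders, and the tail
// weights the partial counters (sixteens*16, eights*8, fours*4, twos*2, ones).
// A plain scalar popcount is a convenient cross-check; __builtin_popcount
// assumes GCC/Clang.
#include <cstdint>

static unsigned scalar_bit_count(const uint32_t* first, const uint32_t* last)
{
    unsigned cnt = 0;
    for (; first != last; ++first)
        cnt += unsigned(__builtin_popcount(*first));   // same total as avx2_bit_count
    return cnt;
}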
__m256i cnt = _mm256_setzero_si256();

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);
}
while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
__m256i cnt = _mm256_setzero_si256();

__m256i tmp0 = _mm256_load_si256(block);
__m256i tmp1 = _mm256_load_si256(mask_block);

cnt = _mm256_add_epi64(cnt, bc);

++block; ++mask_block;
}
while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
__m256i cnt = _mm256_setzero_si256();
__m256i mA, mB, mC, mD;

mA = _mm256_xor_si256(_mm256_load_si256(block+0),
                      _mm256_load_si256(mask_block+0));
cnt = _mm256_add_epi64(cnt, bc);

mB = _mm256_xor_si256(_mm256_load_si256(block+1),
                      _mm256_load_si256(mask_block+1));
cnt = _mm256_add_epi64(cnt, bc);

mC = _mm256_xor_si256(_mm256_load_si256(block+2),
                      _mm256_load_si256(mask_block+2));
cnt = _mm256_add_epi64(cnt, bc);

mD = _mm256_xor_si256(_mm256_load_si256(block+3),
                      _mm256_load_si256(mask_block+3));
cnt = _mm256_add_epi64(cnt, bc);

block += 4; mask_block += 4;
}
while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
__m256i cnt = _mm256_setzero_si256();

__m256i tmp0 = _mm256_load_si256(block);
__m256i tmp1 = _mm256_load_si256(mask_block);

cnt = _mm256_add_epi64(cnt, bc);

++block; ++mask_block;
}
while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
__m512i yM = _mm512_set1_epi32(int(mask));

_mm512_store_si512(dst+0, _mm512_xor_si512(_mm512_load_si512(src+0), yM));
_mm512_store_si512(dst+1, _mm512_xor_si512(_mm512_load_si512(src+1), yM));
_mm512_store_si512(dst+2, _mm512_xor_si512(_mm512_load_si512(src+2), yM));
_mm512_store_si512(dst+3, _mm512_xor_si512(_mm512_load_si512(src+3), yM));
}
while (src < src_end);
__m512i yM = _mm512_set1_epi32(int(mask));

_mm512_store_si512(dst+0, _mm512_andnot_si512(_mm512_load_si512(src+0), yM));
_mm512_store_si512(dst+1, _mm512_andnot_si512(_mm512_load_si512(src+1), yM));
_mm512_store_si512(dst+2, _mm512_andnot_si512(_mm512_load_si512(src+2), yM));
_mm512_store_si512(dst+3, _mm512_andnot_si512(_mm512_load_si512(src+3), yM));
}
while (src < src_end);
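
// Editorial note (a sketch, not library code): _mm512_andnot_si512(a, b) computes
// (~a) & b, i.e. the FIRST operand is the one that gets inverted. That is why the
// mask routine above passes the source first (~*src & mask), and why the SUB
// kernels further down also pass src first to obtain dst & ~src.
#include <immintrin.h>

static inline __m512i andnot_demo(__m512i src_w, __m512i mask_w)
{
    return _mm512_andnot_si512(src_w, mask_w);   // (~src_w) & mask_w
}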
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;

accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_and_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_and_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_and_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_and_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
}
while (src < src_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
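
// Editorial sketch (not part of this header): the accA..accD registers OR together
// every AND result, so "did any bit survive?" is answered once after the loop
// instead of once per stride. The same idea in scalar form:
#include <cstdint>

static bool and_block_scalar(uint32_t* dst, const uint32_t* src, unsigned n)
{
    uint32_t acc = 0;
    for (unsigned i = 0; i < n; ++i)
    {
        dst[i] &= src[i];
        acc |= dst[i];        // accumulate; emptiness is tested once at the end
    }
    return acc != 0;          // true if the AND product is not all-zero
}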
m1A = _mm512_and_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_and_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);

m1A = _mm512_or_si512(m1A, m1B);
m1A = _mm512_and_si512(_mm512_load_si512(src1+0), _mm512_load_si512(src2+0));
m1B = _mm512_and_si512(_mm512_load_si512(src1+1), _mm512_load_si512(src2+1));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);

m1A = _mm512_or_si512(m1A, m1B);
__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
__m256i accA, accB, accC, accD;

accA = _mm256_setzero_si256();
accB = _mm256_setzero_si256();
accC = _mm256_setzero_si256();
accD = _mm256_setzero_si256();

m1A = _mm256_loadu_si256(src+0);
m2A = _mm256_load_si256(dst+0);
m1A = _mm256_and_si256(m1A, m2A);
_mm256_store_si256(dst+0, m1A);
accA = _mm256_or_si256(accA, m1A);

m1B = _mm256_loadu_si256(src+1);
m2B = _mm256_load_si256(dst+1);
m1B = _mm256_and_si256(m1B, m2B);
_mm256_store_si256(dst+1, m1B);
accB = _mm256_or_si256(accB, m1B);

m1C = _mm256_loadu_si256(src+2);
m2C = _mm256_load_si256(dst+2);
m1C = _mm256_and_si256(m1C, m2C);
_mm256_store_si256(dst+2, m1C);
accC = _mm256_or_si256(accC, m1C);

m1D = _mm256_loadu_si256(src+3);
m2D = _mm256_load_si256(dst+3);
m1D = _mm256_and_si256(m1D, m2D);
_mm256_store_si256(dst+3, m1D);
accD = _mm256_or_si256(accD, m1D);
}
while (src < src_end);

accA = _mm256_or_si256(accA, accB);
accC = _mm256_or_si256(accC, accD);
accA = _mm256_or_si256(accA, accC);

return !_mm256_testz_si256(accA, accA);
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src), _mm512_load_si512(dst));
m1B = _mm512_or_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

_mm512_stream_si512(dst, m1A);
_mm512_stream_si512(dst+1, m1B);

m1C = _mm512_or_si512(_mm512_load_si512(src2), _mm512_load_si512(dst2));
m1D = _mm512_or_si512(_mm512_load_si512(src2+1), _mm512_load_si512(dst2+1));
mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);

_mm512_stream_si512(dst2, m1C);
_mm512_stream_si512(dst2+1, m1D);

src2 += 2; dst2 += 2;
}
while (src2 < src_end);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
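
// Editorial sketch (not part of this header): the OR kernels track the opposite
// property from the AND kernels. mAccF0/mAccF1 start as all-ones and are AND-ed
// with every result, so they remain all-ones only if the whole block saturated.
// (The _mm512_stream_si512 stores are non-temporal: results are written around
// the cache.) Scalar form of the all-ones tracking:
#include <cstdint>

static bool or_block_scalar(uint32_t* dst, const uint32_t* src, unsigned n)
{
    uint32_t acc_ff = ~0u;
    for (unsigned i = 0; i < n; ++i)
    {
        dst[i] |= src[i];
        acc_ff &= dst[i];     // drops below ~0u as soon as one word is not all-ones
    }
    return acc_ff == ~0u;     // block became (or already was) all-ones
}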
__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
__m256i mAccF0 = _mm256_set1_epi32(~0u);
__m256i mAccF1 = _mm256_set1_epi32(~0u);

m1A = _mm256_loadu_si256(src+0);
m2A = _mm256_load_si256(dst+0);
m1A = _mm256_or_si256(m1A, m2A);
_mm256_store_si256(dst+0, m1A);

m1B = _mm256_loadu_si256(src+1);
m2B = _mm256_load_si256(dst+1);
m1B = _mm256_or_si256(m1B, m2B);
_mm256_store_si256(dst+1, m1B);

m1C = _mm256_loadu_si256(src+2);
m2C = _mm256_load_si256(dst+2);
m1C = _mm256_or_si256(m1C, m2C);
_mm256_store_si256(dst+2, m1C);

m1D = _mm256_loadu_si256(src+3);
m2D = _mm256_load_si256(dst+3);
m1D = _mm256_or_si256(m1D, m2D);
_mm256_store_si256(dst+3, m1D);

mAccF1 = _mm256_and_si256(mAccF1, m1C);
mAccF1 = _mm256_and_si256(mAccF1, m1D);
mAccF0 = _mm256_and_si256(mAccF0, m1A);
mAccF0 = _mm256_and_si256(mAccF0, m1B);
}
while (src < src_end);

__m256i maskF = _mm256_set1_epi32(~0u);
mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
__m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
return (maskA == ~0u);
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;

mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(src2+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

src1 += 4; src2 += 4; dst += 4;
}
while (src1 < src_end1);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;

mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(dst+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(dst+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(dst+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(dst+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src2+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

src1 += 4; src2 += 4; dst += 4;
}
while (src1 < src_end1);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(dst+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(dst+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(dst+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(dst+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src2+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src3+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src3+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src3+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src3+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src4+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src4+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src4+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src4+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

src1 += 4; src2 += 4;
src3 += 4; src4 += 4;
}
while (src1 < src_end1);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;

accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_xor_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_xor_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_xor_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_xor_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
}
while (src < src_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
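
// Editorial sketch (not part of this header): for XOR the accumulator trick doubles
// as a difference detector: the OR of all dst^src words is zero exactly when the
// two blocks were identical. Scalar form:
#include <cstdint>

static bool xor_block_scalar(uint32_t* dst, const uint32_t* src, unsigned n)
{
    uint32_t acc = 0;
    for (unsigned i = 0; i < n; ++i)
    {
        dst[i] ^= src[i];
        acc |= dst[i];        // non-zero iff the blocks differed somewhere
    }
    return acc != 0;
}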
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;

accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_xor_si512(_mm512_load_si512(src1 + 0), _mm512_load_si512(src2 + 0));
m1B = _mm512_xor_si512(_mm512_load_si512(src1 + 1), _mm512_load_si512(src2 + 1));
m1C = _mm512_xor_si512(_mm512_load_si512(src1 + 2), _mm512_load_si512(src2 + 2));
m1D = _mm512_xor_si512(_mm512_load_si512(src1 + 3), _mm512_load_si512(src2 + 3));

_mm512_store_si512(dst + 0, m1A);
_mm512_store_si512(dst + 1, m1B);
_mm512_store_si512(dst + 2, m1C);
_mm512_store_si512(dst + 3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);

src1 += 4; src2 += 4; dst += 4;
}
while (src1 < src1_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;

accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_andnot_si512(_mm512_load_si512(src), _mm512_load_si512(dst));
m1B = _mm512_andnot_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_andnot_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_andnot_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
}
while (src < src_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
m1A = _mm512_andnot_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_andnot_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);

m1A = _mm512_or_si512(m1A, m1B);
__m512i zmm0 = _mm512_set1_epi32(int(value));

_mm512_store_si512(dst, zmm0);
_mm512_store_si512(dst+1, zmm0);
_mm512_store_si512(dst+2, zmm0);
_mm512_store_si512(dst+3, zmm0);
}
while (dst < dst_end);
__m512i ymm0, ymm1, ymm2, ymm3;

ymm0 = _mm512_load_si512(src+0);
ymm1 = _mm512_load_si512(src+1);
ymm2 = _mm512_load_si512(src+2);
ymm3 = _mm512_load_si512(src+3);

_mm512_store_si512(dst+0, ymm0);
_mm512_store_si512(dst+1, ymm1);
_mm512_store_si512(dst+2, ymm2);
_mm512_store_si512(dst+3, ymm3);
}
while (src < src_end);
__m512i maskFF = _mm512_set1_epi64(-1);

ymm0 = _mm512_xor_si512(_mm512_load_si512(dst+0), maskFF);
ymm1 = _mm512_xor_si512(_mm512_load_si512(dst+1), maskFF);

_mm512_store_si512(dst+0, ymm0);
_mm512_store_si512(dst+1, ymm1);

ymm0 = _mm512_xor_si512(_mm512_load_si512(dst+2), maskFF);
ymm1 = _mm512_xor_si512(_mm512_load_si512(dst+3), maskFF);

_mm512_store_si512(dst+2, ymm0);
_mm512_store_si512(dst+3, ymm1);
}
while (dst < dst_end);
__m512i w0 = _mm512_load_si512(block+0);
__m512i w1 = _mm512_load_si512(block+1);

__m512i wA = _mm512_or_si512(w0, w1);

__m512i w2 = _mm512_load_si512(block+2);
__m512i w3 = _mm512_load_si512(block+3);

__m512i wB = _mm512_or_si512(w2, w3);
wA = _mm512_or_si512(wA, wB);
}
while (block < block_end);
_mm512_or_si512(_mm512_load_si512(block+0), _mm512_load_si512(block+1));
const __mmask16 m16F = __mmask16(~0u);

__m512i maskF = _mm512_set1_epi64(-1);

__mmask16 eq_m = _mm512_cmpeq_epi32_mask(_mm512_load_si512(block), maskF);

eq_m = _mm512_cmpeq_epi32_mask(_mm512_load_si512(block+1), maskF);
}
while (block < block_end);
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
return _mm256_testz_si256(w0, w0);
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
w0 = _mm256_or_si256(w0, w1);
return _mm256_testz_si256(w0, w0);
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
w0 = _mm256_xor_si256(w0, w1);
return _mm256_testz_si256(w0, w0);
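
// Editorial sketch (not part of this header): a "wave" here is a 32-byte window of
// the block-pointer array, i.e. four 64-bit pointers tested in one register (the
// 4-pointer width is an assumption based on the 256-bit loads above). Scalar
// meaning of the three wave tests:
#include <cstdint>

static bool wave_all_null(const uintptr_t p[4])
{
    return (p[0] | p[1] | p[2] | p[3]) == 0;            // avx2_test_all_zero_wave
}
static bool waves_all_null(const uintptr_t a[4], const uintptr_t b[4])
{
    return wave_all_null(a) && wave_all_null(b);        // avx2_test_all_zero_wave2
}
static bool waves_equal(const uintptr_t a[4], const uintptr_t b[4])
{
    return ((a[0] ^ b[0]) | (a[1] ^ b[1]) |
            (a[2] ^ b[2]) | (a[3] ^ b[3])) == 0;        // avx2_test_all_eq_wave2
}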
unsigned count = (unsigned)(block_end - block)*4;

const int w_shift = sizeof(w0) * 8 - 1;
bool first_word = true;

count -= (w_prev = (w0 >> w_shift));

first_word = false;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;
}
while (++block < block_end);
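
// Editorial sketch (not part of this header): the count -= !(w_prev ^ (w0 & 1))
// lines subtract one whenever the last bit of the previous word equals the first
// bit of the next word, i.e. whenever the word seam is not a transition. Inside a
// single 32-bit word the number of adjacent-bit transitions has a closed form:
#include <cstdint>

static unsigned word_transitions(uint32_t w)
{
    // bit i of (w ^ (w >> 1)) is set iff bits i and i+1 of w differ;
    // mask bit 31 so only the 31 pairs inside the word are counted
    return unsigned(__builtin_popcount((w ^ (w >> 1)) & 0x7FFFFFFFu));
}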
unsigned avx_vect_waves,

__m256i xcnt = _mm256_setzero_si256();

for (unsigned i = 0; i < avx_vect_waves; ++i)
    __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
    __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
    __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
    xcnt = _mm256_add_epi16(xcnt, ymm_s2);

xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);

xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
__m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));
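
// Editorial sketch (not part of this header): the final lines fold the 16-bit
// lanes down to one value. _mm256_bsrli_epi128 shifts within each 128-bit half,
// which is why the two halves must still be extracted and added with _mm_add_epi16
// at the end. A standalone horizontal sum of sixteen 16-bit lanes using the same
// folding pattern (the routine above additionally differences adjacent GAP
// coordinates first; the sum wraps mod 2^16 like a gap_word_t):
#include <immintrin.h>
#include <cstdint>

static uint16_t hsum_epi16(__m256i v)
{
    __m128i s = _mm_add_epi16(_mm256_extracti128_si256(v, 0),
                              _mm256_extracti128_si256(v, 1));  // fold the 128-bit halves
    s = _mm_add_epi16(s, _mm_srli_si128(s, 8));                 // 8 lanes -> 4
    s = _mm_add_epi16(s, _mm_srli_si128(s, 4));                 // 4 -> 2
    s = _mm_add_epi16(s, _mm_srli_si128(s, 2));                 // 2 -> 1
    return (uint16_t)_mm_cvtsi128_si32(s);                      // low lane holds the sum
}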
unsigned nb,
unsigned start)

const unsigned unroll_factor = 16;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);

__m256i nbM = _mm256_set1_epi32(int(nb));

for (k = 0; k < len_unr; k += unroll_factor)
    __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
    __m256i wcmpA = _mm256_cmpeq_epi8(nbM, nbA);
    if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
    __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
    __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
    if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))

for (; k < len; ++k)
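
// Editorial sketch (not part of this header): the unrolled probe above compares
// eight per-index block numbers (nbA, nbB) against nb at once and leaves the
// vector loop as soon as any lane disagrees; the scalar tail finishes the rest.
// The 8-lane "all equal" test in isolation:
#include <immintrin.h>

static bool all_lanes_are_nb(__m256i block_numbers, unsigned nb)
{
    __m256i eq = _mm256_cmpeq_epi32(block_numbers, _mm256_set1_epi32(int(nb)));
    return unsigned(_mm256_movemask_epi8(eq)) == ~0u;  // all 32 byte-mask bits set
}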
const unsigned unroll_factor = 8;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);

__m256i maskFF = _mm256_set1_epi32(~0u);

__m256i mask_tmp, mask_0;

unsigned k = 0, mask, w_idx;
for (; k < len_unr; k += unroll_factor)
    __m256i nbitA, nwordA;
    const unsigned base = start + k;
    __m256i* idx_ptr = (__m256i*)(idx+base);

    nbitA = _mm256_and_si256(_mm256_loadu_si256(idx_ptr), sb_mask);

    mask_tmp = _mm256_shuffle_epi32(nwordA, _MM_SHUFFLE(1,1,1,1));
    mask_tmp = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
    mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
    _mm256_store_si256((__m256i*)mword_v, nwordA);

    mask_tmp = _mm256_set1_epi32(blk[w_idx]);

    mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
                                blk[mword_v[5]], blk[mword_v[4]],
                                blk[mword_v[3]], blk[mword_v[2]],
                                blk[mword_v[1]], blk[mword_v[0]]);

    __m256i shiftA = _mm256_and_si256(nbitA, sw_mask);
    __m256i mask1 = _mm256_srli_epi32(maskFF, 31);
    mask_0 = _mm256_sllv_epi32(mask1, shiftA);

    mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
    if (!_mm256_testz_si256(mask_tmp, mask_tmp))
        __m256i* target_ptr = (__m256i*)(arr+base);

        __m256i maskZ = _mm256_xor_si256(maskFF, maskFF);
        mask1 = _mm256_slli_epi32(mask1, bit_idx);
        mask_tmp = _mm256_cmpeq_epi32(mask_tmp, maskZ);
        mask_tmp = _mm256_xor_si256(mask_tmp, maskFF);
        mask_tmp = _mm256_and_si256(mask_tmp, mask1);
        _mm256_storeu_si256(target_ptr,
                            _mm256_or_si256(mask_tmp,
                                            _mm256_loadu_si256(target_ptr)));

for (; k < len; ++k)
    const unsigned base = start + k;
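
// Editorial sketch (not part of this header): scalar meaning of the
// gather/test/scatter step above. The constants 31 and the low-16-bit mask stand in
// for bm::set_word_mask / bm::set_block_mask and are assumptions for this sketch.
#include <cstdint>

static void gather_scatter_scalar(unsigned* arr, const unsigned* blk,
                                  const unsigned* idx, unsigned size,
                                  unsigned start, unsigned bit_idx)
{
    for (unsigned k = start; k < size; ++k)
    {
        unsigned nbit   = idx[k] & 0xFFFFu;          // bit offset inside the block
        unsigned nword  = nbit >> 5;                 // 32-bit word holding that bit
        unsigned is_set = (blk[nword] >> (nbit & 31u)) & 1u;
        arr[k] |= (is_set << bit_idx);               // scatter the test result into arr
    }
}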
#pragma GCC diagnostic pop

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    avx512_xor_arr_2_mask((__m512i*)(dst), (__m512i*)(src), (__m512i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    avx512_andnot_arr_2_mask((__m512i*)(dst), (__m512i*)(src), (__m512i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    avx2_bit_count((__m256i*) (first), (__m256i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_OR(first, last, mask) \
    avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_XOR(first, last, mask) \
    avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_SUB(first, last, mask) \
    avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_INVERT_BLOCK(first) \
    avx512_invert_block((__m512i*)first);

#define VECT_AND_BLOCK(dst, src) \
    avx512_and_block((__m512i*) dst, (const __m512i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    avx512_and_digest((__m512i*) dst, (const __m512i*) (src))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    avx512_and_digest_2way((__m512i*) dst, (const __m512i*) (src1), (const __m512i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    avx512_or_block((__m512i*) dst, (__m512i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    avx512_or_block_2way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    avx512_or_block_3way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    avx512_or_block_5way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2), (__m512i*) (src3), (__m512i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    avx512_sub_block((__m512i*) dst, (__m512i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    avx512_sub_digest((__m512i*) dst, (const __m512i*) (src))

#define VECT_XOR_BLOCK(dst, src) \
    avx512_xor_block((__m512i*) dst, (__m512i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    avx512_xor_block_2way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    avx512_copy_block((__m512i*) dst, (__m512i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    avx512_set_block((__m512i*) dst, (value))

#define VECT_IS_ZERO_BLOCK(dst) \
    avx512_is_all_zero((__m512i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    avx512_is_all_one((__m512i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    avx512_is_digest_zero((__m512i*)start)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    avx2_idx_arr_block_lookup(idx, size, nb, start)
#define BM_AVX2_POPCNT_PROLOG
#define BM_CSA256(h, l, a, b, c)
#define BM_AVX2_BIT_COUNT(ret, v)
bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
bool avx2_or_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
OR array elements against another unaligned array dst |= *src.
bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all the same (NULL or FULL)
unsigned avx2_and_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
AND array elements against another array (unaligned) dst &= *src.
bm::id_t avx2_bit_count_sub(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND NOT bit count for two aligned bit-blocks.
bool avx2_test_all_zero_wave(const void *ptr)
check if a wave of pointers is all NULL
bm::id_t avx2_bit_count_xor(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
XOR bit count for two aligned bit-blocks.
bm::id_t avx2_bit_count(const __m256i *block, const __m256i *block_end)
AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 In...
bm::id_t avx2_bit_count_and(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND bit count for two aligned bit-blocks.
void avx512_copy_block(__m512i *dst, const __m512i *src)
block copy dst = *src
void avx512_xor_arr_2_mask(__m512i *dst, const __m512i *src, const __m512i *src_end, bm::word_t mask)
XOR array elements with the specified mask: dst = *src ^ mask.
bool avx512_or_block_5way(__m512i *dst, const __m512i *src1, const __m512i *src2, const __m512i *src3, const __m512i *src4)
OR array elements against 4 other arrays: dst |= *src1 | *src2 | *src3 | *src4.
bool avx512_test_zero(__m512i m)
bool avx512_and_digest_2way(__m512i *dst, const __m512i *src1, const __m512i *src2)
AND block digest stride 2 way dst = *src1 & *src2.
bool avx512_or_block(__m512i *dst, const __m512i *src)
OR array elements against another array dst |= *src.
bool avx512_or_block_2way(__m512i *dst, const __m512i *src1, const __m512i *src2)
OR 2 blocks and copy to destination: dst = *src1 | *src2.
void avx512_invert_block(__m512i *dst)
Invert bit-block: dst = ~*dst (XOR with an all-ones mask).
void avx512_andnot_arr_2_mask(__m512i *dst, const __m512i *src, const __m512i *src_end, bm::word_t mask)
Invert array elements and AND them with the specified mask: dst = ~*src & mask.
unsigned avx512_and_block(__m512i *dst, const __m512i *src)
AND array elements against another array dst &= *src.
bool avx512_test_one(__m512i m)
unsigned avx512_sub_block(__m512i *dst, const __m512i *src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
bool avx512_is_digest_zero(const __m512i *block)
check if digest stride is all zero bits
unsigned avx512_xor_block(__m512i *dst, const __m512i *src)
XOR block against another dst ^= *src.
bool avx512_is_all_one(const __m512i *block)
check if block is all one bits
bool avx512_is_all_zero(const __m512i *block)
check if block is all zero bits
bool avx512_sub_digest(__m512i *dst, const __m512i *src)
SUB (AND NOT) block digest stride: dst &= ~*src.
unsigned avx512_xor_block_2way(__m512i *dst, const __m512i *src1, const __m512i *src2)
3-operand XOR dst = *src1 ^ *src2
void avx512_set_block(__m512i *dst, bm::word_t value)
AVX512 block memset dst = value.
bool avx512_or_block_3way(__m512i *dst, const __m512i *src1, const __m512i *src2)
OR array elements against 2 other arrays: dst |= *src1 | *src2.
bool avx512_and_digest(__m512i *dst, const __m512i *src)
AND block digest stride dst &= *src.
bm::id_t sse42_bit_block_calc_count_change(const __m128i *block, const __m128i *block_end, unsigned *bit_count)
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
const unsigned set_block_mask
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
bm::id_t avx2_bit_count_or(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
const unsigned set_word_shift
const unsigned set_block_size
unsigned long long int id64_t
unsigned short gap_word_t
void avx2_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx)
const unsigned set_block_shift
const unsigned set_word_mask
static __m128i _mm_xor_si128(__m128i a, __m128i b)
static int _mm_cvtsi128_si32(__m128i a)
static int _mm_popcnt_u32(unsigned int a)
static __m128i _mm_add_epi16(__m128i a, __m128i b)
#define _mm_srli_epi32(a, imm)
static void _mm_prefetch(const void *p, int i)
static __m128i _mm_load_si128(const __m128i *p)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
MACRO for shuffle parameter for _mm_shuffle_ps().
#define _mm_extract_epi32(a, imm)