} while (b < b_end);

const unsigned* b = (unsigned*) block;

} while (++block < block_end);

unsigned ret = (a ^ b);
template<class Func>

block += 2; mask_block += 2;
} while (block < block_end);

++block; ++mask_block;
} while (block < block_end);

} while (block < block_end);
__m128i accA, accB, accC, accD;

} while (src < src_end);
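These loop tails come from the SSE4.2 bit-count routines, which walk a block in 128-bit waves and accumulate into accA..accD. A minimal sketch of the per-wave popcount step, assuming an x86-64 target (popcnt128 is an illustrative name, not the library's):

    #include <nmmintrin.h>  // SSE4.2: _mm_popcnt_u64
    #include <cstdint>

    // Popcount of one 128-bit wave: extract both 64-bit halves and use POPCNT.
    inline uint64_t popcnt128(const __m128i* p)
    {
        __m128i w = _mm_load_si128(p);  // 16-byte aligned load
        return _mm_popcnt_u64((uint64_t)_mm_extract_epi64(w, 0))
             + _mm_popcnt_u64((uint64_t)_mm_extract_epi64(w, 1));
    }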
} while (block < block_end);

unsigned co2, co1 = 0;
for (; block < block_end; block += 2)
unsigned gap_count = 1;
unsigned bit_count = 0;
unsigned co2, co1 = 0;
for (; block < block_end; block += 2, xor_block += 2)

gap_count -= (w0 & 1u);

unsigned bit_count = 0;
unsigned gap_count = 1;
unsigned co2, co1 = 0;
for (; block < block_end; block += 2)

gap_count -= (w0 & 1u);
unsigned simd_lane = 0;

unsigned widx = bsf >> 2;
unsigned w = simd_buf[widx];
*pos = (simd_lane * 128) + (widx * 32) + bsf;

unsigned widx = bsf >> 2;
unsigned w = simd_buf[widx];
*pos = ((++simd_lane) * 128) + (widx * 32) + bsf;

block1 += 2; block2 += 2;
} while (block1 < block1_end);
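The (simd_lane * 128) + (widx * 32) + bsf arithmetic converts a lane-local hit back into a block-relative bit index. A self-contained sketch of the technique for a single 128-bit wave, assuming GCC/Clang builtins (find_first_diff128 is a hypothetical name):

    #include <emmintrin.h>

    // First differing bit between two 128-bit waves; returns false if equal.
    inline bool find_first_diff128(const __m128i* a, const __m128i* b,
                                   unsigned* pos)
    {
        __m128i wA = _mm_load_si128(a), wB = _mm_load_si128(b);
        __m128i eq = _mm_cmpeq_epi32(wA, wB);
        unsigned mask = ~_mm_movemask_epi8(eq) & 0xFFFFu; // 1s mark differing bytes
        if (!mask)
            return false;
        unsigned bsf  = __builtin_ctz(mask); // first differing byte
        unsigned widx = bsf >> 2;            // 32-bit word within the wave
        unsigned buf[4];
        _mm_storeu_si128((__m128i*)buf, _mm_xor_si128(wA, wB));
        *pos = (widx * 32) + __builtin_ctz(buf[widx]);
        return true;
    }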
unsigned simd_lane = 0;

unsigned widx = bsf >> 2;
unsigned w = simd_buf[widx];
*pos = (off * 32) + (simd_lane * 128) + (widx * 32) + bsf;

unsigned widx = bsf >> 2;
unsigned w = simd_buf[widx];
*pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;

} while (block < block_end);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"

const unsigned unroll_factor = 8;
__m128i m1, mz, maskF, maskFL;

int shiftL = (64 - (unroll_factor - size) * 16);

return unroll_factor - bc;
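The fragments above are from sse4_gap_find, which probes up to 8 gap words at once. SSE has no unsigned 16-bit compare, so one standard workaround is a saturating subtract: subs_epu16(pos, v) is zero exactly where v >= pos. A sketch under that assumption (gap_find8 is a hypothetical name; the real routine also masks out lanes beyond size):

    #include <emmintrin.h>
    #include <cstdint>

    // Index in [0..8] of the first element >= pos (8 if none).
    inline unsigned gap_find8(const uint16_t* pbuf, uint16_t pos)
    {
        __m128i v  = _mm_loadu_si128((const __m128i*)pbuf);
        __m128i p  = _mm_set1_epi16((short)pos);
        // subs_epu16(p, v) == 0  <=>  v >= pos (unsigned compare via saturation)
        __m128i ge = _mm_cmpeq_epi16(_mm_subs_epu16(p, v), _mm_setzero_si128());
        int mask = _mm_movemask_epi8(ge);   // 2 mask bits per 16-bit lane
        return mask ? (__builtin_ctz((unsigned)mask) >> 1) : 8u;
    }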
unsigned end = ((*buf) >> 3);
unsigned size = end - start;
for (; size >= 64; size = end - start)
    unsigned mid = (start + end) >> 1;
    if (buf[mid] < pos)
    if (buf[mid = (start + end) >> 1] < pos)
    if (buf[mid = (start + end) >> 1] < pos)
    if (buf[mid = (start + end) >> 1] < pos)
for (; size >= 16; size = end - start)
    if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
    if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
if (pbuf[0] >= pos) { }
else if (pbuf[1] >= pos) { start++; }

*is_set = ((*buf) & 1) ^ ((start - 1) & 1);
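The ladder above is the hybrid scheme the documentation names: fixed binary-halving steps while the window is wide (>= 64, then >= 16), then a linear scan over the short tail. A scalar sketch of the idea; the real sse42_gap_bfind also decodes the GAP header and derives is_set from the parity of the found position:

    #include <cstdint>

    // Hybrid search sketch: binary until the window is small, then linear scan.
    inline unsigned hybrid_lower_bound(const uint16_t* buf, unsigned start,
                                       unsigned end, uint16_t pos)
    {
        while (end - start >= 16)    // binary phase
        {
            unsigned mid = (start + end) >> 1;
            if (buf[mid] < pos)
                start = mid + 1;
            else
                end = mid;
        }
        while (buf[start] < pos)     // linear tail; assumes a sentinel >= pos
            ++start;
        return start;
    }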
unsigned end = ((*buf) >> 3);
unsigned size = end - start;
for (; size >= 64; size = end - start)
    unsigned mid = (start + end) >> 1;
    if (buf[mid] < pos)
    if (buf[mid = (start + end) >> 1] < pos)
    if (buf[mid = (start + end) >> 1] < pos)
    if (buf[mid = (start + end) >> 1] < pos)
for (; size >= 16; size = end - start)
    if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
if (pbuf[0] >= pos) { }
else if (pbuf[1] >= pos) { start++; }

return ((*buf) & 1) ^ ((--start) & 1);
const unsigned unroll_factor = 8;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);

for (k = 0; k < len_unr; k += unroll_factor)
for (; k < len; ++k)
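For context, sse42_idx_arr_block_lookup scans a sorted index array and stops at the first entry that leaves block nb; the unrolled loop above does this comparison 8 indices at a time. A scalar sketch of the contract, assuming BitMagic's 65536-bit blocks (set_block_shift == 16):

    // Returns the index of the first element (from 'start') that falls
    // outside block 'nb'. Sketch only; the SIMD version compares 8 at a time.
    inline unsigned idx_arr_block_lookup_sketch(const unsigned* idx,
                                                unsigned size, unsigned nb,
                                                unsigned start)
    {
        const unsigned set_block_shift = 16; // assumption: 64K-bit blocks
        unsigned k = start;
        for (; k < size; ++k)
            if ((idx[k] >> set_block_shift) != nb)
                break;
        return k;
    }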
const unsigned unroll_factor = 4;
const unsigned len = (stop - start);
const unsigned len_unr = len - (len % unroll_factor);

for (; k < len_unr; k += unroll_factor)

block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1]) |
                (1u << mshift_v[2]) | (1u << mshift_v[3]);

block[mword_v[0]] |= (1u << mshift_v[0]);
block[mword_v[1]] |= (1u << mshift_v[1]);
block[mword_v[2]] |= (1u << mshift_v[2]);
block[mword_v[3]] |= (1u << mshift_v[3]);

for (; k < len; ++k)
    unsigned n = idx[k];
    block[nword] |= (1u << nbit);
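The tail loop above is the scalar fallback of the scatter. A sketch of the whole operation in scalar form, assuming 32-bit words (set_word_shift == 5) and a 65536-bit block (set_block_mask == 0xFFFF):

    #include <cstdint>

    // Set the bits listed in idx[start..stop) within one bit-block.
    inline void set_block_bits_sketch(uint32_t* block, const unsigned* idx,
                                      unsigned start, unsigned stop)
    {
        for (unsigned k = start; k < stop; ++k)
        {
            unsigned n     = idx[k] & 0xFFFFu;  // position within the block
            unsigned nword = n >> 5;            // 32-bit word index
            unsigned nbit  = n & 31u;           // bit within the word
            block[nword] |= (1u << nbit);
        }
    }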
const unsigned unroll_factor = 4;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);

unsigned base = start + k;
for (; k < len_unr; k += unroll_factor)

mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2],
                       1 << mshift_v[1], 1 << mshift_v[0]);

blk[mword_v[1]], blk[mword_v[0]]),

++idx_ptr; ++target_ptr;

for (; k < len; ++k)
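SSE4.2 has no gather instruction, so the code above fetches four words by index and packs them with _mm_set_epi32, then tests them against per-lane one-bit masks. A sketch of that idiom (gather_test4 is a hypothetical name):

    #include <emmintrin.h>

    // Emulated 4-way gather + per-lane bit test.
    inline __m128i gather_test4(const unsigned* blk,
                                const unsigned mword_v[4],
                                const unsigned mshift_v[4])
    {
        __m128i w = _mm_set_epi32((int)blk[mword_v[3]], (int)blk[mword_v[2]],
                                  (int)blk[mword_v[1]], (int)blk[mword_v[0]]);
        __m128i mask_0 = _mm_set_epi32((int)(1u << mshift_v[3]),
                                       (int)(1u << mshift_v[2]),
                                       (int)(1u << mshift_v[1]),
                                       (int)(1u << mshift_v[0]));
        return _mm_and_si128(w, mask_0); // non-zero lanes carry the tested bits
    }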
for (--block_end; block_end >= block; block_end -= 2)

for (; block < block_end; block += 2)
for (; di < 64; ++di)

block = (__m128i*) &wblock[d_base];
mask_block = (__m128i*) &mblock[d_base];
for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)

bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
d |= (dmask & (w0 << di));
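The d_base / dmask arithmetic addresses 64-word waves selected by set bits of a 64-bit digest. A sketch of the digest-walk idiom, assuming GCC/Clang builtins and 64 words per wave (set_block_digest_wave_size):

    #include <cstdint>

    // Visit each wave whose digest bit is set; 'op' processes one wave.
    inline void for_each_digest_wave(uint64_t d, uint32_t* wblock,
                                     void (*op)(uint32_t* wave_ptr))
    {
        const unsigned wave_size = 64; // words per digest stride (assumed)
        while (d)
        {
            uint64_t t = d & (0ull - d);                       // isolate lowest bit
            unsigned wave = (unsigned)__builtin_popcountll(t - 1); // its index
            op(&wblock[wave * wave_size]);
            d &= d - 1;                                        // clear lowest bit
        }
    }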
const __m128i* sub_block = (const __m128i*) (xor_block + off);
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse4_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse4_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    sse4_and_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
    sse4_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse4_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
    sse4_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_IS_ZERO_BLOCK(dst) \
    sse4_is_all_zero((__m128i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    sse4_is_all_one((__m128i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    sse4_is_digest_zero((__m128i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    sse4_block_set_digest((__m128i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    sse2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    sse42_shift_l1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    sse42_shift_r1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    sse42_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    sse42_set_block_bits(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    sse42_bit_block_calc_change((__m128i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
    sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size, gc, bc)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)

#define VECT_BIT_FIND_FIRST(src, off, pos) \
    sse42_bit_find_first((__m128i*) src, off, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    sse42_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    sse42_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse42_gap_bfind(buf, pos, is_set)

#define VECT_GAP_TEST(buf, pos) \
    sse42_gap_test(buf, pos)

#pragma GCC diagnostic pop

#pragma warning( pop )
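These VECT_* macros are the dispatch layer between BitMagic's portable algorithms and this SIMD target; bmavx2.h and the scalar build map the same names onto their own kernels. A hedged usage sketch (assumes a translation unit built with SSE4.2 support, e.g. BMSSE42OPT):

    #include "bmsse4.h"   // defines the VECT_* dispatch macros above
    using namespace bm;   // expansions name the bm:: kernels unqualified

    // AND two 64K-bit blocks through the dispatch macro.
    alignas(16) static bm::word_t blk_a[bm::set_block_size];
    alignas(16) static bm::word_t blk_b[bm::set_block_size];

    unsigned any = VECT_AND_BLOCK(blk_a, blk_b); // expands to sse4_and_block(...)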
Compute functions for SSE SIMD instruction set (internal)
Bit manipulation primitives (internal)
bool sse42_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1) noexcept
block shift left by 1
bool sse42_test_all_one_wave(const void *ptr) noexcept
check if SSE wave is all 0xFFFF...FFF
bool sse4_sub_digest_3way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
SUB block digest stride.
void sse4_block_set_digest(__m128i *dst, unsigned value) noexcept
set digest stride to 0xFF.. or 0x0 value
bool sse42_bit_find_first_diff(const __m128i *block1, const __m128i *block2, unsigned *pos) noexcept
Find first bit which is different between two bit-blocks.
bool sse4_is_all_zero(const __m128i *block) noexcept
check if block is all zero bits
bm::id_t sse4_bit_count(const __m128i *block, const __m128i *block_end) noexcept
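A hedged usage sketch for the block popcount; bm::set_block_size words per bit-block and 16-byte alignment are BitMagic conventions assumed here:

    #include "bmsse4.h"

    alignas(16) static bm::word_t blk[bm::set_block_size] = { 0x5u };

    bm::id_t cnt = bm::sse4_bit_count((const __m128i*) blk,
                                      (const __m128i*) (blk + bm::set_block_size));
    // cnt == 2 for the initializer above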
bool sse4_and_or_digest_2way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
AND-OR block digest stride dst |= *src1 & src2.
unsigned sse4_gap_find(const bm::gap_word_t *pbuf, const bm::gap_word_t pos, const unsigned size) noexcept
void sse42_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
bool sse4_and_digest_2way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
AND block digest stride dst = *src1 & src2.
unsigned sse42_gap_bfind(const unsigned short *buf, unsigned pos, unsigned *is_set) noexcept
Hybrid binary search, starts as binary, then switches to linear scan.
bool sse4_and_digest_5way(__m128i *dst, const __m128i *src1, const __m128i *src2, const __m128i *src3, const __m128i *src4) noexcept
AND block digest stride.
void sse42_bit_block_calc_xor_change(const __m128i *block, const __m128i *xor_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
bool sse42_test_all_zero_wave(const void *ptr) noexcept
check if wave of pointers is all NULL
unsigned sse4_and_block(__m128i *dst, const __m128i *src) noexcept
AND blocks, 2-operand: dst &= *src.
bool sse42_test_all_zero_wave2(const void *ptr0, const void *ptr1) noexcept
check if 2 waves of pointers are all NULL
bool sse4_is_all_one(const __m128i *block) noexcept
check if block is all ONE bits
bool sse4_sub_digest_5way(__m128i *dst, const __m128i *src1, const __m128i *src2, const __m128i *src3, const __m128i *src4) noexcept
SUB block digest stride.
bool sse42_bit_find_first(const __m128i *block, unsigned off, unsigned *pos) noexcept
Find first non-zero bit.
int sse42_cmpge_u32(__m128i vect4, unsigned value) noexcept
Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array.
unsigned sse42_gap_test(const unsigned short *buf, unsigned pos) noexcept
Hybrid binary search to test GAP value, starts as binary, then switches to scan.
bool sse4_and_digest_3way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
AND block digest stride.
bool sse4_and_digest(__m128i *dst, const __m128i *src) noexcept
AND block digest stride dst &= *src.
bool sse42_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1) noexcept
block shift right by 1
bool sse42_shift_r1_and(__m128i *block, bm::word_t co1, const __m128i *mask_block, bm::id64_t *digest) noexcept
block shift right by 1 plus AND
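In the shift kernels, co1/co2 carry the bit shifted out of one word (or wave) into the next. A scalar sketch of the same carry chain; the SIMD version applies it per 128-bit wave, and the _AND variant additionally ANDs with the mask block and updates the digest:

    #include <cstdint>

    // Shift a bit-block right by one bit position (toward higher addresses),
    // propagating the carry; returns the final carry-out.
    inline unsigned shift_r1_sketch(uint32_t* block, unsigned nwords,
                                    unsigned co1)
    {
        for (unsigned i = 0; i < nwords; ++i)
        {
            unsigned co2 = block[i] >> 31;     // bit shifted out of this word
            block[i] = (block[i] << 1) | co1;  // bit shifted in from the left
            co1 = co2;
        }
        return co1;
    }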
void sse42_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
bool sse4_sub_digest(__m128i *dst, const __m128i *src) noexcept
SUB (AND NOT) block digest stride dst &= ~*src.
unsigned sse42_bit_block_calc_change(const __m128i *block, unsigned size) noexcept
bool sse4_sub_digest_2way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
2-operand SUB (AND NOT) block digest stride dst = src1 & ~*src2
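A sketch of a 2-way SUB digest stride under stated assumptions: 16 __m128i (64 words, matching set_block_digest_wave_size) per stride and 16-byte aligned inputs; the library's version is unrolled differently:

    #include <smmintrin.h>

    // dst = src1 & ~src2 over one digest stride; true when the result is all zero.
    inline bool sub_digest_2way_sketch(__m128i* dst,
                                       const __m128i* src1, const __m128i* src2)
    {
        __m128i acc = _mm_setzero_si128();
        for (unsigned i = 0; i < 16; ++i)
        {
            __m128i w = _mm_andnot_si128(_mm_load_si128(src2 + i),   // ~*src2 ...
                                         _mm_load_si128(src1 + i));  // ... & *src1
            _mm_store_si128(dst + i, w);
            acc = _mm_or_si128(acc, w);
        }
        return _mm_testz_si128(acc, acc) != 0;  // 1 iff acc == 0
    }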
bool sse42_test_all_eq_wave2(const void *ptr0, const void *ptr1) noexcept
check if a wave of 2 pointers is the same (NULL or FULL)
bool sse4_is_digest_zero(const __m128i *block) noexcept
check if digest stride is all zero bits
unsigned sse42_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start) noexcept
void sse42_set_block_bits(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop) noexcept
const unsigned set_block_digest_wave_size
bm::id_t sse4_bit_count_op(const __m128i *block, const __m128i *block_end, const __m128i *mask_block, Func sse2_func) noexcept
const unsigned set_block_mask
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
unsigned op_and(unsigned a, unsigned b) noexcept
const unsigned set_word_shift
unsigned op_or(unsigned a, unsigned b) noexcept
const unsigned set_block_size
unsigned long long int id64_t
const unsigned block_waves
void sse4_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx) noexcept
unsigned short gap_word_t
unsigned op_xor(unsigned a, unsigned b) noexcept
const unsigned set_block_shift
const unsigned set_word_mask
unsigned long long bmi_blsi_u64(unsigned long long w)
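For reference, both BMI helpers reduce to classic bit identities; portable fallbacks shown below (an assumption; hardware builds would use the BLSI/BLSR instructions):

    #include <cstdint>

    inline uint64_t blsi_u64(uint64_t w) { return w & (0ull - w); } // isolate lowest set bit
    inline uint64_t bslr_u64(uint64_t w) { return w & (w - 1); }    // reset lowest set bit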
static __m128i _mm_subs_epu16(__m128i a, __m128i b)
static __m128i _mm_setzero_si128()
static int _mm_test_all_ones(__m128i a)
static __m128i _mm_xor_si128(__m128i a, __m128i b)
static int _mm_popcnt_u32(unsigned int a)
#define _mm_srli_epi32(a, imm)
static __m128i _mm_srli_si128(__m128i a, int imm)
static __m128i _mm_slli_epi64(__m128i a, int imm)
static int _mm_movemask_epi8(__m128i a)
static __m128i _mm_slli_si128(__m128i a, int imm)
static __m128i _mm_set1_epi16(short w)
#define _mm_insert_epi32(a, b, imm)
static __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
static __m128i _mm_slli_epi16(__m128i a, int imm)
static int _mm_test_all_zeros(__m128i a, __m128i mask)
static __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
static __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
static __m128i _mm_slli_epi32(__m128i a, int imm)
static __m128i _mm_loadu_si128(const __m128i *p)
static void _mm_store_si128(__m128i *p, __m128i a)
static __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
static void _mm_prefetch(const void *p, int i)
static __m128i _mm_load_si128(const __m128i *p)
static __m128i _mm_or_si128(__m128i, __m128i)
static int _mm_testz_si128(__m128i a, __m128i b)
static __m128i _mm_set_epi32(int, int, int, int)
static __m128i _mm_sub_epi32(__m128i a, __m128i b)
static __m128i _mm_set1_epi32(int)
static int64_t _mm_popcnt_u64(uint64_t a)
static __m128i _mm_andnot_si128(__m128i a, __m128i b)
static void _mm_storeu_si128(__m128i *p, __m128i a)
#define _mm_extract_epi32(a, imm)
#define _mm_shuffle_epi32(a, imm)
#define _mm_extract_epi64(a, imm)
static __m128i _mm_and_si128(__m128i, __m128i)
static __m128i _mm_cmpeq_epi32(__m128i, __m128i)