A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/sse2neon_8h_source.html below:

NCBI C++ ToolKit: include/util/bitset/sse2neon.h Source File

58 #ifndef SSE2NEON_PRECISE_MINMAX 59 #define SSE2NEON_PRECISE_MINMAX (0) 62 #ifndef SSE2NEON_PRECISE_DIV 63 #define SSE2NEON_PRECISE_DIV (0) 66 #ifndef SSE2NEON_PRECISE_SQRT 67 #define SSE2NEON_PRECISE_SQRT (0) 70 #ifndef SSE2NEON_PRECISE_DP 71 #define SSE2NEON_PRECISE_DP (0) 75 #if defined(__GNUC__) || defined(__clang__) 76 #pragma push_macro("FORCE_INLINE"

)

77 #pragma push_macro("ALIGN_STRUCT"

)

78 #define FORCE_INLINE static inline __attribute__((always_inline)) 79 #define ALIGN_STRUCT(x) __attribute__((aligned(x))) 80 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) 81 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) 83 #warning "Macro name collisions may happen with unsupported compiler." 85 #define FORCE_INLINE static inline 88 #define ALIGN_STRUCT(x) __declspec(align(x)) 90 #define _sse2neon_likely(x) (x) 91 #define _sse2neon_unlikely(x) (x) 100 #if defined(__arm__) && __ARM_ARCH == 7 105 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) 106 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." 108 #if !defined(__clang__) 109 #pragma GCC push_options 110 #pragma GCC target("fpu=neon"

)

112 #elif defined(__aarch64__) 113 #if !defined(__clang__) 114 #pragma GCC push_options 115 #pragma GCC target("+simd"

)

118 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." 122 #include <arm_neon.h> 125 #if !defined(__aarch64__) 132 #ifndef __has_builtin 134 #if defined(__GNUC__) && (__GNUC__ <= 9) 135 #define __has_builtin(x) HAS##x 136 #define HAS__builtin_popcount 1 137 #define HAS__builtin_popcountll 1 139 #define __has_builtin(x) 0 151 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ 152  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) 155 #define _MM_FROUND_TO_NEAREST_INT 0x00 156 #define _MM_FROUND_TO_NEG_INF 0x01 157 #define _MM_FROUND_TO_POS_INF 0x02 158 #define _MM_FROUND_TO_ZERO 0x03 159 #define _MM_FROUND_CUR_DIRECTION 0x04 160 #define _MM_FROUND_NO_EXC 0x08 161 #define _MM_FROUND_RAISE_EXC 0x00 162 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) 163 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) 164 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) 165 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) 166 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) 167 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) 168 #define _MM_ROUND_NEAREST 0x0000 169 #define _MM_ROUND_DOWN 0x2000 170 #define _MM_ROUND_UP 0x4000 171 #define _MM_ROUND_TOWARD_ZERO 0x6000 173 #define _MM_FLUSH_ZERO_MASK 0x8000 174 #define _MM_FLUSH_ZERO_ON 0x8000 175 #define _MM_FLUSH_ZERO_OFF 0x0000 177 #define _MM_DENORMALS_ZERO_MASK 0x0040 178 #define _MM_DENORMALS_ZERO_ON 0x0040 179 #define _MM_DENORMALS_ZERO_OFF 0x0000 182 #define __constrange(a, b) const 195 #if defined(__aarch64__) 204 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) 205 #if (defined(__x86_64__) || defined(__i386__)) 206 #define __int64 long long 208 #define __int64 int64_t 214 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) 215 #define vreinterpretq_m128_f32(x) (x) 216 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) 218 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) 219 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) 220 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) 221 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) 223 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) 224 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) 225 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) 226 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) 228 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) 229 #define vreinterpretq_f32_m128(x) (x) 230 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) 232 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) 233 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) 234 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) 235 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) 237 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) 238 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) 239 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) 240 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) 242 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) 243 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) 244 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) 245 #define vreinterpretq_m128i_s64(x) (x) 247 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) 248 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) 249 #define vreinterpretq_m128i_u32(x) 
vreinterpretq_s64_u32(x) 250 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) 252 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) 253 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) 255 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) 256 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) 257 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) 258 #define vreinterpretq_s64_m128i(x) (x) 260 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) 261 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) 262 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) 263 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) 265 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) 266 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) 267 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) 268 #define vreinterpret_m64_s64(x) (x) 270 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) 271 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) 272 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) 273 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) 275 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) 276 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) 277 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) 279 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) 280 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) 281 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) 282 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) 284 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) 285 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) 286 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) 287 #define vreinterpret_s64_m64(x) (x) 289 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) 291 #if defined(__aarch64__) 292 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) 293 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) 295 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) 297 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) 298 #define vreinterpretq_m128d_f64(x) (x) 300 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) 302 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) 303 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) 305 #define vreinterpretq_f64_m128d(x) (x) 306 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) 308 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) 309 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) 311 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) 312 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) 314 #define vreinterpretq_m128d_f32(x) (x) 316 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) 318 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) 319 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) 321 #define vreinterpretq_f32_m128d(x) (x) 360 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) 361 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) 362 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) 365 #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode 366 #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode 367 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode 368 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode 402 #if 
defined(__GNUC__) && !defined(__clang__) && \ 403  ((__GNUC__ <= 10 && defined(__arm__)) || \ 404  (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ 405  (__GNUC__ <= 9 && defined(__aarch64__))) 409

ret.val[0] = vld1q_u8(p + 0);

410

ret.val[1] = vld1q_u8(p + 16);

411

ret.val[2] = vld1q_u8(p + 32);

412

ret.val[3] = vld1q_u8(p + 48);

419  return

vld1q_u8_x4(p);

520 #if defined(__aarch64__) 546

float32x2_t a21 = vget_high_f32(

548

float32x2_t b03 = vget_low_f32(

555

float32x2_t a03 = vget_low_f32(

557

float32x2_t b21 = vget_high_f32(

620

float32x2_t a02 = vset_lane_f32(a0, a22, 1);

638

float32x2_t b20 = vset_lane_f32(b2, b00, 1);

645

float32_t b2 = vgetq_lane_f32(

b

, 2);

647

float32x2_t b20 = vset_lane_f32(b2, b00, 1);

654

float32_t b2 = vgetq_lane_f32(

b

, 2);

656

float32x2_t b20 = vset_lane_f32(b2, b00, 1);

666

*c = (

t

- *sum) - y;

670 #if defined(__ARM_FEATURE_CRYPTO) 674

poly64_t

a

= vget_lane_p64(vreinterpret_p64_u64(_a), 0);

675

poly64_t

b

= vget_lane_p64(vreinterpret_p64_u64(_b), 0);

676  return

vreinterpretq_u64_p128(vmull_p64(

a

,

b

));

694

poly8x8_t

a

= vreinterpret_p8_u64(_a);

695

poly8x8_t

b

= vreinterpret_p8_u64(_b);

698

uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),

699

vcreate_u8(0x00000000ffffffff));

700

uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),

701

vcreate_u8(0x0000000000000000));

704

uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(

a

,

b

));

706

vreinterpretq_u8_p16(vmull_p8(

a

, vext_p8(

b

,

b

, 1)));

708

vreinterpretq_u8_p16(vmull_p8(vext_p8(

a

,

a

, 1),

b

));

710

vreinterpretq_u8_p16(vmull_p8(

a

, vext_p8(

b

,

b

, 2)));

712

vreinterpretq_u8_p16(vmull_p8(vext_p8(

a

,

a

, 2),

b

));

714

vreinterpretq_u8_p16(vmull_p8(

a

, vext_p8(

b

,

b

, 3)));

716

vreinterpretq_u8_p16(vmull_p8(vext_p8(

a

,

a

, 3),

b

));

718

vreinterpretq_u8_p16(vmull_p8(

a

, vext_p8(

b

,

b

, 4)));

721

uint8x16_t

l

= veorq_u8(e,

f

);

722

uint8x16_t m = veorq_u8(

g

, h);

723

uint8x16_t

n

= veorq_u8(

i

, j);

727 #if defined(__aarch64__) 728

uint8x16_t lm_p0 = vreinterpretq_u8_u64(

729

vzip1q_u64(vreinterpretq_u64_u8(

l

), vreinterpretq_u64_u8(m)));

730

uint8x16_t lm_p1 = vreinterpretq_u8_u64(

731

vzip2q_u64(vreinterpretq_u64_u8(

l

), vreinterpretq_u64_u8(m)));

732

uint8x16_t nk_p0 = vreinterpretq_u8_u64(

733

vzip1q_u64(vreinterpretq_u64_u8(

n

), vreinterpretq_u64_u8(k)));

734

uint8x16_t nk_p1 = vreinterpretq_u8_u64(

735

vzip2q_u64(vreinterpretq_u64_u8(

n

), vreinterpretq_u64_u8(k)));

737

uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(

l

), vget_low_u8(m));

738

uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(

l

), vget_high_u8(m));

739

uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(

n

), vget_low_u8(k));

740

uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(

n

), vget_high_u8(k));

744

uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);

745

uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);

746

uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

750

uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);

751

uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);

752

uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

755 #if defined(__aarch64__) 756

uint8x16_t t0 = vreinterpretq_u8_u64(

757

vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));

758

uint8x16_t t1 = vreinterpretq_u8_u64(

759

vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));

760

uint8x16_t t2 = vreinterpretq_u8_u64(

761

vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));

762

uint8x16_t t3 = vreinterpretq_u8_u64(

763

vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));

765

uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));

766

uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));

767

uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));

768

uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));

771

uint8x16_t t0_shift = vextq_u8(t0, t0, 15);

772

uint8x16_t t1_shift = vextq_u8(t1, t1, 14);

773

uint8x16_t t2_shift = vextq_u8(t2, t2, 13);

774

uint8x16_t t3_shift = vextq_u8(t3, t3, 12);

777

uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);

778

uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);

779

uint8x16_t mix = veorq_u8(d, cross1);

780

uint8x16_t

r

= veorq_u8(mix, cross2);

781  return

vreinterpretq_u64_u8(

r

);

793 #define _mm_shuffle_epi32_default(a, imm) \ 797  vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ 798  ret = vsetq_lane_s32( \ 799  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ 801  ret = vsetq_lane_s32( \ 802  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ 804  ret = vsetq_lane_s32( \ 805  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ 807  vreinterpretq_m128i_s32(ret); \ 894 #if defined(__aarch64__) 895 #define _mm_shuffle_epi32_splat(a, imm) \ 897  vreinterpretq_m128i_s32( \ 898  vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ 901 #define _mm_shuffle_epi32_splat(a, imm) \ 903  vreinterpretq_m128i_s32( \ 904  vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ 922 #define _mm_shuffle_ps_default(a, b, imm) \ 926  vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ 927  ret = vsetq_lane_f32( \ 928  vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ 930  ret = vsetq_lane_f32( \ 931  vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ 933  ret = vsetq_lane_f32( \ 934  vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ 936  vreinterpretq_m128_f32(ret); \ 945 #define _mm_shufflelo_epi16_function(a, imm) \ 947  int16x8_t ret = vreinterpretq_s16_m128i(a); \ 948  int16x4_t lowBits = vget_low_s16(ret); \ 949  ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ 950  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ 952  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ 954  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 956  vreinterpretq_m128i_s16(ret); \ 965 #define _mm_shufflehi_epi16_function(a, imm) \ 967  int16x8_t ret = vreinterpretq_s16_m128i(a); \ 968  int16x4_t highBits = vget_high_s16(ret); \ 969  ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ 970  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ 972  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ 974  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 976  vreinterpretq_m128i_s16(ret); \ 1005

float32x4_t

value

= vsetq_lane_f32(b0, vdupq_n_f32(0), 0);

1284  return

vgetq_lane_u32(a_eq_b, 0) & 0x1;

1294  return

vgetq_lane_u32(a_ge_b, 0) & 0x1;

1304  return

vgetq_lane_u32(a_gt_b, 0) & 0x1;

1314  return

vgetq_lane_u32(a_le_b, 0) & 0x1;

1326  return

vgetq_lane_u32(a_lt_b, 0) & 0x1;

1366 #if defined(__aarch64__) 1394 #if defined(__aarch64__) 1398

float32_t

data

= vgetq_lane_f32(

1514 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) 1548

int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));

1549  static const uint32_t

bitMask[2] = {0xFFFFFFFF, 0};

1550

int8x8_t

mask

= vreinterpret_s8_u32(vld1_u32(bitMask));

1596 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) 1628 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) 1638 #if defined(__aarch64__) 1641

float32_t

data

= vgetq_lane_f32(

1682 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) 1690 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) 1713 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV 1719 #if SSE2NEON_PRECISE_DIV 1740 #define _mm_extract_pi16(a, imm) \ 1741  (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) 1758 #if defined(__aarch64__) 1765 #if defined(__aarch64__) 1766  asm volatile

(

"mrs %0, FPCR"

:

"=r"

(

r

.value));

1768  asm volatile

(

"vmrs %0, FPSCR"

:

"=r"

(

r

.value));

1782 #if defined(__aarch64__) 1789 #if defined(__aarch64__) 1790  asm volatile

(

"mrs %0, FPCR"

:

"=r"

(

r

.value));

1792  asm volatile

(

"vmrs %0, FPSCR"

:

"=r"

(

r

.value));

1795  if

(

r

.field.bit22) {

1805 #define _mm_insert_pi16(a, b, imm) \ 1807  vreinterpret_m64_s16( \ 1808  vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ 1827 #define _mm_load_ps1 _mm_load1_ps 1858

vcombine_f32(vget_low_f32(

a

), vld1_f32((

const

float32_t *) p)));

1875

vcombine_f32(vld1_f32((

const

float32_t *) p), vget_high_f32(

a

)));

1890

float32x4_t v = vrev64q_f32(vld1q_f32(p));

1912

vsetq_lane_s16(*(

const int16_t

*) p, vdupq_n_s16(0), 0));

1924

vcombine_s64(vld1_s64((

const int64_t

*) p), vdup_n_s64(0)));

1935  if

(align == 2 || (

sizeof

(

void

*) == 8 && align == 4))

1936

align =

sizeof

(

void

*);

1937  if

(!posix_memalign(&ptr, align,

size

))

1953

vst1_s8((

int8_t

*) mem_addr, masked);

1960 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) 1982 #if SSE2NEON_PRECISE_MINMAX 2037 #if SSE2NEON_PRECISE_MINMAX 2113 #if defined(__aarch64__) 2114  static const

int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};

2115

uint8x8_t

tmp

= vshr_n_u8(

input

, 7);

2116  return

vaddv_u8(vshl_u8(

tmp

, shift));

2119

uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(

input

, 7));

2120

uint32x2_t paired16 =

2121

vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));

2122

uint8x8_t paired32 =

2123

vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));

2124  return

vget_lane_u8(paired32, 0) | ((

int

) vget_lane_u8(paired32, 4) << 4);

2135 #if defined(__aarch64__) 2136  static const

int32x4_t shift = {0, 1, 2, 3};

2137

uint32x4_t

tmp

= vshrq_n_u32(

input

, 31);

2138  return

vaddvq_u32(vshlq_u32(

tmp

, shift));

2143

uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(

input

, 31));

2146

vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));

2148  return

vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);

2207 #define _m_pavgb(a, b) _mm_avg_pu8(a, b) 2218 #define _m_pavgw(a, b) _mm_avg_pu16(a, b) 2223 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) 2228 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) 2233 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) 2238 #define _m_pmaxub(a, b) _mm_max_pu8(a, b) 2243 #define _m_pminsw(a, b) _mm_min_pi16(a, b) 2248 #define _m_pminub(a, b) _mm_min_pu8(a, b) 2253 #define _m_pmovmskb(a) _mm_movemask_pi8(a) 2259 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) 2266

__builtin_prefetch(p);

2274 #define _m_psadbw(a, b) _mm_sad_pu8(a, b) 2279 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) 2289 #if SSE2NEON_PRECISE_DIV 2317 #if SSE2NEON_PRECISE_SQRT 2344

uint64x1_t

t

= vpaddl_u32(vpaddl_u16(

2347

vset_lane_u16(vget_lane_u64(

t

, 0), vdup_n_u16(0), 0));

2360 #if defined(__aarch64__) 2367 #if defined(__aarch64__) 2368  asm volatile

(

"mrs %0, FPCR"

:

"=r"

(

r

.value));

2370  asm volatile

(

"vmrs %0, FPSCR"

:

"=r"

(

r

.value));

2375 #if defined(__aarch64__) 2376  asm volatile

(

"msr FPCR, %0"

::

"r"

(

r

));

2378  asm volatile

(

"vmsr FPSCR, %0"

::

"r"

(

r

));

2406 #if defined(__aarch64__) 2413 #if defined(__aarch64__) 2414  asm volatile

(

"mrs %0, FPCR"

:

"=r"

(

r

.value));

2416  asm volatile

(

"vmrs %0, FPSCR"

:

"=r"

(

r

.value));

2437 #if defined(__aarch64__) 2438  asm volatile

(

"msr FPCR, %0"

::

"r"

(

r

));

2440  asm volatile

(

"vmsr FPSCR, %0"

::

"r"

(

r

));

2494 #if __has_builtin(__builtin_shufflevector) 2495 #define _mm_shuffle_pi16(a, imm) \ 2497  vreinterpret_m64_s16(__builtin_shufflevector( \ 2498  vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ 2499  ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ 2502 #define _mm_shuffle_pi16(a, imm) \ 2506  vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ 2507  ret = vset_lane_s16( \ 2508  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ 2510  ret = vset_lane_s16( \ 2511  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ 2513  ret = vset_lane_s16( \ 2514  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ 2516  vreinterpret_m64_s16(ret); \ 2525

__sync_synchronize();

2530 #if __has_builtin(__builtin_shufflevector) 2531 #define _mm_shuffle_ps(a, b, imm) \ 2533  float32x4_t _input1 = vreinterpretq_f32_m128(a); \ 2534  float32x4_t _input2 = vreinterpretq_f32_m128(b); \ 2535  float32x4_t _shuf = __builtin_shufflevector( \ 2536  _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ 2537  (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ 2538  vreinterpretq_m128_f32(_shuf); \ 2541 #define _mm_shuffle_ps(a, b, imm) \ 2545  case _MM_SHUFFLE(1, 0, 3, 2): \ 2546  ret = _mm_shuffle_ps_1032((a), (b)); \ 2548  case _MM_SHUFFLE(2, 3, 0, 1): \ 2549  ret = _mm_shuffle_ps_2301((a), (b)); \ 2551  case _MM_SHUFFLE(0, 3, 2, 1): \ 2552  ret = _mm_shuffle_ps_0321((a), (b)); \ 2554  case _MM_SHUFFLE(2, 1, 0, 3): \ 2555  ret = _mm_shuffle_ps_2103((a), (b)); \ 2557  case _MM_SHUFFLE(1, 0, 1, 0): \ 2558  ret = _mm_movelh_ps((a), (b)); \ 2560  case _MM_SHUFFLE(1, 0, 0, 1): \ 2561  ret = _mm_shuffle_ps_1001((a), (b)); \ 2563  case _MM_SHUFFLE(0, 1, 0, 1): \ 2564  ret = _mm_shuffle_ps_0101((a), (b)); \ 2566  case _MM_SHUFFLE(3, 2, 1, 0): \ 2567  ret = _mm_shuffle_ps_3210((a), (b)); \ 2569  case _MM_SHUFFLE(0, 0, 1, 1): \ 2570  ret = _mm_shuffle_ps_0011((a), (b)); \ 2572  case _MM_SHUFFLE(0, 0, 2, 2): \ 2573  ret = _mm_shuffle_ps_0022((a), (b)); \ 2575  case _MM_SHUFFLE(2, 2, 0, 0): \ 2576  ret = _mm_shuffle_ps_2200((a), (b)); \ 2578  case _MM_SHUFFLE(3, 2, 0, 2): \ 2579  ret = _mm_shuffle_ps_3202((a), (b)); \ 2581  case _MM_SHUFFLE(3, 2, 3, 2): \ 2582  ret = _mm_movehl_ps((b), (a)); \ 2584  case _MM_SHUFFLE(1, 1, 3, 3): \ 2585  ret = _mm_shuffle_ps_1133((a), (b)); \ 2587  case _MM_SHUFFLE(2, 0, 1, 0): \ 2588  ret = _mm_shuffle_ps_2010((a), (b)); \ 2590  case _MM_SHUFFLE(2, 0, 0, 1): \ 2591  ret = _mm_shuffle_ps_2001((a), (b)); \ 2593  case _MM_SHUFFLE(2, 0, 3, 2): \ 2594  ret = _mm_shuffle_ps_2032((a), (b)); \ 2597  ret = _mm_shuffle_ps_default((a), (b), (imm)); \ 2616 #if SSE2NEON_PRECISE_SQRT 2621  const

uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);

2622  const

uint32x4_t div_by_zero =

2623

vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));

2624

recip = vreinterpretq_f32_u32(

2625

vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));

2637 #elif defined(__aarch64__) 2641

float32x4_t sq = vrecpeq_f32(recipsq);

2677

vst1q_f32(p, vdupq_n_f32(a0));

2697 #define _mm_store1_ps _mm_store_ps1 2736

float32x4_t rev = vextq_f32(

tmp

,

tmp

, 2);

2774 #if __has_builtin(__builtin_nontemporal_store) 2775

__builtin_nontemporal_store(

a

, (float32x4_t *) p);

2813 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2815  float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ 2816  float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ 2817  row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ 2818  vget_low_f32(ROW23.val[0])); \ 2819  row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ 2820  vget_low_f32(ROW23.val[1])); \ 2821  row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ 2822  vget_high_f32(ROW23.val[0])); \ 2823  row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ 2824  vget_high_f32(ROW23.val[1])); \ 2829 #define _mm_ucomieq_ss _mm_comieq_ss 2830 #define _mm_ucomige_ss _mm_comige_ss 2831 #define _mm_ucomigt_ss _mm_comigt_ss 2832 #define _mm_ucomile_ss _mm_comile_ss 2833 #define _mm_ucomilt_ss _mm_comilt_ss 2834 #define _mm_ucomineq_ss _mm_comineq_ss 2840 #if defined(__GNUC__) || defined(__clang__) 2841 #pragma GCC diagnostic push 2842 #pragma GCC diagnostic ignored "-Wuninitialized" 2846 #if defined(__GNUC__) || defined(__clang__) 2847 #pragma GCC diagnostic pop 2855 #if defined(__GNUC__) || defined(__clang__) 2856 #pragma GCC diagnostic push 2857 #pragma GCC diagnostic ignored "-Wuninitialized" 2861 #if defined(__GNUC__) || defined(__clang__) 2862 #pragma GCC diagnostic pop 2877 #if defined(__aarch64__) 2883

float32x2x2_t

result

= vzip_f32(a1, b1);

2899 #if defined(__aarch64__) 2905

float32x2x2_t

result

= vzip_f32(a1, b1);

2968 #if defined(__aarch64__) 2969  return

vreinterpretq_m128d_f64(

2970

vaddq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

2972  double

*da = (

double

*) &

a

;

2973  double

*db = (

double

*) &

b

;

2975

c[0] = da[0] + db[0];

2976

c[1] = da[1] + db[1];

2977  return

vld1q_f32((float32_t *) c);

2991 #if defined(__aarch64__) 2994  double

*da = (

double

*) &

a

;

2995  double

*db = (

double

*) &

b

;

2997

c[0] = da[0] + db[0];

2999  return

vld1q_f32((float32_t *) c);

3151 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) 3156 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) 3195 #if defined(__aarch64__) 3250 #if defined(__aarch64__) 3252

vceqq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

3257

uint32x4_t swapped = vrev64q_u32(

cmp

);

3276 #if defined(__aarch64__) 3278

vcgeq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

3285

d[0] = (*(

double

*) &a0) >= (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3286

d[1] = (*(

double

*) &a1) >= (*(

double

*) &b1) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3298 #if defined(__aarch64__) 3306

d[0] = (*(

double

*) &a0) >= (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3357 #if defined(__aarch64__) 3359

vcgtq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

3366

d[0] = (*(

double

*) &a0) > (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3367

d[1] = (*(

double

*) &a1) > (*(

double

*) &b1) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3379 #if defined(__aarch64__) 3387

d[0] = (*(

double

*) &a0) > (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3399 #if defined(__aarch64__) 3401

vcleq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

3408

d[0] = (*(

double

*) &a0) <= (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3409

d[1] = (*(

double

*) &a1) <= (*(

double

*) &b1) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3421 #if defined(__aarch64__) 3429

d[0] = (*(

double

*) &a0) <= (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3475 #if defined(__aarch64__) 3477

vcltq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

3484

d[0] = (*(

double

*) &a0) < (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3485

d[1] = (*(

double

*) &a1) < (*(

double

*) &b1) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3497 #if defined(__aarch64__) 3504

d[0] = (*(

double

*) &a0) < (*(

double

*) &b0) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3516 #if defined(__aarch64__) 3518

vceqq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)))));

3523

uint32x4_t swapped = vrev64q_u32(

cmp

);

3542 #if defined(__aarch64__) 3544

vcgeq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)),

3553

!((*(

double

*) &a0) >= (*(

double

*) &b0)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3555

!((*(

double

*) &a1) >= (*(

double

*) &b1)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3575 #if defined(__aarch64__) 3577

vcgtq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)),

3586

!((*(

double

*) &a0) > (*(

double

*) &b0)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3588

!((*(

double

*) &a1) > (*(

double

*) &b1)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3608 #if defined(__aarch64__) 3610

vcleq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)),

3619

!((*(

double

*) &a0) <= (*(

double

*) &b0)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3621

!((*(

double

*) &a1) <= (*(

double

*) &b1)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3641 #if defined(__aarch64__) 3643

vcltq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)),

3652

!((*(

double

*) &a0) < (*(

double

*) &b0)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3654

!((*(

double

*) &a1) < (*(

double

*) &b1)) ? ~

UINT64_C

(0) :

UINT64_C

(0);

3674 #if defined(__aarch64__) 3676

uint64x2_t not_nan_a =

3677

vceqq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

a

));

3678

uint64x2_t not_nan_b =

3679

vceqq_f64(vreinterpretq_f64_m128d(

b

), vreinterpretq_f64_m128d(

b

));

3687

d[0] = ((*(

double

*) &a0) == (*(

double

*) &a0) &&

3688

(*(

double

*) &b0) == (*(

double

*) &b0))

3691

d[1] = ((*(

double

*) &a1) == (*(

double

*) &a1) &&

3692

(*(

double

*) &b1) == (*(

double

*) &b1))

3706 #if defined(__aarch64__) 3713

d[0] = ((*(

double

*) &a0) == (*(

double

*) &a0) &&

3714

(*(

double

*) &b0) == (*(

double

*) &b0))

3728 #if defined(__aarch64__) 3730

uint64x2_t not_nan_a =

3731

vceqq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

a

));

3732

uint64x2_t not_nan_b =

3733

vceqq_f64(vreinterpretq_f64_m128d(

b

), vreinterpretq_f64_m128d(

b

));

3735

vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));

3742

d[0] = ((*(

double

*) &a0) == (*(

double

*) &a0) &&

3743

(*(

double

*) &b0) == (*(

double

*) &b0))

3746

d[1] = ((*(

double

*) &a1) == (*(

double

*) &a1) &&

3747

(*(

double

*) &b1) == (*(

double

*) &b1))

3761 #if defined(__aarch64__) 3768

d[0] = ((*(

double

*) &a0) == (*(

double

*) &a0) &&

3769

(*(

double

*) &b0) == (*(

double

*) &b0))

3783 #if defined(__aarch64__) 3784  return

vgetq_lane_u64(vcgeq_f64(

a

,

b

), 0) & 0x1;

3789  return

(*(

double

*) &a0 >= *(

double

*) &b0);

3798 #if defined(__aarch64__) 3799  return

vgetq_lane_u64(vcgtq_f64(

a

,

b

), 0) & 0x1;

3804  return

(*(

double

*) &a0 > *(

double

*) &b0);

3813 #if defined(__aarch64__) 3814  return

vgetq_lane_u64(vcleq_f64(

a

,

b

), 0) & 0x1;

3819  return

(*(

double

*) &a0 <= *(

double

*) &b0);

3828 #if defined(__aarch64__) 3829  return

vgetq_lane_u64(vcltq_f64(

a

,

b

), 0) & 0x1;

3834  return

(*(

double

*) &a0 < *(

double

*) &b0);

3843 #if defined(__aarch64__) 3844  return

vgetq_lane_u64(vceqq_f64(

a

,

b

), 0) & 0x1;

3846

uint32x4_t a_not_nan =

3848

uint32x4_t b_not_nan =

3850

uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);

3853

uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),

3854

vreinterpretq_u64_u32(a_eq_b));

3855  return

vgetq_lane_u64(and_results, 0) & 0x1;

3879 #if defined(__aarch64__) 3880  return

vreinterpretq_m128d_f64(

3910  double

d0 = ((

double

*) &

rnd

)[0];

3911  double

d1 = ((

double

*) &

rnd

)[1];

3928  double

d0 = ((

double

*) &

rnd

)[0];

3929  double

d1 = ((

double

*) &

rnd

)[1];

3948 #if defined(__aarch64__) 3949

float32x2_t

tmp

= vcvt_f32_f64(vreinterpretq_f64_m128d(

a

));

3952  float

a0 = (float) ((

double

*) &

a

)[0];

3953  float

a1 = (float) ((

double

*) &

a

)[1];

3970 #if defined(__aarch64__) 3971  return

vreinterpretq_m128d_f64(

3993 #if defined(__aarch64__) 4005  float

*

f

= (

float

*) &

a

;

4008

uint32x4_t signmask = vdupq_n_u32(0x80000000);

4011

int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(

4013

int32x4_t r_trunc = vcvtq_s32_f32(

4015

int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(

4016

vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));

4017

int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),

4019

float32x4_t

delta

= vsubq_f32(

4021

vcvtq_f32_s32(r_trunc));

4022

uint32x4_t is_delta_half =

4023

vceqq_f32(

delta

, half);

4025

vbslq_s32(is_delta_half, r_even, r_normal));

4053 #if defined(__aarch64__) 4054  return

vreinterpretq_m128d_f64(

4070 #if defined(__aarch64__) 4071  return

(

double

) vgetq_lane_f64(vreinterpretq_f64_m128d(

a

), 0);

4073  return

((

double

*) &

a

)[0];

4085 #if defined(__aarch64__) 4086  return

(

int32_t

) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(

a

)), 0);

4089  double

ret = ((

double

*) &

rnd

)[0];

4102 #if defined(__aarch64__) 4103  return

(

int64_t

) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(

a

)), 0);

4106  double

ret = ((

double

*) &

rnd

)[0];

4117 #define _mm_cvtsd_si64x _mm_cvtsd_si64 4126 #if defined(__aarch64__) 4128

vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(

b

)), 0),

4158 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) 4166 #if defined(__aarch64__) 4167  return

vreinterpretq_m128d_f64(

4168

vsetq_lane_f64((

double

)

b

, vreinterpretq_f64_m128d(

a

), 0));

4170  double

bf = (double)

b

;

4181 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) 4203 #if defined(__aarch64__) 4204  return

vreinterpretq_m128d_f64(

4205

vsetq_lane_f64((

double

)

b

, vreinterpretq_f64_m128d(

a

), 0));

4207  double

bf = (double)

b

;

4226 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) 4232 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) 4246 #if defined(__aarch64__) 4247  return

vreinterpretq_m128d_f64(

4248

vsetq_lane_f64(d, vreinterpretq_f64_m128d(

a

), 0));

4260  double

a0 = ((

double

*) &

a

)[0];

4261  double

a1 = ((

double

*) &

a

)[1];

4270  double

a0 = ((

double

*) &

a

)[0];

4271  double

a1 = ((

double

*) &

a

)[1];

4292  double

ret = *((

double

*) &

a

);

4304 #if defined(__aarch64__) 4305  return

vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(

a

)), 0);

4307  double

ret = *((

double

*) &

a

);

4318 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) 4331 #if defined(__aarch64__) 4332  return

vreinterpretq_m128d_f64(

4333

vdivq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

4335  double

*da = (

double

*) &

a

;

4336  double

*db = (

double

*) &

b

;

4338

c[0] = da[0] / db[0];

4339

c[1] = da[1] / db[1];

4340  return

vld1q_f32((float32_t *) c);

4351 #if defined(__aarch64__) 4353

vdivq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

));

4354  return

vreinterpretq_m128d_f64(

4355

vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(

a

), 1),

tmp

, 1));

4365 #define _mm_extract_epi16(a, imm) \ 4366  vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) 4373 #define _mm_insert_epi16(a, b, imm) \ 4375  vreinterpretq_m128i_s16( \ 4376  vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ 4387 #if defined(__aarch64__) 4388  return

vreinterpretq_m128d_f64(vld1q_f64(p));

4390  const float

*

fp

= (

const float

*) p;

4403 #define _mm_load_pd1 _mm_load1_pd 4415 #if defined(__aarch64__) 4416  return

vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));

4418  const float

*

fp

= (

const float

*) p;

4440 #if defined(__aarch64__) 4441  return

vreinterpretq_m128d_f64(vld1q_dup_f64(p));

4457 #if defined(__aarch64__) 4458  return

vreinterpretq_m128d_f64(

4459

vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(

a

)), vld1_f64(p)));

4474

vcombine_s32(vld1_s32((

int32_t const

*) p), vcreate_s32(0)));

4487 #if defined(__aarch64__) 4488  return

vreinterpretq_m128d_f64(

4489

vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(

a

))));

4492

vcombine_f32(vld1_f32((

const float

*) p),

4507 #if defined(__aarch64__) 4508

float64x2_t v = vld1q_f64(p);

4509  return

vreinterpretq_m128d_f64(vextq_f64(v, v, 1));

4511

int64x2_t v = vld1q_s64((

const int64_t

*) p);

4539

vsetq_lane_s32(*(

const int32_t

*) p, vdupq_n_s32(0), 0));

4557

int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));

4558

int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));

4575

vst1q_s8((

int8_t

*) mem_addr, masked);

4601 #if defined(__aarch64__) 4602 #if SSE2NEON_PRECISE_MINMAX 4603

float64x2_t _a = vreinterpretq_f64_m128d(

a

);

4604

float64x2_t _b = vreinterpretq_f64_m128d(

b

);

4605  return

vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));

4607  return

vreinterpretq_m128d_f64(

4608

vmaxq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

4616

d[0] = (*(

double

*) &a0) > (*(

double

*) &b0) ? a0 : b0;

4617

d[1] = (*(

double

*) &a1) > (*(

double

*) &b1) ? a1 : b1;

4629 #if defined(__aarch64__) 4632  double

*da = (

double

*) &

a

;

4633  double

*db = (

double

*) &

b

;

4634  double

c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};

4662 #if defined(__aarch64__) 4663 #if SSE2NEON_PRECISE_MINMAX 4664

float64x2_t _a = vreinterpretq_f64_m128d(

a

);

4665

float64x2_t _b = vreinterpretq_f64_m128d(

b

);

4666  return

vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));

4668  return

vreinterpretq_m128d_f64(

4669

vminq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

4677

d[0] = (*(

double

*) &a0) < (*(

double

*) &b0) ? a0 : b0;

4678

d[1] = (*(

double

*) &a1) < (*(

double

*) &b1) ? a1 : b1;

4689 #if defined(__aarch64__) 4692  double

*da = (

double

*) &

a

;

4693  double

*db = (

double

*) &

b

;

4694  double

c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};

4757

uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(

input

, 7));

4772

uint32x4_t paired16 =

4773

vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

4786

uint64x2_t paired32 =

4787

vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

4800

uint8x16_t paired64 =

4801

vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

4808  return

vgetq_lane_u8(paired64, 0) | ((

int

) vgetq_lane_u8(paired64, 8) << 8);

4817

uint64x2_t high_bits = vshrq_n_u64(

input

, 63);

4818  return

vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);

4862 #if defined(__aarch64__) 4863  return

vreinterpretq_m128d_f64(

4864

vmulq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

4866  double

*da = (

double

*) &

a

;

4867  double

*db = (

double

*) &

b

;

4869

c[0] = da[0] * db[0];

4870

c[1] = da[1] * db[1];

4871  return

vld1q_f32((float32_t *) c);

4913

int32x4_t ab3210 = vmull_s16(a3210, b3210);

4916

int32x4_t ab7654 = vmull_s16(a7654, b7654);

4918

vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));

4930

uint32x4_t ab3210 = vmull_u16(a3210, b3210);

4931 #if defined(__aarch64__) 4934

uint16x8_t

r

= vuzp2q_u16(vreinterpretq_u16_u32(ab3210),

4935

vreinterpretq_u16_u32(ab7654));

4940

uint32x4_t ab7654 = vmull_u16(a7654, b7654);

4942

vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));

5039

__asm__ __volatile__(

"isb\n"

);

5049

uint16x8_t

t

= vpaddlq_u8(vabdq_u8((uint8x16_t)

a

, (uint8x16_t)

b

));

5090

vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));

5126 #if defined(__aarch64__) 5127  return

vreinterpretq_m128d_f64(vld1q_f64((float64_t *)

data

));

5136 #define _mm_set_pd1 _mm_set1_pd 5204 #if defined(__aarch64__) 5205  return

vreinterpretq_m128d_f64(vdupq_n_f64(d));

5285 #if defined(__aarch64__) 5286  return

vreinterpretq_m128d_f64(vdupq_n_f64(0));

5303 #if __has_builtin(__builtin_shufflevector) 5304 #define _mm_shuffle_epi32(a, imm) \ 5306  int32x4_t _input = vreinterpretq_s32_m128i(a); \ 5307  int32x4_t _shuf = __builtin_shufflevector( \ 5308  _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ 5309  ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ 5310  vreinterpretq_m128i_s32(_shuf); \ 5313 #define _mm_shuffle_epi32(a, imm) \ 5317  case _MM_SHUFFLE(1, 0, 3, 2): \ 5318  ret = _mm_shuffle_epi_1032((a)); \ 5320  case _MM_SHUFFLE(2, 3, 0, 1): \ 5321  ret = _mm_shuffle_epi_2301((a)); \ 5323  case _MM_SHUFFLE(0, 3, 2, 1): \ 5324  ret = _mm_shuffle_epi_0321((a)); \ 5326  case _MM_SHUFFLE(2, 1, 0, 3): \ 5327  ret = _mm_shuffle_epi_2103((a)); \ 5329  case _MM_SHUFFLE(1, 0, 1, 0): \ 5330  ret = _mm_shuffle_epi_1010((a)); \ 5332  case _MM_SHUFFLE(1, 0, 0, 1): \ 5333  ret = _mm_shuffle_epi_1001((a)); \ 5335  case _MM_SHUFFLE(0, 1, 0, 1): \ 5336  ret = _mm_shuffle_epi_0101((a)); \ 5338  case _MM_SHUFFLE(2, 2, 1, 1): \ 5339  ret = _mm_shuffle_epi_2211((a)); \ 5341  case _MM_SHUFFLE(0, 1, 2, 2): \ 5342  ret = _mm_shuffle_epi_0122((a)); \ 5344  case _MM_SHUFFLE(3, 3, 3, 2): \ 5345  ret = _mm_shuffle_epi_3332((a)); \ 5347  case _MM_SHUFFLE(0, 0, 0, 0): \ 5348  ret = _mm_shuffle_epi32_splat((a), 0); \ 5350  case _MM_SHUFFLE(1, 1, 1, 1): \ 5351  ret = _mm_shuffle_epi32_splat((a), 1); \ 5353  case _MM_SHUFFLE(2, 2, 2, 2): \ 5354  ret = _mm_shuffle_epi32_splat((a), 2); \ 5356  case _MM_SHUFFLE(3, 3, 3, 3): \ 5357  ret = _mm_shuffle_epi32_splat((a), 3); \ 5360  ret = _mm_shuffle_epi32_default((a), (imm)); \ 5374 #if __has_builtin(__builtin_shufflevector) 5375 #define _mm_shuffle_pd(a, b, imm8) \ 5376  vreinterpretq_m128d_s64(__builtin_shufflevector( \ 5377  vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ 5378  ((imm8 & 0x2) >> 1) + 2)) 5380 #define _mm_shuffle_pd(a, b, imm8) \ 5381  _mm_castsi128_pd(_mm_set_epi64x( \ 5382  vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ 5383  vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) 5388 #if __has_builtin(__builtin_shufflevector) 5389 #define _mm_shufflehi_epi16(a, imm) \ 5391  int16x8_t _input = vreinterpretq_s16_m128i(a); \ 5392  int16x8_t _shuf = __builtin_shufflevector( \ 5393  _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ 5394  (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ 5395  (((imm) >> 6) & 0x3) + 4); \ 5396  vreinterpretq_m128i_s16(_shuf); \ 5399 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) 5404 #if __has_builtin(__builtin_shufflevector) 5405 #define _mm_shufflelo_epi16(a, imm) \ 5407  int16x8_t _input = vreinterpretq_s16_m128i(a); \ 5408  int16x8_t _shuf = __builtin_shufflevector( \ 5409  _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ 5410  (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ 5411  vreinterpretq_m128i_s16(_shuf); \ 5414 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) 5436

int16x8_t vc = vdupq_n_s16((

int16_t

) c);

5459

int32x4_t vc = vdupq_n_s32((

int32_t

) c);

5482

int64x2_t vc = vdupq_n_s64((

int64_t

) c);

5565

vld1q_u8(((

uint8_t const

*)

tmp

) + (16 - imm)));

5573 #if defined(__aarch64__) 5574  return

vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(

a

)));

5576  double

a0 = sqrt(((

double

*) &

a

)[0]);

5577  double

a1 = sqrt(((

double

*) &

a

)[1]);

5588 #if defined(__aarch64__) 5591  return _mm_set_pd

(((

double

*) &

a

)[1], sqrt(((

double

*) &

b

)[0]));

5652  const int count

= (imm & ~15) ? 15 : imm;

5653  return

(

__m128i

) vshlq_s16((int16x8_t)

a

, vdupq_n_s16(-

count

));

5670 #define _mm_srai_epi32(a, imm) \ 5673  if (_sse2neon_unlikely((imm) == 0)) { \ 5675  } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ 5676  ret = vreinterpretq_m128i_s32( \ 5677  vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ 5679  ret = vreinterpretq_m128i_s32( \ 5680  vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ 5704

int16x8_t vc = vdupq_n_s16(-(

int16_t

) c);

5727

int32x4_t vc = vdupq_n_s32(-(

int32_t

) c);

5750

int64x2_t vc = vdupq_n_s64(-(

int64_t

) c);

5767 #define _mm_srli_epi16(a, imm) \ 5770  if (_sse2neon_unlikely((imm) & ~15)) { \ 5771  ret = _mm_setzero_si128(); \ 5773  ret = vreinterpretq_m128i_u16( \ 5774  vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \ 5793 #define _mm_srli_epi32(a, imm) \ 5796  if (_sse2neon_unlikely((imm) & ~31)) { \ 5797  ret = _mm_setzero_si128(); \ 5799  ret = vreinterpretq_m128i_u32( \ 5800  vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \ 5818 #define _mm_srli_epi64(a, imm) \ 5821  if (_sse2neon_unlikely((imm) & ~63)) { \ 5822  ret = _mm_setzero_si128(); \ 5824  ret = vreinterpretq_m128i_u64( \ 5825  vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \ 5854 #if defined(__aarch64__) 5855

vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(

a

));

5867 #if defined(__aarch64__) 5868

float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(

a

));

5869

vst1q_f64((float64_t *) mem_addr,

5870

vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));

5873

vst1q_f32((float32_t *) mem_addr,

5883 #if defined(__aarch64__) 5884

vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(

a

)));

5901 #define _mm_store1_pd _mm_store_pd1 5911 #if defined(__aarch64__) 5912

vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(

a

)));

5935 #if defined(__aarch64__) 5936

vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(

a

)));

5986 #if __has_builtin(__builtin_nontemporal_store) 5987

__builtin_nontemporal_store(

a

, (float32x4_t *) p);

5988 #elif defined(__aarch64__) 5989

vst1q_f64(p, vreinterpretq_f64_m128d(

a

));

6001 #if __has_builtin(__builtin_nontemporal_store) 6002

__builtin_nontemporal_store(

a

, p);

6014

vst1q_lane_s32((

int32_t

*) p, vdupq_n_s32(

a

), 0);

6081 #if defined(__aarch64__) 6082  return

vreinterpretq_m128d_f64(

6083

vsubq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

6085  double

*da = (

double

*) &

a

;

6086  double

*db = (

double

*) &

b

;

6088

c[0] = da[0] - db[0];

6089

c[1] = da[1] - db[1];

6090  return

vld1q_f32((float32_t *) c);

6169 #define _mm_ucomieq_sd _mm_comieq_sd 6170 #define _mm_ucomige_sd _mm_comige_sd 6171 #define _mm_ucomigt_sd _mm_comigt_sd 6172 #define _mm_ucomile_sd _mm_comile_sd 6173 #define _mm_ucomilt_sd _mm_comilt_sd 6174 #define _mm_ucomineq_sd _mm_comineq_sd 6180 #if defined(__GNUC__) || defined(__clang__) 6181 #pragma GCC diagnostic push 6182 #pragma GCC diagnostic ignored "-Wuninitialized" 6186 #if defined(__GNUC__) || defined(__clang__) 6187 #pragma GCC diagnostic pop 6206 #if defined(__aarch64__) 6212

int16x4x2_t

result

= vzip_s16(a1, b1);

6222 #if defined(__aarch64__) 6228

int32x2x2_t

result

= vzip_s32(a1, b1);

6259 #if defined(__aarch64__) 6267

int8x8x2_t

result

= vzip_s8(a1, b1);

6285 #if defined(__aarch64__) 6286  return

vreinterpretq_m128d_f64(

6287

vzip2q_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

6310 #if defined(__aarch64__) 6316

int16x4x2_t

result

= vzip_s16(a1, b1);

6332 #if defined(__aarch64__) 6338

int32x2x2_t

result

= vzip_s32(a1, b1);

6364 #if defined(__aarch64__) 6370

int8x8x2_t

result

= vzip_s8(a1, b1);

6388 #if defined(__aarch64__) 6389  return

vreinterpretq_m128d_f64(

6390

vzip1q_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

6440 #if defined(__aarch64__) 6441  return

vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(

a

),

6442

vreinterpretq_f64_m128d(

b

),

6443

vreinterpretq_f64_m128d(

mask

)));

6456 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) 6470 #if defined(__aarch64__) 6471  return

vreinterpretq_m128d_f64(

6472

vpaddq_f64(vreinterpretq_f64_m128d(

a

), vreinterpretq_f64_m128d(

b

)));

6474  double

*da = (

double

*) &

a

;

6475  double

*db = (

double

*) &

b

;

6476  double

c[] = {da[0] + da[1], db[0] + db[1]};

6486 #if defined(__aarch64__) 6495

vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));

6504 #if defined(__aarch64__) 6505

float64x2_t

a

= vreinterpretq_f64_m128d(_a);

6506

float64x2_t

b

= vreinterpretq_f64_m128d(_b);

6507  return

vreinterpretq_m128d_f64(

6508

vsubq_f64(vuzp1q_f64(

a

,

b

), vuzp2q_f64(

a

,

b

)));

6510  double

*da = (

double

*) &_a;

6511  double

*db = (

double

*) &_b;

6512  double

c[] = {da[0] - da[1], db[0] - db[1]};

6524 #if defined(__aarch64__) 6526

vsubq_f32(vuzp1q_f32(

a

,

b

), vuzp2q_f32(

a

,

b

)));

6528

float32x4x2_t c = vuzpq_f32(

a

,

b

);

6540 #define _mm_lddqu_si128 _mm_loadu_si128 6549 #define _mm_loaddup_pd _mm_load1_pd 6556 #if defined(__aarch64__) 6557  return

vreinterpretq_m128d_f64(

6558

vdupq_laneq_f64(vreinterpretq_f64_m128d(

a

), 0));

6570 #if __has_builtin(__builtin_shufflevector) 6586 #if __has_builtin(__builtin_shufflevector) 6699  tmp

[1] = vdupq_n_u8(0);

6715 #define _mm_alignr_pi8(a, b, imm) \ 6718  if (_sse2neon_unlikely((imm) >= 16)) { \ 6719  ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ 6721  uint8x8_t tmp_low, tmp_high; \ 6723  const int idx = (imm) -8; \ 6724  tmp_low = vreinterpret_u8_m64(a); \ 6725  tmp_high = vdup_n_u8(0); \ 6726  ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ 6728  const int idx = (imm); \ 6729  tmp_low = vreinterpret_u8_m64(b); \ 6730  tmp_high = vreinterpret_u8_m64(a); \ 6731  ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ 6743 #if defined(__aarch64__) 6747

vcombine_s16(vpadd_s16(vget_low_s16(

a

), vget_high_s16(

a

)),

6748

vpadd_s16(vget_low_s16(

b

), vget_high_s16(

b

))));

6759

vcombine_s32(vpadd_s32(vget_low_s32(

a

), vget_high_s32(

a

)),

6760

vpadd_s32(vget_low_s32(

b

), vget_high_s32(

b

))));

6785 #if defined(__aarch64__) 6788  return

vreinterpretq_s64_s16(

6789

vqaddq_s16(vuzp1q_s16(

a

,

b

), vuzp2q_s16(

a

,

b

)));

6796

int16x8_t ab0246 = vcombine_s16(vmovn_s32(

a

), vmovn_s32(

b

));

6797

int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(

a

, 16), vshrn_n_s32(

b

, 16));

6810 #if defined(__aarch64__) 6811  return

vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(

a

,

b

), vuzp2_s16(

a

,

b

)));

6813

int16x4x2_t res = vuzp_s16(

a

,

b

);

6814  return

vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));

6825 #if defined(__aarch64__) 6827

vsubq_s16(vuzp1q_s16(

a

,

b

), vuzp2q_s16(

a

,

b

)));

6829

int16x8x2_t c = vuzpq_s16(

a

,

b

);

6841 #if defined(__aarch64__) 6843

vsubq_s32(vuzp1q_s32(

a

,

b

), vuzp2q_s32(

a

,

b

)));

6845

int32x4x2_t c = vuzpq_s32(

a

,

b

);

6857 #if defined(__aarch64__) 6860

int16x4x2_t c = vuzp_s16(

a

,

b

);

6872 #if defined(__aarch64__) 6875

int32x2x2_t c = vuzp_s32(

a

,

b

);

6887 #if defined(__aarch64__) 6889

vqsubq_s16(vuzp1q_s16(

a

,

b

), vuzp2q_s16(

a

,

b

)));

6891

int16x8x2_t c = vuzpq_s16(

a

,

b

);

6903 #if defined(__aarch64__) 6906

int16x4x2_t c = vuzp_s16(

a

,

b

);

6923 #if defined(__aarch64__) 6926

int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(

a

))),

6927

vmovl_s8(vget_low_s8(

b

)));

6928

int16x8_t

th

= vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(

a

))),

6929

vmovl_s8(vget_high_s8(

b

)));

6931

vqaddq_s16(vuzp1q_s16(tl,

th

), vuzp2q_s16(tl,

th

)));

6939

int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(

a

, 8));

6940

int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(

a

, vdupq_n_u16(0xff00)));

6943

int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(

b

, 8), 8);

6944

int16x8_t b_odd = vshrq_n_s16(

b

, 8);

6947

int16x8_t prod1 = vmulq_s16(a_even, b_even);

6948

int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

6966

int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(

a

, 8));

6967

int16x4_t a_even = vreinterpret_s16_u16(vand_u16(

a

, vdup_n_u16(0xff)));

6970

int16x4_t b_even = vshr_n_s16(vshl_n_s16(

b

, 8), 8);

6971

int16x4_t b_odd = vshr_n_s16(

b

, 8);

6974

int16x4_t prod1 = vmul_s16(a_even, b_even);

6975

int16x4_t prod2 = vmul_s16(a_odd, b_odd);

7003

int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);

7004

int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);

7016

int32x4_t mul_extend =

7030

uint8x16_t idx_masked =

7031

vandq_u8(idx, vdupq_n_u8(0x8F));

7032 #if defined(__aarch64__) 7034 #elif defined(__GNUC__) 7038

__asm__ __volatile__(

7039  "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" 7040  "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" 7042

: [tbl]

"w"

(tbl), [idx]

"w"

(idx_masked));

7046

int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};

7048

vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),

7049

vtbl2_s8(a_split, vget_high_u8(idx_masked))));

7069  const

int8x8_t controlMask =

7096

uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(

b

, 15));

7098 #if defined(__aarch64__) 7099

int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(

b

));

7101

int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(

b

, vdupq_n_s16(0)));

7106

int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(

a

),

a

);

7108

int16x8_t res = vbicq_s16(masked, zeroMask);

7133

uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(

b

, 31));

7136 #if defined(__aarch64__) 7137

int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(

b

));

7139

int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(

b

, vdupq_n_s32(0)));

7144

int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(

a

),

a

);

7146

int32x4_t res = vbicq_s32(masked, zeroMask);

7171

uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(

b

, 7));

7174 #if defined(__aarch64__) 7175

int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(

b

));

7177

int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(

b

, vdupq_n_s8(0)));

7182

int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(

a

),

a

);

7184

int8x16_t res = vbicq_s8(masked, zeroMask);

7212

uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(

b

, 15));

7215 #if defined(__aarch64__) 7216

int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(

b

));

7218

int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(

b

, vdup_n_s16(0)));

7223

int16x4_t masked = vbsl_s16(ltMask, vneg_s16(

a

),

a

);

7225

int16x4_t res = vbic_s16(masked, zeroMask);

7253

uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(

b

, 31));

7256 #if defined(__aarch64__) 7257

int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(

b

));

7259

int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(

b

, vdup_n_s32(0)));

7264

int32x2_t masked = vbsl_s32(ltMask, vneg_s32(

a

),

a

);

7266

int32x2_t res = vbic_s32(masked, zeroMask);

7294

uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(

b

, 7));

7297 #if defined(__aarch64__) 7298

int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(

b

));

7300

int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(

b

, vdup_n_s8(0)));

7305

int8x8_t masked = vbsl_s8(ltMask, vneg_s8(

a

),

a

);

7307

int8x8_t res = vbic_s8(masked, zeroMask);

7327 #define _mm_blend_epi16(a, b, imm) \ 7329  const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ 7330  ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ 7331  ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ 7332  ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ 7333  ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ 7334  ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ 7335  ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ 7336  ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ 7337  uint16x8_t _mask_vec = vld1q_u16(_mask); \ 7338  uint16x8_t _a = vreinterpretq_u16_m128i(a); \ 7339  uint16x8_t _b = vreinterpretq_u16_m128i(b); \ 7340  vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ 7346 #define _mm_blend_pd(a, b, imm) \ 7348  const uint64_t _mask[2] = { \ 7349  ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ 7350  ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ 7351  uint64x2_t _mask_vec = vld1q_u64(_mask); \ 7352  uint64x2_t _a = vreinterpretq_u64_m128d(a); \ 7353  uint64x2_t _b = vreinterpretq_u64_m128d(b); \ 7354  vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ 7367

uint32x4_t

mask

= vld1q_u32(

data

);

7401 #if defined(__aarch64__) 7402

float64x2_t

a

= vreinterpretq_f64_m128d(_a);

7403

float64x2_t

b

= vreinterpretq_f64_m128d(_b);

7404  return

vreinterpretq_m128d_f64(vbslq_f64(

mask

,

b

,

a

));

7431 #if defined(__aarch64__) 7432  return

vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(

a

)));

7434  double

*

f

= (

double

*) &

a

;

7445 #if defined(__aarch64__) 7448  float

*

f

= (

float

*) &

a

;

7449  return _mm_set_ps

(ceilf(

f

[3]), ceilf(

f

[2]), ceilf(

f

[1]), ceilf(

f

[0]));

// _mm_cmpeq_epi64 (ARMv7 path)
    uint32x4_t swapped = vrev64q_u32(cmp);

// _mm_cvtepi16_epi64
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));

// _mm_cvtepi8_epi16
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));

// _mm_cvtepi8_epi32
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));

// _mm_cvtepi8_epi64
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));

// _mm_cvtepu16_epi64
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));

// _mm_cvtepu8_epi16
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));

// _mm_cvtepu8_epi32
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));

// _mm_cvtepu8_epi64
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));

// _mm_dp_pd (SSE2NEON_PRECISE_DP path: mask the per-lane products)
#if defined(__aarch64__)
    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                             : 0;
    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                             : 0;
#else
    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
#endif
    __m128d tmp = _mm_set_pd(d1, d0);

    // sum the two products
#if defined(__aarch64__)
    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
#else
    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
#endif

// _mm_dp_ps: broadcast the masked sum s into the lanes selected by imm
    float32x4_t res = {
        (imm & 0x1) ? s : 0,
        (imm & 0x2) ? s : 0,
        (imm & 0x4) ? s : 0,
        (imm & 0x8) ? s : 0,
    };
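As a reminder of what the dot-product immediates mean in the fragments above: the high nibble selects which products enter the sum, the low nibble selects which result lanes receive it. The scalar sketch below models the single-precision case; dp_ps_ref is an illustrative name, not part of sse2neon.

/* Scalar model of the DPPS immediate semantics. */
static void dp_ps_ref(float res[4], const float a[4], const float b[4], int imm)
{
    float s = 0.0f;
    for (int i = 0; i < 4; i++)
        if (imm & (0x10 << i))
            s += a[i] * b[i];
    for (int i = 0; i < 4; i++)
        res[i] = (imm & (1 << i)) ? s : 0.0f;
}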

#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))

// _mm_floor_pd
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
#else
    double *f = (double *) &a;
    return _mm_set_pd(floor(f[1]), floor(f[0]));
#endif

// _mm_floor_ps
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
#else
    float *f = (float *) &a;
    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
#endif

#define _mm_insert_epi32(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s32(                                     \
            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
    })

#define _mm_insert_epi64(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s64(                                     \
            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
    })

#define _mm_insert_epi8(a, b, imm)                                 \
    __extension__({                                                \
        vreinterpretq_m128i_s8(                                    \
            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
    })

#define _mm_insert_ps(a, b, imm8)                                              \
    __extension__({                                                            \
        float32x4_t tmp1 =                                                     \
            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
                           vreinterpretq_f32_m128(a), 0);                      \
        float32x4_t tmp2 =                                                     \
            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
                           ((imm8 >> 4) & 0x3));                               \
        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
        uint32x4_t mask = vld1q_u32(data);                                     \
        float32x4_t all_zeros = vdupq_n_f32(0);                                \
                                                                               \
        vreinterpretq_m128_f32(                                                \
            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
    })

// _mm_minpos_epu16 (ARMv7 path)
    for (i = 0; i < 8; i++) {

// _mm_mpsadbw_epu8
    switch (imm & 0x4) {
    case 0:
        // use a as is
        _a = vreinterpretq_u8_m128i(a);
        break;
    case 4:
        // use a shifted right by 32 bits
        _a = vextq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(a), 4);
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#endif
        break;
    }

    switch (imm & 0x3) {
    case 0:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
        break;
    case 1:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
        break;
    case 2:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
        break;
    case 3:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#endif
        break;
    }

    int16x8_t c04, c15, c26, c37;
    uint8x8_t low_b = vget_low_u8(_b);
    c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
#if defined(__aarch64__)
    // pair the partial sums, then interleave and add
    c04 = vpaddq_s16(c04, c26);
    c15 = vpaddq_s16(c15, c37);

    int32x4_t trn1_c =
        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    int32x4_t trn2_c =
        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_s32(trn1_c),
                                             vreinterpretq_s16_s32(trn2_c)));
#else
    int16x4_t c01, c23, c45, c67;
    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));

    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
#endif
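The block above computes the eight 4-byte sums of absolute differences defined by MPSADBW. The scalar model below states those semantics directly; mpsadbw_ref is an illustrative name, not part of sse2neon.

#include <stdint.h>
#include <stdlib.h>

/* imm bits 0-1 pick a 4-byte block of b, bit 2 picks the starting half of a;
 * each of the 8 results is the SAD of a 4-byte window of a against that block. */
static void mpsadbw_ref(uint16_t res[8], const uint8_t a[16],
                        const uint8_t b[16], int imm)
{
    const uint8_t *blk_b = b + 4 * (imm & 0x3);
    const uint8_t *base_a = a + ((imm & 0x4) ? 4 : 0);
    for (int i = 0; i < 8; i++) {
        uint16_t sum = 0;
        for (int j = 0; j < 4; j++)
            sum += (uint16_t) abs(base_a[i + j] - blk_b[j]);
        res[i] = sum;
    }
}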

// _mm_round_pd (AArch64 path: direct FP rounding instructions)
    // _MM_FROUND_TO_NEAREST_INT
    return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    // _MM_FROUND_TO_ZERO
    return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    // _MM_FROUND_CUR_DIRECTION
    return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));

// _mm_round_pd (ARMv7 path)
    double *v_double = (double *) &a;

    // round to nearest, ties to even
    double res[2], tmp;
    for (int i = 0; i < 2; i++) {
        tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
        double roundDown = floor(tmp);
        double roundUp = ceil(tmp);
        double diffDown = tmp - roundDown;
        double diffUp = roundUp - tmp;
        if (diffDown < diffUp) {
            res[i] = roundDown;
        } else if (diffDown > diffUp) {
            res[i] = roundUp;
        } else {
            // equidistant: pick the even neighbour
            double half = roundDown / 2;
            if (half != floor(half)) {
                res[i] = roundUp;
            } else {
                res[i] = roundDown;
            }
        }
        res[i] = (v_double[i] < 0) ? -res[i] : res[i];
    }

    // round toward zero
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));

// _mm_round_ps (ARMv7 path)
    float *v_float = (float *) &a;

    // round to nearest, ties to even
    uint32x4_t signmask = vdupq_n_u32(0x80000000);
    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
        vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
    int32x4_t r_trunc = vcvtq_s32_f32(
        vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
    float32x4_t delta = vsubq_f32(
        vreinterpretq_f32_m128(a),
        vcvtq_f32_s32(r_trunc)); /* delta = a - [a] */
    uint32x4_t is_delta_half =
        vceqq_f32(delta, half); /* delta == +/- 0.5 */
    return vreinterpretq_m128_f32(
        vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));

    // round toward zero
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
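The ARMv7 branch above emulates round-to-nearest with ties-to-even because the plain vcvt conversion truncates. The scalar sketch below shows the same tie-breaking rule; round_half_even is an illustrative name, not part of sse2neon.

#include <math.h>

/* Round to nearest, ties to even -- the rule the r_normal/r_even blend implements. */
static float round_half_even(float x)
{
    float down = floorf(x), up = ceilf(x);
    float dd = x - down, du = up - x;
    if (dd < du)
        return down;
    if (dd > du)
        return up;
    /* exact tie: pick the even neighbour */
    return (fmodf(down, 2.0f) == 0.0f) ? down : up;
}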

// _mm_stream_load_si128 (when the compiler offers __builtin_nontemporal_load)
    return __builtin_nontemporal_load(p);

// _mm_test_all_ones
    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
           ~(uint64_t) 0;

// _mm_test_all_zeros
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));

// _mm_test_mix_ones_zeros
    uint64x2_t result = vandq_u64(zf, cf);
    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));

// _mm_testc_si128
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));

#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)

// _mm_testz_si128
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
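The PTEST-style predicates above each reduce to two 64-bit lane checks. The scalar sketch below restates the testz/testc semantics; the *_ref helpers are illustrative names, not part of sse2neon.

#include <stdint.h>

/* testz is 1 when (a & b) is all zero; testc is 1 when (~a & b) is all zero. */
static int testz_ref(const uint64_t a[2], const uint64_t b[2])
{
    return ((a[0] & b[0]) | (a[1] & b[1])) == 0;
}

static int testc_ref(const uint64_t a[2], const uint64_t b[2])
{
    return ((~a[0] & b[0]) | (~a[1] & b[1])) == 0;
}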

// _mm_crc32_u16 (AArch64 with the CRC32 extension)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));

// _mm_crc32_u32 (AArch64 with the CRC32 extension)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));

// _mm_crc32_u64 (AArch64 with the CRC32 extension)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));

// _mm_crc32_u8 (AArch64 with the CRC32 extension)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));

// _mm_crc32_u8 (software fallback, bit-by-bit CRC-32C)
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
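The software fallback above consumes one input bit per iteration using the reflected CRC-32C polynomial 0x82f63b78. The sketch below applies the same per-byte update over a whole buffer; crc32c_ref is an illustrative name, and unlike _mm_crc32_u8 it adds the conventional initial and final inversions.

#include <stddef.h>
#include <stdint.h>

/* Standalone CRC-32C (Castagnoli) over a byte buffer, built from the same
 * bit-by-bit update as the fallback above. */
static uint32_t crc32c_ref(uint32_t crc, const uint8_t *buf, size_t len)
{
    crc = ~crc;
    for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int bit = 0; bit < 8; bit++)
            crc = (crc & 1) ? (crc >> 1) ^ UINT32_C(0x82f63b78) : (crc >> 1);
    }
    return ~crc;
}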

#if !defined(__ARM_FEATURE_CRYPTO)
// AES S-box, used by the crypto-less fallbacks below
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
    }

#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

// _mm_aesenc_si128 (AArch64 fallback without the Crypto Extension)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

// _mm_aesenc_si128 (table-based path): one AES round via four T-table lookups
    return _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

// _mm_aesenclast_si128 (table fallback)
    for (int i = 0; i < 16; i++)

// _mm_aeskeygenassist_si128 (table fallback)
    for (int i = 0; i < 4; ++i) {
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);

#undef SSE2NEON_AES_DATA

// _mm_aeskeygenassist_si128 (ARM Crypto Extension path)
    uint8x16_t dest = {
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};

// _mm_clmulepi64_si128: select the 64-bit halves of the operands
    switch (imm & 0x11) {

// _sse2neon_mm_get_denormals_zero_mode: read the FP control register
#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value));
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value));
#endif

// _mm_popcnt_u32
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
#endif

// _mm_popcnt_u64
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
#endif
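Both popcount fallbacks above count bits per byte with vcnt_u8 and then widen the partial sums with pairwise adds. The portable scalar reference below computes the same result; popcount64_ref is an illustrative name, not part of sse2neon.

#include <stdint.h>

/* Count set bits one byte at a time, mirroring vcnt_u8 + vpaddl widening. */
static int popcount64_ref(uint64_t x)
{
    int count = 0;
    for (int i = 0; i < 8; i++) {
        uint8_t byte = (uint8_t)(x >> (8 * i));
        while (byte) {
            count += byte & 1;
            byte >>= 1;
        }
    }
    return count;
}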

// _sse2neon_mm_set_denormals_zero_mode: read, modify, and write back the FP control register
#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value));
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value));
#endif

#if defined(__aarch64__)
    asm volatile("msr FPCR, %0" ::"r"(r));
#else
    asm volatile("vmsr FPSCR, %0" ::"r"(r));
#endif

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif
