)
77 #pragma push_macro("ALIGN_STRUCT")
78 #define FORCE_INLINE static inline __attribute__((always_inline)) 79 #define ALIGN_STRUCT(x) __attribute__((aligned(x))) 80 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) 81 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) 83 #warning "Macro name collisions may happen with unsupported compiler." 85 #define FORCE_INLINE static inline 88 #define ALIGN_STRUCT(x) __declspec(align(x)) 90 #define _sse2neon_likely(x) (x) 91 #define _sse2neon_unlikely(x) (x) 100 #if defined(__arm__) && __ARM_ARCH == 7 105 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) 106 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." 108 #if !defined(__clang__) 109 #pragma GCC push_options 110 #pragma GCC target("fpu=neon")
112 #elif defined(__aarch64__) 113 #if !defined(__clang__) 114 #pragma GCC push_options 115 #pragma GCC target("+simd")
118 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." 122 #include <arm_neon.h> 125 #if !defined(__aarch64__) 132 #ifndef __has_builtin 134 #if defined(__GNUC__) && (__GNUC__ <= 9) 135 #define __has_builtin(x) HAS##x 136 #define HAS__builtin_popcount 1 137 #define HAS__builtin_popcountll 1 139 #define __has_builtin(x) 0 151 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ 152 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) 155 #define _MM_FROUND_TO_NEAREST_INT 0x00 156 #define _MM_FROUND_TO_NEG_INF 0x01 157 #define _MM_FROUND_TO_POS_INF 0x02 158 #define _MM_FROUND_TO_ZERO 0x03 159 #define _MM_FROUND_CUR_DIRECTION 0x04 160 #define _MM_FROUND_NO_EXC 0x08 161 #define _MM_FROUND_RAISE_EXC 0x00 162 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) 163 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) 164 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) 165 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) 166 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) 167 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) 168 #define _MM_ROUND_NEAREST 0x0000 169 #define _MM_ROUND_DOWN 0x2000 170 #define _MM_ROUND_UP 0x4000 171 #define _MM_ROUND_TOWARD_ZERO 0x6000 173 #define _MM_FLUSH_ZERO_MASK 0x8000 174 #define _MM_FLUSH_ZERO_ON 0x8000 175 #define _MM_FLUSH_ZERO_OFF 0x0000 177 #define _MM_DENORMALS_ZERO_MASK 0x0040 178 #define _MM_DENORMALS_ZERO_ON 0x0040 179 #define _MM_DENORMALS_ZERO_OFF 0x0000 182 #define __constrange(a, b) const 195 #if defined(__aarch64__) 204 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) 205 #if (defined(__x86_64__) || defined(__i386__)) 206 #define __int64 long long 208 #define __int64 int64_t 214 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) 215 #define vreinterpretq_m128_f32(x) (x) 216 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) 218 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) 219 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) 220 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) 221 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) 223 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) 224 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) 225 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) 226 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) 228 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) 229 #define vreinterpretq_f32_m128(x) (x) 230 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) 232 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) 233 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) 234 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) 235 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) 237 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) 238 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) 239 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) 240 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) 242 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) 243 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) 244 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) 245 #define vreinterpretq_m128i_s64(x) (x) 247 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) 248 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) 249 #define vreinterpretq_m128i_u32(x) 
vreinterpretq_s64_u32(x) 250 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) 252 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) 253 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) 255 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) 256 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) 257 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) 258 #define vreinterpretq_s64_m128i(x) (x) 260 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) 261 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) 262 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) 263 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) 265 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) 266 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) 267 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) 268 #define vreinterpret_m64_s64(x) (x) 270 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) 271 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) 272 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) 273 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) 275 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) 276 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) 277 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) 279 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) 280 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) 281 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) 282 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) 284 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) 285 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) 286 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) 287 #define vreinterpret_s64_m64(x) (x) 289 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) 291 #if defined(__aarch64__) 292 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) 293 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) 295 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) 297 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) 298 #define vreinterpretq_m128d_f64(x) (x) 300 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) 302 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) 303 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) 305 #define vreinterpretq_f64_m128d(x) (x) 306 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) 308 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) 309 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) 311 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) 312 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) 314 #define vreinterpretq_m128d_f32(x) (x) 316 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) 318 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) 319 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) 321 #define vreinterpretq_f32_m128d(x) (x) 360 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) 361 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) 362 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) 365 #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode 366 #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode 367 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode 368 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode 402 #if 
defined(__GNUC__) && !defined(__clang__) && \ 403 ((__GNUC__ <= 10 && defined(__arm__)) || \ 404 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ 405 (__GNUC__ <= 9 && defined(__aarch64__))) 409ret.val[0] = vld1q_u8(p + 0);
410ret.val[1] = vld1q_u8(p + 16);
411ret.val[2] = vld1q_u8(p + 32);
412ret.val[3] = vld1q_u8(p + 48);
419 returnvld1q_u8_x4(p);
520 #if defined(__aarch64__) 546float32x2_t a21 = vget_high_f32(
548float32x2_t b03 = vget_low_f32(
555float32x2_t a03 = vget_low_f32(
557float32x2_t b21 = vget_high_f32(
620float32x2_t a02 = vset_lane_f32(a0, a22, 1);
638float32x2_t b20 = vset_lane_f32(b2, b00, 1);
645float32_t b2 = vgetq_lane_f32(
b, 2);
647float32x2_t b20 = vset_lane_f32(b2, b00, 1);
654float32_t b2 = vgetq_lane_f32(
b, 2);
656float32x2_t b20 = vset_lane_f32(b2, b00, 1);
666*c = (
t- *sum) - y;
670 #if defined(__ARM_FEATURE_CRYPTO) 674poly64_t
a= vget_lane_p64(vreinterpret_p64_u64(_a), 0);
675poly64_t
b= vget_lane_p64(vreinterpret_p64_u64(_b), 0);
676 returnvreinterpretq_u64_p128(vmull_p64(
a,
b));
694poly8x8_t
a= vreinterpret_p8_u64(_a);
695poly8x8_t
b= vreinterpret_p8_u64(_b);
698uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
699vcreate_u8(0x00000000ffffffff));
700uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
701vcreate_u8(0x0000000000000000));
704uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(
a,
b));
706vreinterpretq_u8_p16(vmull_p8(
a, vext_p8(
b,
b, 1)));
708vreinterpretq_u8_p16(vmull_p8(vext_p8(
a,
a, 1),
b));
710vreinterpretq_u8_p16(vmull_p8(
a, vext_p8(
b,
b, 2)));
712vreinterpretq_u8_p16(vmull_p8(vext_p8(
a,
a, 2),
b));
714vreinterpretq_u8_p16(vmull_p8(
a, vext_p8(
b,
b, 3)));
716vreinterpretq_u8_p16(vmull_p8(vext_p8(
a,
a, 3),
b));
718vreinterpretq_u8_p16(vmull_p8(
a, vext_p8(
b,
b, 4)));
721uint8x16_t
l= veorq_u8(e,
f);
722uint8x16_t m = veorq_u8(
g, h);
723uint8x16_t
n= veorq_u8(
i, j);
727 #if defined(__aarch64__) 728uint8x16_t lm_p0 = vreinterpretq_u8_u64(
729vzip1q_u64(vreinterpretq_u64_u8(
l), vreinterpretq_u64_u8(m)));
730uint8x16_t lm_p1 = vreinterpretq_u8_u64(
731vzip2q_u64(vreinterpretq_u64_u8(
l), vreinterpretq_u64_u8(m)));
732uint8x16_t nk_p0 = vreinterpretq_u8_u64(
733vzip1q_u64(vreinterpretq_u64_u8(
n), vreinterpretq_u64_u8(k)));
734uint8x16_t nk_p1 = vreinterpretq_u8_u64(
735vzip2q_u64(vreinterpretq_u64_u8(
n), vreinterpretq_u64_u8(k)));
737uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(
l), vget_low_u8(m));
738uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(
l), vget_high_u8(m));
739uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(
n), vget_low_u8(k));
740uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(
n), vget_high_u8(k));
744uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
745uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
746uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
750uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
751uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
752uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
755 #if defined(__aarch64__) 756uint8x16_t t0 = vreinterpretq_u8_u64(
757vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
758uint8x16_t t1 = vreinterpretq_u8_u64(
759vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
760uint8x16_t t2 = vreinterpretq_u8_u64(
761vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
762uint8x16_t t3 = vreinterpretq_u8_u64(
763vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
765uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
766uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
767uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
768uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
771uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
772uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
773uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
774uint8x16_t t3_shift = vextq_u8(t3, t3, 12);
777uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
778uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
779uint8x16_t mix = veorq_u8(d, cross1);
780uint8x16_t
r= veorq_u8(mix, cross2);
781 returnvreinterpretq_u64_u8(
r);
793 #define _mm_shuffle_epi32_default(a, imm) \ 797 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ 798 ret = vsetq_lane_s32( \ 799 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ 801 ret = vsetq_lane_s32( \ 802 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ 804 ret = vsetq_lane_s32( \ 805 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ 807 vreinterpretq_m128i_s32(ret); \ 894 #if defined(__aarch64__) 895 #define _mm_shuffle_epi32_splat(a, imm) \ 897 vreinterpretq_m128i_s32( \ 898 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ 901 #define _mm_shuffle_epi32_splat(a, imm) \ 903 vreinterpretq_m128i_s32( \ 904 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ 922 #define _mm_shuffle_ps_default(a, b, imm) \ 926 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ 927 ret = vsetq_lane_f32( \ 928 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ 930 ret = vsetq_lane_f32( \ 931 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ 933 ret = vsetq_lane_f32( \ 934 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ 936 vreinterpretq_m128_f32(ret); \ 945 #define _mm_shufflelo_epi16_function(a, imm) \ 947 int16x8_t ret = vreinterpretq_s16_m128i(a); \ 948 int16x4_t lowBits = vget_low_s16(ret); \ 949 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ 950 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ 952 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ 954 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 956 vreinterpretq_m128i_s16(ret); \ 965 #define _mm_shufflehi_epi16_function(a, imm) \ 967 int16x8_t ret = vreinterpretq_s16_m128i(a); \ 968 int16x4_t highBits = vget_high_s16(ret); \ 969 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ 970 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ 972 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ 974 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 976 vreinterpretq_m128i_s16(ret); \ 1005float32x4_t
value= vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1284 returnvgetq_lane_u32(a_eq_b, 0) & 0x1;
1294 returnvgetq_lane_u32(a_ge_b, 0) & 0x1;
1304 returnvgetq_lane_u32(a_gt_b, 0) & 0x1;
1314 returnvgetq_lane_u32(a_le_b, 0) & 0x1;
1326 returnvgetq_lane_u32(a_lt_b, 0) & 0x1;
1366 #if defined(__aarch64__) 1394 #if defined(__aarch64__) 1398float32_t
data= vgetq_lane_f32(
1514 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) 1548int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1549 static const uint32_tbitMask[2] = {0xFFFFFFFF, 0};
1550int8x8_t
mask= vreinterpret_s8_u32(vld1_u32(bitMask));
1596 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) 1628 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) 1638 #if defined(__aarch64__) 1641float32_t
data= vgetq_lane_f32(
1682 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) 1690 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) 1713 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV 1719 #if SSE2NEON_PRECISE_DIV 1740 #define _mm_extract_pi16(a, imm) \ 1741 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) 1758 #if defined(__aarch64__) 1765 #if defined(__aarch64__) 1766 asm volatile(
"mrs %0, FPCR":
"=r"(
r.value));
1768 asm volatile(
"vmrs %0, FPSCR":
"=r"(
r.value));
1782 #if defined(__aarch64__) 1789 #if defined(__aarch64__) 1790 asm volatile(
"mrs %0, FPCR":
"=r"(
r.value));
1792 asm volatile(
"vmrs %0, FPSCR":
"=r"(
r.value));
1795 if(
r.field.bit22) {
1805 #define _mm_insert_pi16(a, b, imm) \ 1807 vreinterpret_m64_s16( \ 1808 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ 1827 #define _mm_load_ps1 _mm_load1_ps 1858vcombine_f32(vget_low_f32(
a), vld1_f32((
constfloat32_t *) p)));
1875vcombine_f32(vld1_f32((
constfloat32_t *) p), vget_high_f32(
a)));
1890float32x4_t v = vrev64q_f32(vld1q_f32(p));
1912vsetq_lane_s16(*(
const int16_t*) p, vdupq_n_s16(0), 0));
1924vcombine_s64(vld1_s64((
const int64_t*) p), vdup_n_s64(0)));
1935 if(align == 2 || (
sizeof(
void*) == 8 && align == 4))
1936align =
sizeof(
void*);
1937 if(!posix_memalign(&ptr, align,
size))
1953vst1_s8((
int8_t*) mem_addr, masked);
1960 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) 1982 #if SSE2NEON_PRECISE_MINMAX 2037 #if SSE2NEON_PRECISE_MINMAX 2113 #if defined(__aarch64__) 2114 static constint8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2115uint8x8_t
tmp= vshr_n_u8(
input, 7);
2116 returnvaddv_u8(vshl_u8(
tmp, shift));
2119uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(
input, 7));
2120uint32x2_t paired16 =
2121vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2122uint8x8_t paired32 =
2123vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2124 returnvget_lane_u8(paired32, 0) | ((
int) vget_lane_u8(paired32, 4) << 4);
2135 #if defined(__aarch64__) 2136 static constint32x4_t shift = {0, 1, 2, 3};
2137uint32x4_t
tmp= vshrq_n_u32(
input, 31);
2138 returnvaddvq_u32(vshlq_u32(
tmp, shift));
2143uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(
input, 31));
2146vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2148 returnvgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2207 #define _m_pavgb(a, b) _mm_avg_pu8(a, b) 2218 #define _m_pavgw(a, b) _mm_avg_pu16(a, b) 2223 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) 2228 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) 2233 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) 2238 #define _m_pmaxub(a, b) _mm_max_pu8(a, b) 2243 #define _m_pminsw(a, b) _mm_min_pi16(a, b) 2248 #define _m_pminub(a, b) _mm_min_pu8(a, b) 2253 #define _m_pmovmskb(a) _mm_movemask_pi8(a) 2259 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) 2266__builtin_prefetch(p);
2274 #define _m_psadbw(a, b) _mm_sad_pu8(a, b) 2279 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) 2289 #if SSE2NEON_PRECISE_DIV 2317 #if SSE2NEON_PRECISE_SQRT 2344uint64x1_t
t= vpaddl_u32(vpaddl_u16(
2347vset_lane_u16(vget_lane_u64(
t, 0), vdup_n_u16(0), 0));
2360 #if defined(__aarch64__) 2367 #if defined(__aarch64__) 2368 asm volatile(
"mrs %0, FPCR":
"=r"(
r.value));
2370 asm volatile(
"vmrs %0, FPSCR":
"=r"(
r.value));
2375 #if defined(__aarch64__) 2376 asm volatile(
"msr FPCR, %0"::
"r"(
r));
2378 asm volatile(
"vmsr FPSCR, %0"::
"r"(
r));
2406 #if defined(__aarch64__) 2413 #if defined(__aarch64__) 2414 asm volatile(
"mrs %0, FPCR":
"=r"(
r.value));
2416 asm volatile(
"vmrs %0, FPSCR":
"=r"(
r.value));
2437 #if defined(__aarch64__) 2438 asm volatile(
"msr FPCR, %0"::
"r"(
r));
2440 asm volatile(
"vmsr FPSCR, %0"::
"r"(
r));
2494 #if __has_builtin(__builtin_shufflevector) 2495 #define _mm_shuffle_pi16(a, imm) \ 2497 vreinterpret_m64_s16(__builtin_shufflevector( \ 2498 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ 2499 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ 2502 #define _mm_shuffle_pi16(a, imm) \ 2506 vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ 2507 ret = vset_lane_s16( \ 2508 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ 2510 ret = vset_lane_s16( \ 2511 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ 2513 ret = vset_lane_s16( \ 2514 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ 2516 vreinterpret_m64_s16(ret); \ 2525__sync_synchronize();
2530 #if __has_builtin(__builtin_shufflevector) 2531 #define _mm_shuffle_ps(a, b, imm) \ 2533 float32x4_t _input1 = vreinterpretq_f32_m128(a); \ 2534 float32x4_t _input2 = vreinterpretq_f32_m128(b); \ 2535 float32x4_t _shuf = __builtin_shufflevector( \ 2536 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ 2537 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ 2538 vreinterpretq_m128_f32(_shuf); \ 2541 #define _mm_shuffle_ps(a, b, imm) \ 2545 case _MM_SHUFFLE(1, 0, 3, 2): \ 2546 ret = _mm_shuffle_ps_1032((a), (b)); \ 2548 case _MM_SHUFFLE(2, 3, 0, 1): \ 2549 ret = _mm_shuffle_ps_2301((a), (b)); \ 2551 case _MM_SHUFFLE(0, 3, 2, 1): \ 2552 ret = _mm_shuffle_ps_0321((a), (b)); \ 2554 case _MM_SHUFFLE(2, 1, 0, 3): \ 2555 ret = _mm_shuffle_ps_2103((a), (b)); \ 2557 case _MM_SHUFFLE(1, 0, 1, 0): \ 2558 ret = _mm_movelh_ps((a), (b)); \ 2560 case _MM_SHUFFLE(1, 0, 0, 1): \ 2561 ret = _mm_shuffle_ps_1001((a), (b)); \ 2563 case _MM_SHUFFLE(0, 1, 0, 1): \ 2564 ret = _mm_shuffle_ps_0101((a), (b)); \ 2566 case _MM_SHUFFLE(3, 2, 1, 0): \ 2567 ret = _mm_shuffle_ps_3210((a), (b)); \ 2569 case _MM_SHUFFLE(0, 0, 1, 1): \ 2570 ret = _mm_shuffle_ps_0011((a), (b)); \ 2572 case _MM_SHUFFLE(0, 0, 2, 2): \ 2573 ret = _mm_shuffle_ps_0022((a), (b)); \ 2575 case _MM_SHUFFLE(2, 2, 0, 0): \ 2576 ret = _mm_shuffle_ps_2200((a), (b)); \ 2578 case _MM_SHUFFLE(3, 2, 0, 2): \ 2579 ret = _mm_shuffle_ps_3202((a), (b)); \ 2581 case _MM_SHUFFLE(3, 2, 3, 2): \ 2582 ret = _mm_movehl_ps((b), (a)); \ 2584 case _MM_SHUFFLE(1, 1, 3, 3): \ 2585 ret = _mm_shuffle_ps_1133((a), (b)); \ 2587 case _MM_SHUFFLE(2, 0, 1, 0): \ 2588 ret = _mm_shuffle_ps_2010((a), (b)); \ 2590 case _MM_SHUFFLE(2, 0, 0, 1): \ 2591 ret = _mm_shuffle_ps_2001((a), (b)); \ 2593 case _MM_SHUFFLE(2, 0, 3, 2): \ 2594 ret = _mm_shuffle_ps_2032((a), (b)); \ 2597 ret = _mm_shuffle_ps_default((a), (b), (imm)); \ 2616 #if SSE2NEON_PRECISE_SQRT 2621 constuint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2622 constuint32x4_t div_by_zero =
2623vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2624recip = vreinterpretq_f32_u32(
2625vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2637 #elif defined(__aarch64__) 2641float32x4_t sq = vrecpeq_f32(recipsq);
2677vst1q_f32(p, vdupq_n_f32(a0));
2697 #define _mm_store1_ps _mm_store_ps1 2736float32x4_t rev = vextq_f32(
tmp,
tmp, 2);
2774 #if __has_builtin(__builtin_nontemporal_store) 2775__builtin_nontemporal_store(
a, (float32x4_t *) p);
2813 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2815 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ 2816 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ 2817 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ 2818 vget_low_f32(ROW23.val[0])); \ 2819 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ 2820 vget_low_f32(ROW23.val[1])); \ 2821 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ 2822 vget_high_f32(ROW23.val[0])); \ 2823 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ 2824 vget_high_f32(ROW23.val[1])); \ 2829 #define _mm_ucomieq_ss _mm_comieq_ss 2830 #define _mm_ucomige_ss _mm_comige_ss 2831 #define _mm_ucomigt_ss _mm_comigt_ss 2832 #define _mm_ucomile_ss _mm_comile_ss 2833 #define _mm_ucomilt_ss _mm_comilt_ss 2834 #define _mm_ucomineq_ss _mm_comineq_ss 2840 #if defined(__GNUC__) || defined(__clang__) 2841 #pragma GCC diagnostic push 2842 #pragma GCC diagnostic ignored "-Wuninitialized" 2846 #if defined(__GNUC__) || defined(__clang__) 2847 #pragma GCC diagnostic pop 2855 #if defined(__GNUC__) || defined(__clang__) 2856 #pragma GCC diagnostic push 2857 #pragma GCC diagnostic ignored "-Wuninitialized" 2861 #if defined(__GNUC__) || defined(__clang__) 2862 #pragma GCC diagnostic pop 2877 #if defined(__aarch64__) 2883float32x2x2_t
result= vzip_f32(a1, b1);
2899 #if defined(__aarch64__) 2905float32x2x2_t
result= vzip_f32(a1, b1);
2968 #if defined(__aarch64__) 2969 returnvreinterpretq_m128d_f64(
2970vaddq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
2972 double*da = (
double*) &
a;
2973 double*db = (
double*) &
b;
2975c[0] = da[0] + db[0];
2976c[1] = da[1] + db[1];
2977 returnvld1q_f32((float32_t *) c);
2991 #if defined(__aarch64__) 2994 double*da = (
double*) &
a;
2995 double*db = (
double*) &
b;
2997c[0] = da[0] + db[0];
2999 returnvld1q_f32((float32_t *) c);
3151 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) 3156 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) 3195 #if defined(__aarch64__) 3250 #if defined(__aarch64__) 3252vceqq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
3257uint32x4_t swapped = vrev64q_u32(
cmp);
3276 #if defined(__aarch64__) 3278vcgeq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
3285d[0] = (*(
double*) &a0) >= (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3286d[1] = (*(
double*) &a1) >= (*(
double*) &b1) ? ~
UINT64_C(0) :
UINT64_C(0);
3298 #if defined(__aarch64__) 3306d[0] = (*(
double*) &a0) >= (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3357 #if defined(__aarch64__) 3359vcgtq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
3366d[0] = (*(
double*) &a0) > (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3367d[1] = (*(
double*) &a1) > (*(
double*) &b1) ? ~
UINT64_C(0) :
UINT64_C(0);
3379 #if defined(__aarch64__) 3387d[0] = (*(
double*) &a0) > (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3399 #if defined(__aarch64__) 3401vcleq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
3408d[0] = (*(
double*) &a0) <= (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3409d[1] = (*(
double*) &a1) <= (*(
double*) &b1) ? ~
UINT64_C(0) :
UINT64_C(0);
3421 #if defined(__aarch64__) 3429d[0] = (*(
double*) &a0) <= (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3475 #if defined(__aarch64__) 3477vcltq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
3484d[0] = (*(
double*) &a0) < (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3485d[1] = (*(
double*) &a1) < (*(
double*) &b1) ? ~
UINT64_C(0) :
UINT64_C(0);
3497 #if defined(__aarch64__) 3504d[0] = (*(
double*) &a0) < (*(
double*) &b0) ? ~
UINT64_C(0) :
UINT64_C(0);
3516 #if defined(__aarch64__) 3518vceqq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)))));
3523uint32x4_t swapped = vrev64q_u32(
cmp);
3542 #if defined(__aarch64__) 3544vcgeq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)),
3553!((*(
double*) &a0) >= (*(
double*) &b0)) ? ~
UINT64_C(0) :
UINT64_C(0);
3555!((*(
double*) &a1) >= (*(
double*) &b1)) ? ~
UINT64_C(0) :
UINT64_C(0);
3575 #if defined(__aarch64__) 3577vcgtq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)),
3586!((*(
double*) &a0) > (*(
double*) &b0)) ? ~
UINT64_C(0) :
UINT64_C(0);
3588!((*(
double*) &a1) > (*(
double*) &b1)) ? ~
UINT64_C(0) :
UINT64_C(0);
3608 #if defined(__aarch64__) 3610vcleq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)),
3619!((*(
double*) &a0) <= (*(
double*) &b0)) ? ~
UINT64_C(0) :
UINT64_C(0);
3621!((*(
double*) &a1) <= (*(
double*) &b1)) ? ~
UINT64_C(0) :
UINT64_C(0);
3641 #if defined(__aarch64__) 3643vcltq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)),
3652!((*(
double*) &a0) < (*(
double*) &b0)) ? ~
UINT64_C(0) :
UINT64_C(0);
3654!((*(
double*) &a1) < (*(
double*) &b1)) ? ~
UINT64_C(0) :
UINT64_C(0);
3674 #if defined(__aarch64__) 3676uint64x2_t not_nan_a =
3677vceqq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
a));
3678uint64x2_t not_nan_b =
3679vceqq_f64(vreinterpretq_f64_m128d(
b), vreinterpretq_f64_m128d(
b));
3687d[0] = ((*(
double*) &a0) == (*(
double*) &a0) &&
3688(*(
double*) &b0) == (*(
double*) &b0))
3691d[1] = ((*(
double*) &a1) == (*(
double*) &a1) &&
3692(*(
double*) &b1) == (*(
double*) &b1))
3706 #if defined(__aarch64__) 3713d[0] = ((*(
double*) &a0) == (*(
double*) &a0) &&
3714(*(
double*) &b0) == (*(
double*) &b0))
3728 #if defined(__aarch64__) 3730uint64x2_t not_nan_a =
3731vceqq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
a));
3732uint64x2_t not_nan_b =
3733vceqq_f64(vreinterpretq_f64_m128d(
b), vreinterpretq_f64_m128d(
b));
3735vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3742d[0] = ((*(
double*) &a0) == (*(
double*) &a0) &&
3743(*(
double*) &b0) == (*(
double*) &b0))
3746d[1] = ((*(
double*) &a1) == (*(
double*) &a1) &&
3747(*(
double*) &b1) == (*(
double*) &b1))
3761 #if defined(__aarch64__) 3768d[0] = ((*(
double*) &a0) == (*(
double*) &a0) &&
3769(*(
double*) &b0) == (*(
double*) &b0))
3783 #if defined(__aarch64__) 3784 returnvgetq_lane_u64(vcgeq_f64(
a,
b), 0) & 0x1;
3789 return(*(
double*) &a0 >= *(
double*) &b0);
3798 #if defined(__aarch64__) 3799 returnvgetq_lane_u64(vcgtq_f64(
a,
b), 0) & 0x1;
3804 return(*(
double*) &a0 > *(
double*) &b0);
3813 #if defined(__aarch64__) 3814 returnvgetq_lane_u64(vcleq_f64(
a,
b), 0) & 0x1;
3819 return(*(
double*) &a0 <= *(
double*) &b0);
3828 #if defined(__aarch64__) 3829 returnvgetq_lane_u64(vcltq_f64(
a,
b), 0) & 0x1;
3834 return(*(
double*) &a0 < *(
double*) &b0);
3843 #if defined(__aarch64__) 3844 returnvgetq_lane_u64(vceqq_f64(
a,
b), 0) & 0x1;
3846uint32x4_t a_not_nan =
3848uint32x4_t b_not_nan =
3850uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3853uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3854vreinterpretq_u64_u32(a_eq_b));
3855 returnvgetq_lane_u64(and_results, 0) & 0x1;
3879 #if defined(__aarch64__) 3880 returnvreinterpretq_m128d_f64(
3910 doubled0 = ((
double*) &
rnd)[0];
3911 doubled1 = ((
double*) &
rnd)[1];
3928 doubled0 = ((
double*) &
rnd)[0];
3929 doubled1 = ((
double*) &
rnd)[1];
3948 #if defined(__aarch64__) 3949float32x2_t
tmp= vcvt_f32_f64(vreinterpretq_f64_m128d(
a));
3952 floata0 = (float) ((
double*) &
a)[0];
3953 floata1 = (float) ((
double*) &
a)[1];
3970 #if defined(__aarch64__) 3971 returnvreinterpretq_m128d_f64(
3993 #if defined(__aarch64__) 4005 float*
f= (
float*) &
a;
4008uint32x4_t signmask = vdupq_n_u32(0x80000000);
4011int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4013int32x4_t r_trunc = vcvtq_s32_f32(
4015int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4016vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
4017int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4019float32x4_t
delta= vsubq_f32(
4021vcvtq_f32_s32(r_trunc));
4022uint32x4_t is_delta_half =
4023vceqq_f32(
delta, half);
4025vbslq_s32(is_delta_half, r_even, r_normal));
4053 #if defined(__aarch64__) 4054 returnvreinterpretq_m128d_f64(
4070 #if defined(__aarch64__) 4071 return(
double) vgetq_lane_f64(vreinterpretq_f64_m128d(
a), 0);
4073 return((
double*) &
a)[0];
4085 #if defined(__aarch64__) 4086 return(
int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(
a)), 0);
4089 doubleret = ((
double*) &
rnd)[0];
4102 #if defined(__aarch64__) 4103 return(
int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(
a)), 0);
4106 doubleret = ((
double*) &
rnd)[0];
4117 #define _mm_cvtsd_si64x _mm_cvtsd_si64 4126 #if defined(__aarch64__) 4128vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(
b)), 0),
4158 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) 4166 #if defined(__aarch64__) 4167 returnvreinterpretq_m128d_f64(
4168vsetq_lane_f64((
double)
b, vreinterpretq_f64_m128d(
a), 0));
4170 doublebf = (double)
b;
4181 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) 4203 #if defined(__aarch64__) 4204 returnvreinterpretq_m128d_f64(
4205vsetq_lane_f64((
double)
b, vreinterpretq_f64_m128d(
a), 0));
4207 doublebf = (double)
b;
4226 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) 4232 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) 4246 #if defined(__aarch64__) 4247 returnvreinterpretq_m128d_f64(
4248vsetq_lane_f64(d, vreinterpretq_f64_m128d(
a), 0));
4260 doublea0 = ((
double*) &
a)[0];
4261 doublea1 = ((
double*) &
a)[1];
4270 doublea0 = ((
double*) &
a)[0];
4271 doublea1 = ((
double*) &
a)[1];
4292 doubleret = *((
double*) &
a);
4304 #if defined(__aarch64__) 4305 returnvgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(
a)), 0);
4307 doubleret = *((
double*) &
a);
4318 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) 4331 #if defined(__aarch64__) 4332 returnvreinterpretq_m128d_f64(
4333vdivq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
4335 double*da = (
double*) &
a;
4336 double*db = (
double*) &
b;
4338c[0] = da[0] / db[0];
4339c[1] = da[1] / db[1];
4340 returnvld1q_f32((float32_t *) c);
4351 #if defined(__aarch64__) 4353vdivq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b));
4354 returnvreinterpretq_m128d_f64(
4355vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(
a), 1),
tmp, 1));
4365 #define _mm_extract_epi16(a, imm) \ 4366 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) 4373 #define _mm_insert_epi16(a, b, imm) \ 4375 vreinterpretq_m128i_s16( \ 4376 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ 4387 #if defined(__aarch64__) 4388 returnvreinterpretq_m128d_f64(vld1q_f64(p));
4390 const float*
fp= (
const float*) p;
4403 #define _mm_load_pd1 _mm_load1_pd 4415 #if defined(__aarch64__) 4416 returnvreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4418 const float*
fp= (
const float*) p;
4440 #if defined(__aarch64__) 4441 returnvreinterpretq_m128d_f64(vld1q_dup_f64(p));
4457 #if defined(__aarch64__) 4458 returnvreinterpretq_m128d_f64(
4459vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(
a)), vld1_f64(p)));
4474vcombine_s32(vld1_s32((
int32_t const*) p), vcreate_s32(0)));
4487 #if defined(__aarch64__) 4488 returnvreinterpretq_m128d_f64(
4489vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(
a))));
4492vcombine_f32(vld1_f32((
const float*) p),
4507 #if defined(__aarch64__) 4508float64x2_t v = vld1q_f64(p);
4509 returnvreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4511int64x2_t v = vld1q_s64((
const int64_t*) p);
4539vsetq_lane_s32(*(
const int32_t*) p, vdupq_n_s32(0), 0));
4557int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4558int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4575vst1q_s8((
int8_t*) mem_addr, masked);
4601 #if defined(__aarch64__) 4602 #if SSE2NEON_PRECISE_MINMAX 4603float64x2_t _a = vreinterpretq_f64_m128d(
a);
4604float64x2_t _b = vreinterpretq_f64_m128d(
b);
4605 returnvreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4607 returnvreinterpretq_m128d_f64(
4608vmaxq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
4616d[0] = (*(
double*) &a0) > (*(
double*) &b0) ? a0 : b0;
4617d[1] = (*(
double*) &a1) > (*(
double*) &b1) ? a1 : b1;
4629 #if defined(__aarch64__) 4632 double*da = (
double*) &
a;
4633 double*db = (
double*) &
b;
4634 doublec[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4662 #if defined(__aarch64__) 4663 #if SSE2NEON_PRECISE_MINMAX 4664float64x2_t _a = vreinterpretq_f64_m128d(
a);
4665float64x2_t _b = vreinterpretq_f64_m128d(
b);
4666 returnvreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4668 returnvreinterpretq_m128d_f64(
4669vminq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
4677d[0] = (*(
double*) &a0) < (*(
double*) &b0) ? a0 : b0;
4678d[1] = (*(
double*) &a1) < (*(
double*) &b1) ? a1 : b1;
4689 #if defined(__aarch64__) 4692 double*da = (
double*) &
a;
4693 double*db = (
double*) &
b;
4694 doublec[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4757uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(
input, 7));
4772uint32x4_t paired16 =
4773vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4786uint64x2_t paired32 =
4787vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4800uint8x16_t paired64 =
4801vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4808 returnvgetq_lane_u8(paired64, 0) | ((
int) vgetq_lane_u8(paired64, 8) << 8);
4817uint64x2_t high_bits = vshrq_n_u64(
input, 63);
4818 returnvgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4862 #if defined(__aarch64__) 4863 returnvreinterpretq_m128d_f64(
4864vmulq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
4866 double*da = (
double*) &
a;
4867 double*db = (
double*) &
b;
4869c[0] = da[0] * db[0];
4870c[1] = da[1] * db[1];
4871 returnvld1q_f32((float32_t *) c);
4913int32x4_t ab3210 = vmull_s16(a3210, b3210);
4916int32x4_t ab7654 = vmull_s16(a7654, b7654);
4918vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4930uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4931 #if defined(__aarch64__) 4934uint16x8_t
r= vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4935vreinterpretq_u16_u32(ab7654));
4940uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4942vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
5039__asm__ __volatile__(
"isb\n");
5049uint16x8_t
t= vpaddlq_u8(vabdq_u8((uint8x16_t)
a, (uint8x16_t)
b));
5090vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5126 #if defined(__aarch64__) 5127 returnvreinterpretq_m128d_f64(vld1q_f64((float64_t *)
data));
5136 #define _mm_set_pd1 _mm_set1_pd 5204 #if defined(__aarch64__) 5205 returnvreinterpretq_m128d_f64(vdupq_n_f64(d));
5285 #if defined(__aarch64__) 5286 returnvreinterpretq_m128d_f64(vdupq_n_f64(0));
5303 #if __has_builtin(__builtin_shufflevector) 5304 #define _mm_shuffle_epi32(a, imm) \ 5306 int32x4_t _input = vreinterpretq_s32_m128i(a); \ 5307 int32x4_t _shuf = __builtin_shufflevector( \ 5308 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ 5309 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ 5310 vreinterpretq_m128i_s32(_shuf); \ 5313 #define _mm_shuffle_epi32(a, imm) \ 5317 case _MM_SHUFFLE(1, 0, 3, 2): \ 5318 ret = _mm_shuffle_epi_1032((a)); \ 5320 case _MM_SHUFFLE(2, 3, 0, 1): \ 5321 ret = _mm_shuffle_epi_2301((a)); \ 5323 case _MM_SHUFFLE(0, 3, 2, 1): \ 5324 ret = _mm_shuffle_epi_0321((a)); \ 5326 case _MM_SHUFFLE(2, 1, 0, 3): \ 5327 ret = _mm_shuffle_epi_2103((a)); \ 5329 case _MM_SHUFFLE(1, 0, 1, 0): \ 5330 ret = _mm_shuffle_epi_1010((a)); \ 5332 case _MM_SHUFFLE(1, 0, 0, 1): \ 5333 ret = _mm_shuffle_epi_1001((a)); \ 5335 case _MM_SHUFFLE(0, 1, 0, 1): \ 5336 ret = _mm_shuffle_epi_0101((a)); \ 5338 case _MM_SHUFFLE(2, 2, 1, 1): \ 5339 ret = _mm_shuffle_epi_2211((a)); \ 5341 case _MM_SHUFFLE(0, 1, 2, 2): \ 5342 ret = _mm_shuffle_epi_0122((a)); \ 5344 case _MM_SHUFFLE(3, 3, 3, 2): \ 5345 ret = _mm_shuffle_epi_3332((a)); \ 5347 case _MM_SHUFFLE(0, 0, 0, 0): \ 5348 ret = _mm_shuffle_epi32_splat((a), 0); \ 5350 case _MM_SHUFFLE(1, 1, 1, 1): \ 5351 ret = _mm_shuffle_epi32_splat((a), 1); \ 5353 case _MM_SHUFFLE(2, 2, 2, 2): \ 5354 ret = _mm_shuffle_epi32_splat((a), 2); \ 5356 case _MM_SHUFFLE(3, 3, 3, 3): \ 5357 ret = _mm_shuffle_epi32_splat((a), 3); \ 5360 ret = _mm_shuffle_epi32_default((a), (imm)); \ 5374 #if __has_builtin(__builtin_shufflevector) 5375 #define _mm_shuffle_pd(a, b, imm8) \ 5376 vreinterpretq_m128d_s64(__builtin_shufflevector( \ 5377 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ 5378 ((imm8 & 0x2) >> 1) + 2)) 5380 #define _mm_shuffle_pd(a, b, imm8) \ 5381 _mm_castsi128_pd(_mm_set_epi64x( \ 5382 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ 5383 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) 5388 #if __has_builtin(__builtin_shufflevector) 5389 #define _mm_shufflehi_epi16(a, imm) \ 5391 int16x8_t _input = vreinterpretq_s16_m128i(a); \ 5392 int16x8_t _shuf = __builtin_shufflevector( \ 5393 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ 5394 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ 5395 (((imm) >> 6) & 0x3) + 4); \ 5396 vreinterpretq_m128i_s16(_shuf); \ 5399 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) 5404 #if __has_builtin(__builtin_shufflevector) 5405 #define _mm_shufflelo_epi16(a, imm) \ 5407 int16x8_t _input = vreinterpretq_s16_m128i(a); \ 5408 int16x8_t _shuf = __builtin_shufflevector( \ 5409 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ 5410 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ 5411 vreinterpretq_m128i_s16(_shuf); \ 5414 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) 5436int16x8_t vc = vdupq_n_s16((
int16_t) c);
5459int32x4_t vc = vdupq_n_s32((
int32_t) c);
5482int64x2_t vc = vdupq_n_s64((
int64_t) c);
5565vld1q_u8(((
uint8_t const*)
tmp) + (16 - imm)));
5573 #if defined(__aarch64__) 5574 returnvreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(
a)));
5576 doublea0 = sqrt(((
double*) &
a)[0]);
5577 doublea1 = sqrt(((
double*) &
a)[1]);
5588 #if defined(__aarch64__) 5591 return _mm_set_pd(((
double*) &
a)[1], sqrt(((
double*) &
b)[0]));
5652 const int count= (imm & ~15) ? 15 : imm;
5653 return(
__m128i) vshlq_s16((int16x8_t)
a, vdupq_n_s16(-
count));
5670 #define _mm_srai_epi32(a, imm) \ 5673 if (_sse2neon_unlikely((imm) == 0)) { \ 5675 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ 5676 ret = vreinterpretq_m128i_s32( \ 5677 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ 5679 ret = vreinterpretq_m128i_s32( \ 5680 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ 5704int16x8_t vc = vdupq_n_s16(-(
int16_t) c);
5727int32x4_t vc = vdupq_n_s32(-(
int32_t) c);
5750int64x2_t vc = vdupq_n_s64(-(
int64_t) c);
5767 #define _mm_srli_epi16(a, imm) \ 5770 if (_sse2neon_unlikely((imm) & ~15)) { \ 5771 ret = _mm_setzero_si128(); \ 5773 ret = vreinterpretq_m128i_u16( \ 5774 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \ 5793 #define _mm_srli_epi32(a, imm) \ 5796 if (_sse2neon_unlikely((imm) & ~31)) { \ 5797 ret = _mm_setzero_si128(); \ 5799 ret = vreinterpretq_m128i_u32( \ 5800 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \ 5818 #define _mm_srli_epi64(a, imm) \ 5821 if (_sse2neon_unlikely((imm) & ~63)) { \ 5822 ret = _mm_setzero_si128(); \ 5824 ret = vreinterpretq_m128i_u64( \ 5825 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \ 5854 #if defined(__aarch64__) 5855vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(
a));
5867 #if defined(__aarch64__) 5868float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(
a));
5869vst1q_f64((float64_t *) mem_addr,
5870vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5873vst1q_f32((float32_t *) mem_addr,
5883 #if defined(__aarch64__) 5884vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(
a)));
5901 #define _mm_store1_pd _mm_store_pd1 5911 #if defined(__aarch64__) 5912vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(
a)));
5935 #if defined(__aarch64__) 5936vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(
a)));
5986 #if __has_builtin(__builtin_nontemporal_store) 5987__builtin_nontemporal_store(
a, (float32x4_t *) p);
5988 #elif defined(__aarch64__) 5989vst1q_f64(p, vreinterpretq_f64_m128d(
a));
6001 #if __has_builtin(__builtin_nontemporal_store) 6002__builtin_nontemporal_store(
a, p);
6014vst1q_lane_s32((
int32_t*) p, vdupq_n_s32(
a), 0);
6081 #if defined(__aarch64__) 6082 returnvreinterpretq_m128d_f64(
6083vsubq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
6085 double*da = (
double*) &
a;
6086 double*db = (
double*) &
b;
6088c[0] = da[0] - db[0];
6089c[1] = da[1] - db[1];
6090 returnvld1q_f32((float32_t *) c);
6169 #define _mm_ucomieq_sd _mm_comieq_sd 6170 #define _mm_ucomige_sd _mm_comige_sd 6171 #define _mm_ucomigt_sd _mm_comigt_sd 6172 #define _mm_ucomile_sd _mm_comile_sd 6173 #define _mm_ucomilt_sd _mm_comilt_sd 6174 #define _mm_ucomineq_sd _mm_comineq_sd 6180 #if defined(__GNUC__) || defined(__clang__) 6181 #pragma GCC diagnostic push 6182 #pragma GCC diagnostic ignored "-Wuninitialized" 6186 #if defined(__GNUC__) || defined(__clang__) 6187 #pragma GCC diagnostic pop 6206 #if defined(__aarch64__) 6212int16x4x2_t
result= vzip_s16(a1, b1);
6222 #if defined(__aarch64__) 6228int32x2x2_t
result= vzip_s32(a1, b1);
6259 #if defined(__aarch64__) 6267int8x8x2_t
result= vzip_s8(a1, b1);
6285 #if defined(__aarch64__) 6286 returnvreinterpretq_m128d_f64(
6287vzip2q_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
6310 #if defined(__aarch64__) 6316int16x4x2_t
result= vzip_s16(a1, b1);
6332 #if defined(__aarch64__) 6338int32x2x2_t
result= vzip_s32(a1, b1);
6364 #if defined(__aarch64__) 6370int8x8x2_t
result= vzip_s8(a1, b1);
6388 #if defined(__aarch64__) 6389 returnvreinterpretq_m128d_f64(
6390vzip1q_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
6440 #if defined(__aarch64__) 6441 returnvreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(
a),
6442vreinterpretq_f64_m128d(
b),
6443vreinterpretq_f64_m128d(
mask)));
6456 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) 6470 #if defined(__aarch64__) 6471 returnvreinterpretq_m128d_f64(
6472vpaddq_f64(vreinterpretq_f64_m128d(
a), vreinterpretq_f64_m128d(
b)));
6474 double*da = (
double*) &
a;
6475 double*db = (
double*) &
b;
6476 doublec[] = {da[0] + da[1], db[0] + db[1]};
6486 #if defined(__aarch64__) 6495vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6504 #if defined(__aarch64__) 6505float64x2_t
a= vreinterpretq_f64_m128d(_a);
6506float64x2_t
b= vreinterpretq_f64_m128d(_b);
6507 returnvreinterpretq_m128d_f64(
6508vsubq_f64(vuzp1q_f64(
a,
b), vuzp2q_f64(
a,
b)));
6510 double*da = (
double*) &_a;
6511 double*db = (
double*) &_b;
6512 doublec[] = {da[0] - da[1], db[0] - db[1]};
6524 #if defined(__aarch64__) 6526vsubq_f32(vuzp1q_f32(
a,
b), vuzp2q_f32(
a,
b)));
6528float32x4x2_t c = vuzpq_f32(
a,
b);
6540 #define _mm_lddqu_si128 _mm_loadu_si128 6549 #define _mm_loaddup_pd _mm_load1_pd 6556 #if defined(__aarch64__) 6557 returnvreinterpretq_m128d_f64(
6558vdupq_laneq_f64(vreinterpretq_f64_m128d(
a), 0));
6570 #if __has_builtin(__builtin_shufflevector) 6586 #if __has_builtin(__builtin_shufflevector) 6699 tmp[1] = vdupq_n_u8(0);
6715 #define _mm_alignr_pi8(a, b, imm) \ 6718 if (_sse2neon_unlikely((imm) >= 16)) { \ 6719 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ 6721 uint8x8_t tmp_low, tmp_high; \ 6723 const int idx = (imm) -8; \ 6724 tmp_low = vreinterpret_u8_m64(a); \ 6725 tmp_high = vdup_n_u8(0); \ 6726 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ 6728 const int idx = (imm); \ 6729 tmp_low = vreinterpret_u8_m64(b); \ 6730 tmp_high = vreinterpret_u8_m64(a); \ 6731 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ 6743 #if defined(__aarch64__) 6747vcombine_s16(vpadd_s16(vget_low_s16(
a), vget_high_s16(
a)),
6748vpadd_s16(vget_low_s16(
b), vget_high_s16(
b))));
6759vcombine_s32(vpadd_s32(vget_low_s32(
a), vget_high_s32(
a)),
6760vpadd_s32(vget_low_s32(
b), vget_high_s32(
b))));
6785 #if defined(__aarch64__) 6788 returnvreinterpretq_s64_s16(
6789vqaddq_s16(vuzp1q_s16(
a,
b), vuzp2q_s16(
a,
b)));
6796int16x8_t ab0246 = vcombine_s16(vmovn_s32(
a), vmovn_s32(
b));
6797int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(
a, 16), vshrn_n_s32(
b, 16));
6810 #if defined(__aarch64__) 6811 returnvreinterpret_s64_s16(vqadd_s16(vuzp1_s16(
a,
b), vuzp2_s16(
a,
b)));
6813int16x4x2_t res = vuzp_s16(
a,
b);
6814 returnvreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6825 #if defined(__aarch64__) 6827vsubq_s16(vuzp1q_s16(
a,
b), vuzp2q_s16(
a,
b)));
6829int16x8x2_t c = vuzpq_s16(
a,
b);
6841 #if defined(__aarch64__) 6843vsubq_s32(vuzp1q_s32(
a,
b), vuzp2q_s32(
a,
b)));
6845int32x4x2_t c = vuzpq_s32(
a,
b);
6857 #if defined(__aarch64__) 6860int16x4x2_t c = vuzp_s16(
a,
b);
6872 #if defined(__aarch64__) 6875int32x2x2_t c = vuzp_s32(
a,
b);
6887 #if defined(__aarch64__) 6889vqsubq_s16(vuzp1q_s16(
a,
b), vuzp2q_s16(
a,
b)));
6891int16x8x2_t c = vuzpq_s16(
a,
b);
6903 #if defined(__aarch64__) 6906int16x4x2_t c = vuzp_s16(
a,
b);
6923 #if defined(__aarch64__) 6926int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(
a))),
6927vmovl_s8(vget_low_s8(
b)));
6928int16x8_t
th= vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(
a))),
6929vmovl_s8(vget_high_s8(
b)));
6931vqaddq_s16(vuzp1q_s16(tl,
th), vuzp2q_s16(tl,
th)));
6939int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(
a, 8));
6940int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(
a, vdupq_n_u16(0xff00)));
6943int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(
b, 8), 8);
6944int16x8_t b_odd = vshrq_n_s16(
b, 8);
6947int16x8_t prod1 = vmulq_s16(a_even, b_even);
6948int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6966int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(
a, 8));
6967int16x4_t a_even = vreinterpret_s16_u16(vand_u16(
a, vdup_n_u16(0xff)));
6970int16x4_t b_even = vshr_n_s16(vshl_n_s16(
b, 8), 8);
6971int16x4_t b_odd = vshr_n_s16(
b, 8);
6974int16x4_t prod1 = vmul_s16(a_even, b_even);
6975int16x4_t prod2 = vmul_s16(a_odd, b_odd);
7003int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7004int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7016int32x4_t mul_extend =
7030uint8x16_t idx_masked =
7031vandq_u8(idx, vdupq_n_u8(0x8F));
7032 #if defined(__aarch64__) 7034 #elif defined(__GNUC__) 7038__asm__ __volatile__(
7039 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" 7040 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" 7042: [tbl]
"w"(tbl), [idx]
"w"(idx_masked));
7046int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7048vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7049vtbl2_s8(a_split, vget_high_u8(idx_masked))));
7069 constint8x8_t controlMask =
7096uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(
b, 15));
7098 #if defined(__aarch64__) 7099int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(
b));
7101int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(
b, vdupq_n_s16(0)));
7106int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(
a),
a);
7108int16x8_t res = vbicq_s16(masked, zeroMask);
7133uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(
b, 31));
7136 #if defined(__aarch64__) 7137int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(
b));
7139int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(
b, vdupq_n_s32(0)));
7144int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(
a),
a);
7146int32x4_t res = vbicq_s32(masked, zeroMask);
7171uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(
b, 7));
7174 #if defined(__aarch64__) 7175int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(
b));
7177int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(
b, vdupq_n_s8(0)));
7182int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(
a),
a);
7184int8x16_t res = vbicq_s8(masked, zeroMask);
7212uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(
b, 15));
7215 #if defined(__aarch64__) 7216int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(
b));
7218int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(
b, vdup_n_s16(0)));
7223int16x4_t masked = vbsl_s16(ltMask, vneg_s16(
a),
a);
7225int16x4_t res = vbic_s16(masked, zeroMask);
7253uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(
b, 31));
7256 #if defined(__aarch64__) 7257int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(
b));
7259int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(
b, vdup_n_s32(0)));
7264int32x2_t masked = vbsl_s32(ltMask, vneg_s32(
a),
a);
7266int32x2_t res = vbic_s32(masked, zeroMask);
7294uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(
b, 7));
7297 #if defined(__aarch64__) 7298int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(
b));
7300int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(
b, vdup_n_s8(0)));
7305int8x8_t masked = vbsl_s8(ltMask, vneg_s8(
a),
a);
7307int8x8_t res = vbic_s8(masked, zeroMask);
7327 #define _mm_blend_epi16(a, b, imm) \ 7329 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ 7330 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ 7331 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ 7332 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ 7333 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ 7334 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ 7335 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ 7336 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ 7337 uint16x8_t _mask_vec = vld1q_u16(_mask); \ 7338 uint16x8_t _a = vreinterpretq_u16_m128i(a); \ 7339 uint16x8_t _b = vreinterpretq_u16_m128i(b); \ 7340 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ 7346 #define _mm_blend_pd(a, b, imm) \ 7348 const uint64_t _mask[2] = { \ 7349 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ 7350 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ 7351 uint64x2_t _mask_vec = vld1q_u64(_mask); \ 7352 uint64x2_t _a = vreinterpretq_u64_m128d(a); \ 7353 uint64x2_t _b = vreinterpretq_u64_m128d(b); \ 7354 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ 7367uint32x4_t
mask= vld1q_u32(
data);
7401 #if defined(__aarch64__) 7402float64x2_t
a= vreinterpretq_f64_m128d(_a);
7403float64x2_t
b= vreinterpretq_f64_m128d(_b);
7404 returnvreinterpretq_m128d_f64(vbslq_f64(
mask,
b,
a));
7431 #if defined(__aarch64__) 7432 returnvreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(
a)));
7434 double*
f= (
double*) &
a;
7445 #if defined(__aarch64__) 7448 float*
f= (
float*) &
a;
7449 return _mm_set_ps(ceilf(
f[3]), ceilf(
f[2]), ceilf(
f[1]), ceilf(
f[0]));
7481 #if defined(__aarch64__) 7489uint32x4_t swapped = vrev64q_u32(
cmp);
7507int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7508int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7525int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7534int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7535int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7544int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7545int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7546int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7563uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7564uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7582uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7592uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7593uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7602uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7603uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7604uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7617 #if !SSE2NEON_PRECISE_DP 7622 #if !SSE2NEON_PRECISE_DP 7628 #if defined(__aarch64__) 7629 doubled0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(
a), 0) *
7630vgetq_lane_f64(vreinterpretq_f64_m128d(
b), 0)
7632 doubled1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(
a), 1) *
7633vgetq_lane_f64(vreinterpretq_f64_m128d(
b), 1)
7636 doubled0 = (imm & 0x10) ? ((
double*) &
a)[0] * ((
double*) &
b)[0] : 0;
7637 doubled1 = (imm & 0x20) ? ((
double*) &
a)[1] * ((
double*) &
b)[1] : 0;
7642 #if defined(__aarch64__) 7643 doublesum = vpaddd_f64(vreinterpretq_f64_m128d(
tmp));
7645 doublesum = *((
double*) &
tmp) + *(((
double*) &
tmp) + 1);
7660 #if defined(__aarch64__) 7690(imm & 0x1) ? s : 0,
7691(imm & 0x2) ? s : 0,
7692(imm & 0x4) ? s : 0,
7693(imm & 0x8) ? s : 0,
7701 #define _mm_extract_epi32(a, imm) \ 7702 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) 7707 #define _mm_extract_epi64(a, imm) \ 7708 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) 7714 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) 7718 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) 7726 #if defined(__aarch64__) 7727 returnvreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(
a)));
7729 double*
f= (
double*) &
a;
7740 #if defined(__aarch64__) 7743 float*
f= (
float*) &
a;
7744 return _mm_set_ps(floorf(
f[3]), floorf(
f[2]), floorf(
f[1]), floorf(
f[0]));
7776 #define _mm_insert_epi32(a, b, imm) \ 7778 vreinterpretq_m128i_s32( \ 7779 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ 7786 #define _mm_insert_epi64(a, b, imm) \ 7788 vreinterpretq_m128i_s64( \ 7789 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ 7796 #define _mm_insert_epi8(a, b, imm) \ 7798 vreinterpretq_m128i_s8( \ 7799 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ 7806 #define _mm_insert_ps(a, b, imm8) \ 7808 float32x4_t tmp1 = \ 7809 vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \ 7810 vreinterpretq_f32_m128(a), 0); \ 7811 float32x4_t tmp2 = \ 7812 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \ 7813 ((imm8 >> 4) & 0x3)); \ 7814 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ 7815 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ 7816 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ 7817 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \ 7818 uint32x4_t mask = vld1q_u32(data); \ 7819 float32x4_t all_zeros = vdupq_n_f32(0); \ 7821 vreinterpretq_m128_f32( \ 7822 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \ 7934 #if defined(__aarch64__) 7949 for(
i= 0;
i< 8;
i++) {
    // _mm_mpsadbw_epu8: eight sums of absolute differences of 4-byte windows
    // of a against one 4-byte block of b
    switch (imm & 0x4) {
    /* ... case 0 / case 4: choose the window base within a ... */
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#endif
        break;
    }

    switch (imm & 0x3) {
    case 0:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
        break;
    case 1:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
        break;
    case 2:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
        break;
    case 3:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#endif
        break;
    }

    int16x8_t c04, c15, c26, c37;
    uint8x8_t low_b = vget_low_u8(_b);
    c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
    _a = vextq_u8(_a, _a, 1);
    c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
#if defined(__aarch64__)
    // |0|4|2|6|
    c04 = vpaddq_s16(c04, c26);
    // |1|5|3|7|
    c15 = vpaddq_s16(c15, c37);

    int32x4_t trn1_c =
        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    int32x4_t trn2_c =
        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_s32(trn1_c),
                                             vreinterpretq_s16_s32(trn2_c)));
#else
    int16x4_t c01, c23, c45, c67;
    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));

    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
#endif
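/*
 * Illustrative sketch (not part of sse2neon): a scalar reference for the
 * MPSADBW computation emulated above.  Eight overlapping 4-byte windows of a
 * (offset chosen by imm bit 2) are compared against one 4-byte block of b
 * (chosen by imm bits 1:0), giving eight 16-bit sums of absolute differences.
 * The function name is hypothetical.
 */
static inline void _sse2neon_ref_mpsadbw(const uint8_t a[16],
                                         const uint8_t b[16],
                                         int imm,
                                         uint16_t out[8])
{
    const uint8_t *pa = a + ((imm & 0x4) ? 4 : 0);
    const uint8_t *pb = b + (imm & 0x3) * 4;
    for (int i = 0; i < 8; i++) {
        uint16_t sum = 0;
        for (int j = 0; j < 4; j++) {
            int d = (int) pa[i + j] - (int) pb[j];
            sum += (uint16_t) (d < 0 ? -d : d);
        }
        out[i] = sum;
    }
}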
    // _mm_round_pd (A64): dispatch the SSE rounding mode to the matching
    // NEON round instruction
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    /* ... _MM_FROUND_TO_NEG_INF / _MM_FROUND_TO_POS_INF ... */
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    default:  // _MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
    }
    // _mm_round_pd (ARMv7 fallback): round-to-nearest-even in plain C, with
    // floor/ceil/truncate for the remaining modes
    double *v_double = (double *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        double res[2], tmp;
        for (int i = 0; i < 2; i++) {
            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
            double roundDown = floor(tmp);
            double roundUp = ceil(tmp);
            double diffDown = tmp - roundDown;
            double diffUp = roundUp - tmp;
            if (diffDown < diffUp) {
                /* Closer to the rounded-down value, so use it */
                res[i] = roundDown;
            } else if (diffDown > diffUp) {
                /* Closer to the rounded-up value, so use it */
                res[i] = roundUp;
            } else {
                /* Equidistant: pick whichever candidate is even */
                double half = roundDown / 2;
                if (half != floor(half)) {
                    /* roundDown is odd, so return roundUp */
                    res[i] = roundUp;
                } else {
                    /* roundUp is odd, so return roundDown */
                    res[i] = roundDown;
                }
            }
            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
        }
        return _mm_set_pd(res[1], res[0]);
    }
    /* ... _MM_FROUND_TO_NEG_INF / _MM_FROUND_TO_POS_INF handled via
     * _mm_floor_pd / _mm_ceil_pd ... */
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
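/*
 * Illustrative usage sketch (not part of sse2neon): with
 * _MM_FROUND_TO_NEAREST_INT the code above rounds halfway cases to the
 * nearest even integer, matching SSE4.1 ROUNDPD.  The function name is
 * hypothetical.
 */
static inline int _sse2neon_round_pd_ties_to_even_example(void)
{
    __m128d v = _mm_set_pd(3.5, 2.5); /* both halfway between two integers */
    __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    double out[2];
    _mm_storeu_pd(out, r);
    return out[0] == 2.0 && out[1] == 4.0; /* 2.5 -> 2, 3.5 -> 4 */
}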
    // _mm_round_ps (A64 path elided; ARMv7 fallback below)
#if defined(__aarch64__)
    /* ... vrndnq_f32 / vrndq_f32 / vrndiq_f32, as for _mm_round_pd ... */
#else
    float *v_float = (float *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* delta = a - [a] */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128_f32(
            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
    }
    /* ... _MM_FROUND_TO_NEG_INF / _MM_FROUND_TO_POS_INF handled via
     * _mm_floor_ps / _mm_ceil_ps ... */
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
#endif
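/*
 * Illustrative sketch (not part of sse2neon): scalar form of the halfway-case
 * trick used above.  For a value exactly between two integers, truncating,
 * conditionally adding one for positive inputs, and clearing bit 0 yields the
 * nearest even integer; the vector code selects this result only where
 * delta == +/-0.5 (is_delta_half).  The function name is hypothetical.
 */
static inline int32_t _sse2neon_halfway_to_even_scalar(float a)
{
    int32_t r_trunc = (int32_t) a;                             /* [a], toward zero */
    int32_t plusone = (int32_t) ((uint32_t) (-r_trunc) >> 31); /* 1 iff [a] > 0 */
    return (r_trunc + plusone) & ~1;                           /* force even */
}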
#if __has_builtin(__builtin_nontemporal_store)
    return __builtin_nontemporal_load(p);
    // _mm_test_all_ones
    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
           ~(uint64_t) 0;

    // _mm_test_all_zeros
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));

    // _mm_test_mix_ones_zeros
    uint64x2_t result = vandq_u64(zf, cf);
    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));

    // _mm_testc_si128
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));

#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)

    // _mm_testz_si128
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
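/*
 * Illustrative usage sketch (not part of sse2neon): _mm_testz_si128 returns 1
 * when (a & b) is all zeros, so testing a vector against itself is a cheap
 * "is this register entirely zero?" check.  The function name is hypothetical.
 */
static inline int _sse2neon_is_all_zero_example(__m128i v)
{
    return _mm_testz_si128(v, v); /* 1 if every bit of v is 0 */
}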
#if defined(__aarch64__)
    /* ... */
#endif

    // _mm_crc32_u16
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
#endif
    // _mm_crc32_u32
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
#endif
    // _mm_crc32_u64
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
#endif
    // _mm_crc32_u8
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
#else
    // Bit-serial update over the reflected CRC-32C polynomial
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
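/*
 * Illustrative usage sketch (not part of sse2neon): computing a CRC-32C
 * checksum of a byte buffer with _mm_crc32_u8, which maps to the hardware
 * CRC32C instructions when available and to the bit loop above otherwise.
 * The initial/final inversion follows the usual CRC-32C convention and, like
 * the function name, is an assumption of this example.
 */
static inline uint32_t _sse2neon_crc32c_buffer_example(const uint8_t *buf,
                                                       int len)
{
    uint32_t crc = 0xFFFFFFFF;
    for (int i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, buf[i]);
    return crc ^ 0xFFFFFFFF;
}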
/* AES building blocks used when the Crypto extension is unavailable */
#if !defined(__ARM_FEATURE_CRYPTO)

#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
    }

/* X Macro trick: expand the table once as plain bytes for the S-box */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

// _mm_aesenc_si128 without the Crypto extension (A64): ShiftRows, SubBytes
// and MixColumns built from table lookups and shifts
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* ... sub bytes via table lookups into SSE2NEON_sbox ... */

    // mix columns
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
    SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
    SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
    SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
    SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
};

#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    // _mm_aesenc_si128 (ARMv7): combine SubBytes/ShiftRows/MixColumns through
    // the four pre-expanded T-tables, one lookup per input byte
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
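/*
 * Illustrative sketch (not part of sse2neon): the F2/F3 helpers above encode
 * multiplication by 2 and 3 in AES's GF(2^8).  A plain xtime() shows the same
 * doubling with reduction by the field polynomial 0x11b.  The function name
 * is hypothetical.
 */
static inline uint8_t _sse2neon_xtime_example(uint8_t x)
{
    return (uint8_t) ((x << 1) ^ ((x & 0x80) ? 0x1b : 0x00));
}
/* e.g. _sse2neon_xtime_example(0x80) == 0x1b */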
    // _mm_aesenclast_si128: apply the ShiftRows+SubBytes result in v[][] and
    // add the round key byte by byte
    for (int i = 0; i < 16; i++)
        vreinterpretq_nth_u8_m128i(a, i) =
            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);

    // _mm_aeskeygenassist_si128 (no Crypto extension): run the S-box over the
    // two selected words, then rotate and fold in the round constant
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#undef SSE2NEON_AES_DATA

    // _mm_aeskeygenassist_si128 (Crypto extension): AESE already performed
    // ShiftRows+SubBytes; undo the shift, pick out X1/X3 and their rotations,
    // then xor in the round constant
    uint8x16_t dest = {
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
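/*
 * Illustrative usage sketch (not part of sse2neon): one AES-128 key-expansion
 * step built on _mm_aeskeygenassist_si128, in the form commonly written with
 * the x86 AES-NI intrinsics.  The 0xFF shuffle broadcasts the rotated,
 * substituted X3 word; the xors fold in the previous round key.  The function
 * name is hypothetical.
 */
static inline __m128i _sse2neon_aes128_expand_step_example(__m128i key,
                                                           __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, 0xFF);
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, assist);
}
/* e.g. round key 1: _sse2neon_aes128_expand_step_example(k0,
 *                       _mm_aeskeygenassist_si128(k0, 0x01)); */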
    // _mm_clmulepi64_si128: select which 64-bit halves of a and b to multiply
    switch (imm & 0x11) {
#if defined(__aarch64__)
    /* ... */
#endif

    // Read the current FPCR (FPSCR on ARMv7) control register into r.value
#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value));
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value));
#endif
    // _mm_popcnt_u32
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
    // _mm_popcnt_u64
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);

    vst1_u64(&count, count64x1_val);
    return count;
#endif
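/*
 * Illustrative usage sketch (not part of sse2neon): a typical consumer of the
 * popcount emulations above is counting matches in a byte-compare mask.  The
 * function name is hypothetical.
 */
static inline int _sse2neon_count_matching_bytes_example(__m128i a, __m128i b)
{
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
    return _mm_popcnt_u32((unsigned int) mask); /* number of equal byte lanes */
}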
    // Read-modify-write of the FPCR (FPSCR on ARMv7) control register
#if defined(__aarch64__)
    /* ... */
#endif

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    /* ... update the relevant control bit in r ... */

#if defined(__aarch64__)
    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
#else
    asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
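/*
 * Illustrative usage sketch (not part of sse2neon): the mode helpers built on
 * the register access above map the x86 MXCSR denormals-are-zero control onto
 * the corresponding FPCR/FPSCR bit, so setting the mode should read back the
 * same value.  The function name is hypothetical.
 */
static inline int _sse2neon_enable_daz_example(void)
{
    _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
    return _sse2neon_mm_get_denormals_zero_mode() == _MM_DENORMALS_ZERO_ON;
}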
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif
static __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
static __m128i _mm_max_epi16(__m128i a, __m128i b)
static __m128i _mm_packs_epi16(__m128i a, __m128i b)
static __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
static __m128 _mm_floor_ss(__m128 a, __m128 b)
static float _mm_cvtss_f32(__m128 a)
static __m128i _mm_subs_epu16(__m128i a, __m128i b)
#define vreinterpret_m64_f32(x)
static __m128i _mm_abs_epi32(__m128i a)
static __m128i _mm_undefined_si128(void)
static void _mm_stream_si64(int64_t *p, int64_t a)
#define _MM_FROUND_TO_POS_INF
static void _mm_storeh_pi(__m64 *p, __m128 a)
static __m128i _mm_cvtepi16_epi32(__m128i a)
static __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
#define vreinterpretq_u32_m128d(x)
static __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
#define vreinterpret_m64_s32(x)
static __m64 _mm_cvtps_pi8(__m128 a)
static __m128d _mm_div_pd(__m128d a, __m128d b)
static __m128i _mm_setzero_si128()
static __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
static __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
#define vreinterpretq_m128_s32(x)
static __m128 _mm_xor_ps(__m128 a, __m128 b)
static __m128i _mm_avg_epu16(__m128i a, __m128i b)
static __m128i _mm_abs_epi16(__m128i a)
static __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
static __m128i _mm_min_epi8(__m128i a, __m128i b)
static __m128 _mm_cmple_ss(__m128 a, __m128 b)
static __m128d _mm_loadu_pd(const double *p)
static unsigned int _mm_getcsr()
static void _mm_free(void *addr)
static __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
static __m128i _mm_mullo_epi32(__m128i a, __m128i b)
static __m64 _mm_cvt_ps2pi(__m128 a)
static __m128i _mm_max_epu8(__m128i a, __m128i b)
static __m128i _mm_add_epi8(__m128i a, __m128i b)
static __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
static __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
static int _mm_comile_sd(__m128d a, __m128d b)
static int _mm_test_all_ones(__m128i a)
static __m128i _mm_max_epi8(__m128i a, __m128i b)
static __m128 _mm_max_ss(__m128 a, __m128 b)
#define SSE2NEON_AES_U2(p)
#define vreinterpretq_m128_f32(x)
static int _mm_comieq_ss(__m128 a, __m128 b)
static __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
static __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
static __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
static __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
static int64_t _mm_cvttss_si64(__m128 a)
static __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
static __m128i _mm_xor_si128(__m128i a, __m128i b)
static __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
static uint32_t _mm_crc32_u8(uint32_t, uint8_t)
static __m128i _mm_adds_epi16(__m128i a, __m128i b)
#define vreinterpretq_m128i_s8(x)
static __m128d _mm_xor_pd(__m128d a, __m128d b)
static __m128i _mm_sub_epi16(__m128i a, __m128i b)
static __m64 _mm_min_pu8(__m64 a, __m64 b)
static __m128i _mm_shuffle_epi_3332(__m128i a)
static __m128i _mm_loadu_si32(const void *p)
static __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
static __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
#define vreinterpret_m64_s8(x)
static __m128i _mm_castps_si128(__m128)
static void _MM_SET_ROUNDING_MODE(int rounding)
static __m128d _mm_move_sd(__m128d, __m128d)
static __m128i _mm_sad_epu8(__m128i a, __m128i b)
#define vreinterpret_u16_m64(x)
static __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
#define vreinterpretq_m128i_u64(x)
static int64_t _mm_cvtss_si64(__m128 a)
static __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
static __m128d _mm_set1_pd(double d)
static void _mm_storer_ps(float *p, __m128 a)
#define vreinterpret_m64_s64(x)
static __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
static int _mm_comige_ss(__m128 a, __m128 b)
static __m128i _mm_sub_epi64(__m128i a, __m128i b)
static __m128 _mm_cmple_ps(__m128 a, __m128 b)
static __m64 _mm_abs_pi8(__m64 a)
#define vreinterpretq_u8_m128i(x)
static void _mm_storel_pi(__m64 *p, __m128 a)
#define vreinterpretq_s16_m128i(x)
static __m64 _mm_max_pu8(__m64 a, __m64 b)
#define _MM_FLUSH_ZERO_MASK
static __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
#define vreinterpretq_u64_m128d(x)
static __m128d _mm_cmpge_sd(__m128d a, __m128d b)
#define vreinterpretq_s8_m128(x)
static int64_t _mm_cvtsi128_si64(__m128i a)
static void _mm_stream_si128(__m128i *p, __m128i a)
static __m128i _mm_sll_epi32(__m128i a, __m128i count)
static __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
static unsigned int _sse2neon_mm_get_flush_zero_mode()
static __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
static __m128d _mm_castsi128_pd(__m128i a)
static __m128 _mm_loadu_ps(const float *p)
static uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
static __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
static __m128d _mm_load1_pd(const double *p)
static void _mm_empty(void)
static __m128i _mm_min_epu8(__m128i a, __m128i b)
static __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
static __m128i _mm_shuffle_epi_2301(__m128i a)
static __m128 _mm_load_ps(const float *p)
#define vreinterpret_m64_u8(x)
static __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
static __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
static __m128 _mm_cmpord_ps(__m128 a, __m128 b)
static void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
#define vreinterpretq_m128d_s32(x)
static __m128 _mm_set_ss(float a)
static int _mm_cvt_ss2si(__m128 a)
static __m128 _mm_and_ps(__m128 a, __m128 b)
static int _mm_cvtsi128_si32(__m128i a)
static __m128i _mm_set1_epi8(signed char w)
static __m128i _mm_loadu_si64(const void *p)
static __m128d _mm_ceil_sd(__m128d a, __m128d b)
static __m128 _mm_min_ps(__m128 a, __m128 b)
static void _mm_storel_pd(double *mem_addr, __m128d a)
static __m128 _mm_mul_ss(__m128 a, __m128 b)
static __m128 _mm_mul_ps(__m128 a, __m128 b)
static __m128i _mm_cvtsi32_si128(int a)
static __m128i _mm_cvttpd_epi32(__m128d a)
static __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
static int _mm_popcnt_u32(unsigned int a)
#define _sse2neon_unlikely(x)
static __m128 _mm_cvtepi32_ps(__m128i a)
static __m128d _mm_round_pd(__m128d, int)
static __m128i _mm_cvtepu16_epi64(__m128i a)
static __m128 _mm_load_ss(const float *p)
static __m128d _mm_setzero_pd(void)
static __m128i _mm_add_epi16(__m128i a, __m128i b)
static void _mm_stream_ps(float *p, __m128 a)
static __m128 _mm_cmpord_ss(__m128 a, __m128 b)
#define vreinterpretq_f32_m128i(x)
static __m128d _mm_cmpord_sd(__m128d a, __m128d b)
static __m128i _mm_mul_epi32(__m128i a, __m128i b)
static __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
static __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
static __m128 _mm_moveldup_ps(__m128 a)
static __m128i _mm_srli_si128(__m128i a, int imm)
static __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
#define vreinterpret_m64_u16(x)
#define SSE2NEON_AES_DATA(w)
static __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
static __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
static __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
static __m128 _mm_undefined_ps(void)
static __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
#define _MM_DENORMALS_ZERO_OFF
static __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
static __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
static __m128i _mm_cvtepi16_epi64(__m128i a)
static __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
static __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
static __m128i _mm_packus_epi32(__m128i a, __m128i b)
#define vreinterpretq_m128_u32(x)
#define SSE2NEON_AES_U0(p)
static int _mm_comineq_ss(__m128 a, __m128 b)
#define vreinterpretq_s32_m128i(x)
static __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
#define vreinterpret_s64_m64(x)
static __m128 _mm_hadd_ps(__m128 a, __m128 b)
static __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
static __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
static __m128i _mm_cvttps_epi32(__m128 a)
#define vreinterpretq_m128i_s16(x)
#define vreinterpretq_f32_m128d(x)
static __m128i _mm_set_epi64x(int64_t, int64_t)
static __m128d _mm_or_pd(__m128d a, __m128d b)
static uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
static __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
static __m128i _mm_avg_epu8(__m128i a, __m128i b)
#define vreinterpretq_m128d_s64(x)
static void _mm_stream_pi(__m64 *p, __m64 a)
static __m128i _mm_cvtepu8_epi16(__m128i a)
static __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
static void _mm_setcsr(unsigned int a)
static uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
static int _mm_movemask_pd(__m128d a)
static __m128d _mm_load_sd(const double *p)
static __m64 _mm_avg_pu16(__m64 a, __m64 b)
static __m128 _mm_castsi128_ps(__m128i a)
static int _mm_comineq_sd(__m128d a, __m128d b)
static __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
static __m128i _mm_min_epi32(__m128i a, __m128i b)
#define vreinterpretq_s8_m128i(x)
static __m128i _mm_slli_epi64(__m128i a, int imm)
static __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
static __m64 _mm_cvtt_ps2pi(__m128 a)
static __m128i _mm_madd_epi16(__m128i a, __m128i b)
static __m128i _mm_srl_epi16(__m128i a, __m128i count)
static __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
static __m64 _mm_max_pi16(__m64 a, __m64 b)
static __m128d _mm_div_sd(__m128d a, __m128d b)
static __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
static __m128 _mm_rcp_ps(__m128 in)
static __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
static __m128d _mm_cmpge_pd(__m128d a, __m128d b)
static __m128 _mm_add_ss(__m128 a, __m128 b)
static __m64 _mm_sad_pu8(__m64 a, __m64 b)
static int _mm_movemask_epi8(__m128i a)
#define vreinterpretq_s64_m128d(x)
static __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
static __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
static __m128 _mm_cvt_si2ss(__m128 a, int b)
static __m128d _mm_cmple_pd(__m128d a, __m128d b)
static __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
#define _MM_DENORMALS_ZERO_ON
static __m128d _mm_load_pd(const double *p)
static void _mm_sfence(void)
static __m128i _mm_sll_epi64(__m128i a, __m128i count)
static void _mm_storeu_si16(void *p, __m128i a)
static __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
static int _mm_comilt_ss(__m128 a, __m128 b)
#define vreinterpretq_m128i_u8(x)
static __m128 _mm_sqrt_ss(__m128 in)
#define _MM_ROUND_NEAREST
static __m128 _mm_set_ps(float w, float z, float y, float x)
static __m128i _mm_loadu_si16(const void *p)
static __m128d _mm_sqrt_sd(__m128d a, __m128d b)
#define vreinterpret_u32_m64(x)
#define vreinterpretq_nth_u64_m128i(x, n)
static __m128i _mm_srl_epi32(__m128i a, __m128i count)
static __m128i _mm_slli_si128(__m128i a, int imm)
#define vreinterpretq_m128d_f32(x)
static __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
#define vreinterpretq_f64_m128i(x)
static __m128 _mm_or_ps(__m128, __m128)
static __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
static __m128 _mm_andnot_ps(__m128 a, __m128 b)
static __m128 _mm_move_ss(__m128, __m128)
static __m128d _mm_cmple_sd(__m128d a, __m128d b)
static __m128d _mm_addsub_pd(__m128d a, __m128d b)
static __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
#define vreinterpret_m64_s16(x)
static __m128 _mm_cmpge_ss(__m128 a, __m128 b)
static __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
static __m128i _mm_shuffle_epi_2103(__m128i a)
static __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
static __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
static __m128d _mm_movedup_pd(__m128d a)
static __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
static __m128i _mm_adds_epi8(__m128i a, __m128i b)
static __m128i _mm_movpi64_epi64(__m64 a)
static __m64 _mm_add_si64(__m64 a, __m64 b)
static __m128i _mm_srai_epi16(__m128i a, int imm)
static __m128i _mm_set1_epi16(short w)
static __m128i _mm_add_epi64(__m128i a, __m128i b)
static __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
static __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
static __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
static void * _mm_malloc(size_t size, size_t align)
static __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
static __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
static __m128i _mm_slli_epi16(__m128i a, int imm)
static int _mm_cvtt_ss2si(__m128 a)
static __m128i _mm_set_epi64(__m64 i1, __m64 i2)
static unsigned int _sse2neon_mm_get_denormals_zero_mode()
static __m128i _mm_adds_epu8(__m128i a, __m128i b)
static __m128i _mm_sll_epi16(__m128i a, __m128i count)
static int _mm_movemask_ps(__m128 a)
static __m128d _mm_mul_pd(__m128d a, __m128d b)
#define _MM_FROUND_TO_NEG_INF
static __m128i _mm_subs_epi8(__m128i a, __m128i b)
static __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
static int _mm_comigt_sd(__m128d a, __m128d b)
static int _mm_test_all_zeros(__m128i a, __m128i mask)
static __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
static __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
static int _mm_comile_ss(__m128 a, __m128 b)
#define vreinterpretq_u64_m128(x)
static void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
static __m128 _mm_ceil_ss(__m128 a, __m128 b)
static __m128 _mm_movehdup_ps(__m128 a)
static __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
static __m128i _mm_mullo_epi16(__m128i a, __m128i b)
static __m128d _mm_hadd_pd(__m128d a, __m128d b)
static __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
static void _mm_store_pd1(double *mem_addr, __m128d a)
static __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
static __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
static __m128d _mm_sub_pd(__m128d a, __m128d b)
static void _mm_store_ps1(float *p, __m128 a)
static __m128d _mm_max_pd(__m128d a, __m128d b)
static __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
static __m128i _mm_max_epi32(__m128i a, __m128i b)
static int _mm_comilt_sd(__m128d a, __m128d b)
static __m128i _mm_slli_epi32(__m128i a, int imm)
#define vreinterpretq_s32_m128(x)
static __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
static __m128i _mm_loadu_si128(const __m128i *p)
static __m128i _mm_minpos_epu16(__m128i a)
static __m128 _mm_set1_ps(float _w)
static __m128d _mm_setr_pd(double e1, double e0)
#define vreinterpretq_u32_m128i(x)
static __m128d _mm_floor_sd(__m128d a, __m128d b)
static __m128i _mm_sra_epi32(__m128i a, __m128i count)
static __m128 _mm_addsub_ps(__m128 a, __m128 b)
static void _mm_store_si128(__m128i *p, __m128i a)
static __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
static __m128 _mm_rsqrt_ps(__m128 in)
static void _mm_store_sd(double *mem_addr, __m128d a)
static __m128 _mm_setr_ps(float w, float z, float y, float x)
static __m128 _mm_cmplt_ss(__m128 a, __m128 b)
static __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
static __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
#define vreinterpretq_m128i_s64(x)
#define vreinterpretq_m128_s64(x)
static int _mm_testc_si128(__m128i a, __m128i b)
#define vreinterpretq_u32_m128(x)
static __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
static __m128d _mm_min_pd(__m128d a, __m128d b)
static void _mm_prefetch(const void *p, int i)
static __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
static __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
static __m128i _mm_cvtepu16_epi32(__m128i a)
static __m128d _mm_min_sd(__m128d a, __m128d b)
static unsigned int _MM_GET_ROUNDING_MODE()
static void _mm_storeu_si64(void *p, __m128i a)
static __m64 _mm_min_pi16(__m64 a, __m64 b)
static __m128d _mm_floor_pd(__m128d)
static __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
static void _mm_clflush(void const *p)
static __m128i _mm_cvtepi8_epi32(__m128i a)
static void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
static __m128i _mm_cvtepu8_epi64(__m128i a)
#define vreinterpret_s32_m64(x)
static __m128d _mm_cvtepi32_pd(__m128i a)
#define vreinterpretq_m128d_u32(x)
static __m128i _mm_shuffle_epi_0101(__m128i a)
static __m128i _mm_cvtps_epi32(__m128)
static __m64 _mm_cvttpd_pi32(__m128d a)
static __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
static __m128i _mm_subs_epi16(__m128i a, __m128i b)
#define vreinterpret_m64_u64(x)
static __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
#define vreinterpretq_u64_m128i(x)
static __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
static __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
static __m128i _mm_min_epi16(__m128i a, __m128i b)
static __m64 _mm_avg_pu8(__m64 a, __m64 b)
static __m128 _mm_load1_ps(const float *p)
static void _mm_storer_pd(double *mem_addr, __m128d a)
static __m128i _mm_load_si128(const __m128i *p)
static __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
static __m64 _mm_hadd_pi32(__m64 a, __m64 b)
static __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
#define _MM_ROUND_TOWARD_ZERO
static __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
static __m128d _mm_andnot_pd(__m128d a, __m128d b)
static __m128i _mm_packs_epi32(__m128i a, __m128i b)
static __m128i _mm_abs_epi8(__m128i a)
static __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
static __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
static void _mm_store_ss(float *p, __m128 a)
static __m128i _mm_or_si128(__m128i, __m128i)
static int32_t _mm_cvttsd_si32(__m128d a)
static void _mm_storeu_pd(double *mem_addr, __m128d a)
static double _mm_cvtsd_f64(__m128d a)
static __m128 _mm_cmplt_ps(__m128 a, __m128 b)
static void _mm_store_ps(float *p, __m128 a)
static __m128i _mm_cvtepu32_epi64(__m128i a)
static __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
static __m128i _mm_min_epu32(__m128i a, __m128i b)
static int _mm_testz_si128(__m128i a, __m128i b)
static __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
static __m128d _mm_cmpord_pd(__m128d a, __m128d b)
static __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
static __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
static __m128d _mm_cvtps_pd(__m128 a)
static __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
static __m128i _mm_add_epi32(__m128i a, __m128i b)
static __m64 _mm_hadd_pi16(__m64 a, __m64 b)
static __m128d _mm_cmplt_pd(__m128d a, __m128d b)
#define _MM_FROUND_TO_NEAREST_INT
static __m128i _mm_subs_epu8(__m128i a, __m128i b)
static int _mm_movemask_pi8(__m64 a)
#define vreinterpretq_m128d_u64(x)
static __m128i _mm_cvtepu8_epi32(__m128i a)
static __m128 _mm_round_ps(__m128, int)
static __m128i _mm_set_epi32(int, int, int, int)
static __m128i _mm_shuffle_epi_2211(__m128i a)
static __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
static __m128 _mm_div_ss(__m128 a, __m128 b)
static __m128i _mm_cvtepi32_epi64(__m128i a)
static __m128i _mm_cvtpd_epi32(__m128d a)
#define SSE2NEON_AES_U3(p)
#define vreinterpret_s16_m64(x)
static __m128 _mm_sub_ss(__m128 a, __m128 b)
static uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
static __m128 _mm_min_ss(__m128 a, __m128 b)
static int _mm_comieq_sd(__m128d a, __m128d b)
#define vreinterpretq_nth_u8_m128i(x, n)
static void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
#define vreinterpretq_m128i_u32(x)
static __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
static __m64 _mm_movepi64_pi64(__m128i a)
static __m128d _mm_loadr_pd(const double *p)
static __m128 _mm_cvtpd_ps(__m128d a)
static __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
static int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
#define SSE2NEON_AES_U1(p)
static __m128 _mm_cvtpu16_ps(__m64 a)
static __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
static __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
static __m64 _mm_cvtpd_pi32(__m128d a)
static __m128 _mm_max_ps(__m128 a, __m128 b)
static __m128i _mm_sub_epi32(__m128i a, __m128i b)
static __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
static __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
static __m128i _mm_sub_epi8(__m128i a, __m128i b)
static __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
#define vreinterpretq_m128i_s32(x)
static __m128 _mm_cmpge_ps(__m128 a, __m128 b)
static int64_t _mm_cvttsd_si64(__m128d a)
static __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
static __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
static __m128d _mm_undefined_pd(void)
static __m128i _mm_shuffle_epi_1032(__m128i a)
static __m128 _mm_rsqrt_ss(__m128 in)
static __m128i _mm_shuffle_epi_1010(__m128i a)
static const uint8_t SSE2NEON_sbox[256]
static __m128d _mm_ceil_pd(__m128d)
static __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
static __m128d _mm_loadh_pd(__m128d a, const double *p)
#define vreinterpretq_f32_m128(x)
static __m128i _mm_set1_epi32(int)
static __m64 _mm_cvtps_pi16(__m128 a)
static __m128d _mm_loadl_pd(__m128d a, const double *p)
static __m128i _mm_move_epi64(__m128i a)
static __m128i _mm_cvtepi8_epi64(__m128i a)
static int64_t _mm_popcnt_u64(uint64_t a)
static __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
static __m128d _mm_cmplt_sd(__m128d a, __m128d b)
static __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
static __m128i _mm_sra_epi16(__m128i a, __m128i count)
static __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
static __m128d _mm_sub_sd(__m128d a, __m128d b)
static __m128i _mm_castpd_si128(__m128d a)
static void _mm_stream_pd(double *p, __m128d a)
#define _MM_FROUND_TO_ZERO
static __m128i _mm_shuffle_epi_0321(__m128i a)
static __m128d _mm_set_pd(double, double)
static __m128i _mm_stream_load_si128(__m128i *p)
static __m128 _mm_sub_ps(__m128 a, __m128 b)
static __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
static __m128d _mm_cvtss_sd(__m128d a, __m128 b)
static int _mm_comige_sd(__m128d a, __m128d b)
static __m128 _mm_div_ps(__m128 a, __m128 b)
static int32_t _mm_cvtsd_si32(__m128d a)
static __m64 _mm_mul_su32(__m64 a, __m64 b)
#define _MM_FLUSH_ZERO_ON
static __m128d _mm_sqrt_pd(__m128d a)
static __m128i _mm_adds_epu16(__m128i a, __m128i b)
static __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
#define _MM_DENORMALS_ZERO_MASK
static __m128i _mm_andnot_si128(__m128i a, __m128i b)
static __m128 _mm_ceil_ps(__m128)
#define vreinterpretq_m128i_u16(x)
static __m128d _mm_castps_pd(__m128 a)
static __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
static __m64 _mm_sub_si64(__m64 a, __m64 b)
static __m128d _mm_add_pd(__m128d a, __m128d b)
static __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
static __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
static __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
#define _MM_FROUND_CUR_DIRECTION
static __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
static __m128 _mm_floor_ps(__m128)
static __m128d _mm_cvtpi32_pd(__m64 a)
static void _mm_storeu_si128(__m128i *p, __m128i a)
static __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
static __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
static __m64 _mm_abs_pi32(__m64 a)
#define vreinterpretq_u16_m128i(x)
static __m128d _mm_set_sd(double a)
static __m128 _mm_rcp_ss(__m128 a)
static __m128i _mm_cvtepi8_epi16(__m128i a)
static void _mm_storel_epi64(__m128i *a, __m128i b)
static void _mm_stream_si32(int *p, int a)
static __m128i _mm_set1_epi64x(int64_t _i)
static __m128d _mm_and_pd(__m128d a, __m128d b)
static void _sse2neon_kadd_f32(float *sum, float *c, float y)
static __m128i _mm_loadl_epi64(__m128i const *p)
static __m128i _mm_set1_epi64(__m64 _i)
static void _mm_store_pd(double *mem_addr, __m128d a)
static __m64 _mm_abs_pi16(__m64 a)
static __m128 _mm_cvtpi8_ps(__m64 a)
static __m128i _mm_cvtsi64_si128(int64_t a)
static __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
static __m128i _mm_shuffle_epi_1001(__m128i a)
static __m128d _mm_max_sd(__m128d a, __m128d b)
#define vreinterpret_s8_m64(x)
static __m128d _mm_mul_sd(__m128d a, __m128d b)
static __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
static void _mm_storeh_pd(double *mem_addr, __m128d a)
static __m128 _mm_sqrt_ps(__m128 in)
static __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
#define _mm_shuffle_epi32(a, imm)
static __m128d _mm_add_sd(__m128d a, __m128d b)
static void _mm_storeu_ps(float *p, __m128 a)
#define vreinterpret_u8_m64(x)
static __m128 _mm_set_ps1(float)
static __m128i _mm_srl_epi64(__m128i a, __m128i count)
static __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
static __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
static int64_t _mm_cvtsd_si64(__m128d a)
static __m128 _mm_loadr_ps(const float *p)
static __m128i _mm_min_epu16(__m128i a, __m128i b)
static __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
static __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
static __m128 _mm_castpd_ps(__m128d a)
static __m128i _mm_mul_epu32(__m128i a, __m128i b)
static __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
static __m128 _mm_cvtpu8_ps(__m64 a)
static __m128i _mm_and_si128(__m128i, __m128i)
#define SSE2NEON_AES_H0(x)
static void _mm_storeu_si32(void *p, __m128i a)
static __m128i _mm_max_epu32(__m128i a, __m128i b)
static __m128i _mm_shuffle_epi_0122(__m128i a)
static __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
static __m128 _mm_add_ps(__m128 a, __m128 b)
static int _mm_comigt_ss(__m128 a, __m128 b)
#define _MM_FROUND_NO_EXC
#define vreinterpretq_s64_m128i(x)
static __m128 _mm_cvtpi16_ps(__m64 a)
#define _MM_FLUSH_ZERO_OFF
static __m128 _mm_setzero_ps(void)
static __m128i _mm_max_epu16(__m128i a, __m128i b)
static __m128i _mm_cmpeq_epi32(__m128i, __m128i)
static __m128 _mm_cmpunord_ps(__m128 a, __m128 b)