19 #include <emmintrin.h>
20 #if HWY_TARGET == HWY_SSSE3
21 #include <tmmintrin.h>
23 #include <smmintrin.h>
24 #include <wmmintrin.h>
37 #ifndef HWY_LOADDUP_ASM
38 #define HWY_LOADDUP_ASM 0
46 using Full128 = Simd<T, 16 /
sizeof(T)>;
65 template <
typename T,
size_t N = 16 /
sizeof(T)>
73 return *
this = (*
this * other);
76 return *
this = (*
this / other);
79 return *
this = (*
this + other);
82 return *
this = (*
this - other);
85 return *
this = (*
this & other);
88 return *
this = (*
this | other);
91 return *
this = (*
this ^ other);
100 template <
typename T>
103 #if HWY_TARGET <= HWY_AVX3
108 template <
size_t size>
129 template <
typename T,
size_t N>
143 template <
typename T,
size_t N>
156 template <
typename T,
size_t N>
160 template <
typename T>
162 return Simd<T, 32 /
sizeof(T)>();
164 template <
typename T>
166 return Simd<T, 64 /
sizeof(T)>();
192 template <
typename T,
size_t N>
198 template <
typename T>
199 struct BitCastFromInteger128 {
203 struct BitCastFromInteger128<float> {
211 template <
typename T,
size_t N>
213 Vec128<uint8_t, N *
sizeof(T)> v) {
219 template <
typename T,
size_t N,
typename FromT>
221 Vec128<FromT, N *
sizeof(T) /
sizeof(FromT)> v) {
228 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
230 return Vec128<T, N>{_mm_setzero_si128()};
232 template <
size_t N, HWY_IF_LE128(
float, N)>
234 return Vec128<float, N>{_mm_setzero_ps()};
236 template <
size_t N, HWY_IF_LE128(
double, N)>
247 template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
248 HWY_API Vec128<uint8_t, N>
Set(Simd<uint8_t, N> ,
const uint8_t t) {
249 return Vec128<uint8_t, N>{_mm_set1_epi8(
static_cast<char>(t))};
251 template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
252 HWY_API Vec128<uint16_t, N>
Set(Simd<uint16_t, N> ,
const uint16_t t) {
253 return Vec128<uint16_t, N>{_mm_set1_epi16(
static_cast<short>(t))};
255 template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
256 HWY_API Vec128<uint32_t, N>
Set(Simd<uint32_t, N> ,
const uint32_t t) {
257 return Vec128<uint32_t, N>{_mm_set1_epi32(
static_cast<int>(t))};
259 template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
260 HWY_API Vec128<uint64_t, N>
Set(Simd<uint64_t, N> ,
const uint64_t t) {
261 return Vec128<uint64_t, N>{
262 _mm_set1_epi64x(
static_cast<long long>(t))};
264 template <
size_t N, HWY_IF_LE128(
int8_t, N)>
265 HWY_API Vec128<int8_t, N>
Set(Simd<int8_t, N> ,
const int8_t t) {
266 return Vec128<int8_t, N>{_mm_set1_epi8(
static_cast<char>(t))};
268 template <
size_t N, HWY_IF_LE128(
int16_t, N)>
269 HWY_API Vec128<int16_t, N>
Set(Simd<int16_t, N> ,
const int16_t t) {
270 return Vec128<int16_t, N>{_mm_set1_epi16(
static_cast<short>(t))};
272 template <
size_t N, HWY_IF_LE128(
int32_t, N)>
273 HWY_API Vec128<int32_t, N>
Set(Simd<int32_t, N> ,
const int32_t t) {
274 return Vec128<int32_t, N>{_mm_set1_epi32(t)};
276 template <
size_t N, HWY_IF_LE128(
int64_t, N)>
277 HWY_API Vec128<int64_t, N>
Set(Simd<int64_t, N> ,
const int64_t t) {
278 return Vec128<int64_t, N>{
279 _mm_set1_epi64x(
static_cast<long long>(t))};
281 template <
size_t N, HWY_IF_LE128(
float, N)>
282 HWY_API Vec128<float, N>
Set(Simd<float, N> ,
const float t) {
283 return Vec128<float, N>{_mm_set1_ps(t)};
285 template <
size_t N, HWY_IF_LE128(
double, N)>
294 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
298 return Vec128<T, N>{_mm_undefined_si128()};
300 template <
size_t N, HWY_IF_LE128(
float, N)>
304 template <
size_t N, HWY_IF_LE128(
double, N)>
316 return static_cast<uint8_t
>(_mm_cvtsi128_si32(v.raw) & 0xFF);
320 return static_cast<int8_t
>(_mm_cvtsi128_si32(v.raw) & 0xFF);
324 return static_cast<uint16_t
>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
328 return static_cast<int16_t
>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
332 return static_cast<uint32_t
>(_mm_cvtsi128_si32(v.raw));
336 return _mm_cvtsi128_si32(v.raw);
340 return _mm_cvtss_f32(v.raw);
345 alignas(16) uint64_t lanes[2];
346 Store(v, Simd<uint64_t, N>(), lanes);
349 return static_cast<uint64_t
>(_mm_cvtsi128_si64(v.raw));
355 alignas(16) int64_t lanes[2];
356 Store(v, Simd<int64_t, N>(), lanes);
359 return _mm_cvtsi128_si64(v.raw);
364 return _mm_cvtsd_f64(v.
raw);
371 template <
typename T,
size_t N>
372 HWY_API Vec128<T, N>
And(Vec128<T, N> a, Vec128<T, N> b) {
373 return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
389 template <
typename T,
size_t N>
390 HWY_API Vec128<T, N>
AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
391 return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
406 template <
typename T,
size_t N>
407 HWY_API Vec128<T, N>
Or(Vec128<T, N> a, Vec128<T, N> b) {
408 return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
424 template <
typename T,
size_t N>
425 HWY_API Vec128<T, N>
Xor(Vec128<T, N> a, Vec128<T, N> b) {
426 return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
442 template <
typename T,
size_t N>
443 HWY_API Vec128<T, N>
Not(
const Vec128<T, N> v) {
445 #if HWY_TARGET <= HWY_AVX3
446 const __m128i vu =
BitCast(Simd<TU, N>(), v).raw;
448 Vec128<TU, N>{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
450 return Xor(v,
BitCast(Simd<T, N>(), Vec128<TU, N>{_mm_set1_epi32(-1)}));
456 template <
typename T,
size_t N>
457 HWY_API Vec128<T, N>
operator&(
const Vec128<T, N> a,
const Vec128<T, N> b) {
461 template <
typename T,
size_t N>
462 HWY_API Vec128<T, N>
operator|(
const Vec128<T, N> a,
const Vec128<T, N> b) {
466 template <
typename T,
size_t N>
467 HWY_API Vec128<T, N>
operator^(
const Vec128<T, N> a,
const Vec128<T, N> b) {
474 #if HWY_TARGET == HWY_AVX3_DL
476 #ifdef HWY_NATIVE_POPCNT
477 #undef HWY_NATIVE_POPCNT
479 #define HWY_NATIVE_POPCNT
484 template <
typename T,
size_t N>
489 template <
typename T,
size_t N>
494 template <
typename T,
size_t N>
499 template <
typename T,
size_t N>
507 template <
typename T,
size_t N>
518 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
519 HWY_API Vec128<T, N>
Neg(
const Vec128<T, N> v) {
523 template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
524 HWY_API Vec128<T, N>
Neg(
const Vec128<T, N> v) {
525 return Zero(Simd<T, N>()) - v;
532 HWY_API Vec128<int8_t, N>
Abs(
const Vec128<int8_t, N> v) {
533 #if HWY_COMPILER_MSVC
535 const auto zero =
Zero(Simd<int8_t, N>());
536 return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
538 return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
542 HWY_API Vec128<int16_t, N>
Abs(
const Vec128<int16_t, N> v) {
543 return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
546 HWY_API Vec128<int32_t, N>
Abs(
const Vec128<int32_t, N> v) {
547 return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
551 HWY_API Vec128<float, N>
Abs(
const Vec128<float, N> v) {
552 const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
553 return v &
BitCast(Simd<float, N>(), mask);
563 template <
typename T,
size_t N>
565 const Vec128<T, N> sign) {
566 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
571 #if HWY_TARGET <= HWY_AVX3
572 const Rebind<MakeUnsigned<T>, decltype(d)> du;
583 const __m128i out = _mm_ternarylogic_epi32(
591 template <
typename T,
size_t N>
593 const Vec128<T, N> sign) {
594 #if HWY_TARGET <= HWY_AVX3
604 #if HWY_TARGET <= HWY_AVX3
608 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
623 template <
typename T,
size_t N>
629 template <
typename T,
size_t N>
635 template <
typename T,
size_t N>
641 template <
typename T,
size_t N>
650 template <
typename T,
size_t N>
671 template <
typename T,
size_t N>
676 template <
typename T,
size_t N>
681 template <
typename T,
size_t N>
686 template <
typename T,
size_t N>
694 template <
typename T,
size_t N>
713 template <
typename T,
size_t N>
719 template <
typename T,
size_t N>
724 template <
typename T,
size_t N>
729 template <
typename T,
size_t N>
737 template <
typename T,
size_t N>
757 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) && \
758 (HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
759 HWY_COMPILER_CLANG >= 800)
760 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
762 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
767 template <
typename T,
size_t N>
770 #if HWY_COMPILER_HAS_MASK_INTRINSICS
776 template <
typename T,
size_t N>
779 #if HWY_COMPILER_HAS_MASK_INTRINSICS
785 template <
typename T,
size_t N>
788 #if HWY_COMPILER_HAS_MASK_INTRINSICS
794 template <
typename T,
size_t N>
797 #if HWY_COMPILER_HAS_MASK_INTRINSICS
804 template <
typename T,
size_t N>
807 #if HWY_COMPILER_HAS_MASK_INTRINSICS
813 template <
typename T,
size_t N>
816 #if HWY_COMPILER_HAS_MASK_INTRINSICS
822 template <
typename T,
size_t N>
825 #if HWY_COMPILER_HAS_MASK_INTRINSICS
831 template <
typename T,
size_t N>
834 #if HWY_COMPILER_HAS_MASK_INTRINSICS
841 template <
typename T,
size_t N>
844 #if HWY_COMPILER_HAS_MASK_INTRINSICS
850 template <
typename T,
size_t N>
853 #if HWY_COMPILER_HAS_MASK_INTRINSICS
859 template <
typename T,
size_t N>
862 #if HWY_COMPILER_HAS_MASK_INTRINSICS
868 template <
typename T,
size_t N>
871 #if HWY_COMPILER_HAS_MASK_INTRINSICS
878 template <
typename T,
size_t N>
881 #if HWY_COMPILER_HAS_MASK_INTRINSICS
887 template <
typename T,
size_t N>
890 #if HWY_COMPILER_HAS_MASK_INTRINSICS
896 template <
typename T,
size_t N>
899 #if HWY_COMPILER_HAS_MASK_INTRINSICS
905 template <
typename T,
size_t N>
908 #if HWY_COMPILER_HAS_MASK_INTRINSICS
917 template <
typename T,
size_t N>
918 HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
922 template <
typename T,
size_t N>
923 HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
927 template <
typename T,
size_t N>
928 HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
932 template <
typename T,
size_t N>
933 HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
937 template <
typename T,
size_t N>
938 HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
948 template <
typename T,
size_t N>
950 return Mask128<T, N>{v.raw};
953 template <
typename T,
size_t N>
955 return Vec128<T, N>{v.raw};
958 template <
typename T,
size_t N>
960 const Mask128<T, N> v) {
961 return Vec128<T, N>{v.raw};
964 #if HWY_TARGET == HWY_SSSE3
967 template <
typename T,
size_t N>
970 const auto vmask =
VecFromMask(Simd<T, N>(), mask);
977 template <
typename T,
size_t N>
980 return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
984 const Vec128<float, N> yes,
985 const Vec128<float, N> no) {
986 return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
990 const Vec128<double, N> yes,
991 const Vec128<double, N> no) {
992 return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
998 template <
typename T,
size_t N>
1004 template <
typename T,
size_t N>
1011 template <
typename T,
size_t N>
1012 HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1016 template <
typename T,
size_t N>
1017 HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
1022 template <
typename T,
size_t N>
1023 HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
1028 template <
typename T,
size_t N>
1029 HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
1034 template <
typename T,
size_t N>
1035 HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
1054 static_assert(N == 2 || N == 4,
"Does not make sense for N=1");
1059 static_assert(N == 2 || N == 4,
"Does not make sense for N=1");
1064 static_assert(N == 2 || N == 4,
"Does not make sense for N=1");
1070 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1073 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1076 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
1090 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1093 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1096 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
1100 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1103 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1106 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
1111 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1114 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1117 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
1122 #if HWY_TARGET <= HWY_AVX3
1126 template <
typename TFrom,
size_t NFrom,
typename TTo,
size_t NTo>
1129 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1135 template <
typename T,
size_t N>
1140 template <
typename T,
size_t N>
1145 template <
typename T,
size_t N>
1150 template <
typename T,
size_t N>
1158 template <
typename T,
size_t N>
1159 HWY_API Mask128<T, N>
TestBit(
const Vec128<T, N> v,
const Vec128<T, N> bit) {
1160 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1166 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1171 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1173 return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1176 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1178 return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1181 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1183 return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1188 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1199 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1204 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1206 return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1209 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1211 return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1214 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1216 return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1221 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1234 HWY_API Mask128<int8_t, N>
operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1235 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1239 Vec128<int16_t, N> b) {
1240 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1244 Vec128<int32_t, N> b) {
1245 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1249 Vec128<int64_t, N> b) {
1250 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1253 HWY_API Mask128<float, N>
operator>(Vec128<float, N> a, Vec128<float, N> b) {
1254 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1265 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1277 template <
typename T,
size_t N>
1282 template <
typename T,
size_t N>
1287 template <
typename T,
size_t N>
1292 template <
typename T,
size_t N>
1300 template <
typename T,
size_t N>
1314 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1319 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1321 return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1324 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1326 return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1329 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1331 return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1344 template <
typename T,
size_t N>
1353 template <
typename TFrom,
typename TTo,
size_t N>
1355 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1356 const Simd<TFrom, N> d;
1360 template <
typename T,
size_t N>
1361 HWY_API Mask128<T, N>
TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1362 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1363 return (v & bit) == bit;
1371 const Vec128<uint8_t, N> b) {
1372 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1376 const Vec128<uint16_t, N> b) {
1377 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1381 const Vec128<uint32_t, N> b) {
1382 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1386 const Vec128<uint64_t, N> b) {
1387 #if HWY_TARGET == HWY_SSSE3
1388 const Simd<uint32_t, N * 2> d32;
1389 const Simd<uint64_t, N> d64;
1394 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1401 const Vec128<int8_t, N> b) {
1402 return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1406 Vec128<int16_t, N> b) {
1407 return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1411 const Vec128<int32_t, N> b) {
1412 return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1416 const Vec128<int64_t, N> b) {
1418 const Simd<uint64_t, N> du;
1425 const Vec128<float, N> b) {
1426 return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1430 const Vec128<double, N> b) {
1431 return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1436 template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
1443 const Vec128<float, N> b) {
1444 return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1448 const Vec128<double, N> b) {
1449 return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1456 HWY_API Mask128<int8_t, N>
operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1457 return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1461 Vec128<int16_t, N> b) {
1462 return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1466 Vec128<int32_t, N> b) {
1467 return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1470 HWY_API Mask128<float, N>
operator>(Vec128<float, N> a, Vec128<float, N> b) {
1471 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1474 HWY_API Mask128<double, N>
operator>(Vec128<double, N> a, Vec128<double, N> b) {
1475 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1480 const Vec128<int64_t, N> b) {
1481 #if HWY_TARGET == HWY_SSSE3
1483 const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);
1486 const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
1487 const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
1488 const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);
1490 const __m128i gt = _mm_or_si128(lo_gt, m_gt);
1492 return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
1494 return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};
1503 const Vec128<float, N> b) {
1504 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1508 const Vec128<double, N> b) {
1509 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1514 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1515 HWY_API Mask128<T, N>
FirstN(
const Simd<T, N> d,
size_t num) {
1527 template <
typename T,
size_t N>
1532 template <
typename T,
size_t N>
1545 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1546 #if defined(__clang_analyzer__) || \
1547 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1548 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
1550 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
1556 template <
typename T>
1558 return Vec128<T>{_mm_load_si128(
reinterpret_cast<const __m128i*
>(aligned))};
1569 template <
typename T>
1571 return Vec128<T>{_mm_loadu_si128(
reinterpret_cast<const __m128i*
>(p))};
1575 return Vec128<float>{_mm_loadu_ps(p)};
1582 template <
typename T>
1585 #if HWY_SAFE_PARTIAL_LOAD_STORE
1586 __m128i v = _mm_setzero_si128();
1587 CopyBytes<8>(p, &v);
1588 return Vec128<T, 8 /
sizeof(T)>{v};
1590 return Vec128<T, 8 /
sizeof(T)>{
1591 _mm_loadl_epi64(
reinterpret_cast<const __m128i*
>(p))};
1597 #if HWY_SAFE_PARTIAL_LOAD_STORE
1598 __m128 v = _mm_setzero_ps();
1599 CopyBytes<8>(p, &v);
1602 const __m128 hi = _mm_setzero_ps();
1603 return Vec128<float, 2>{_mm_loadl_pi(hi,
reinterpret_cast<const __m64*
>(p))};
1609 #if HWY_SAFE_PARTIAL_LOAD_STORE
1610 __m128d v = _mm_setzero_pd();
1611 CopyBytes<8>(p, &v);
1620 #if HWY_SAFE_PARTIAL_LOAD_STORE
1621 __m128 v = _mm_setzero_ps();
1622 CopyBytes<4>(p, &v);
1630 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
1632 constexpr
size_t kSize =
sizeof(T) * N;
1633 #if HWY_SAFE_PARTIAL_LOAD_STORE
1634 __m128 v = _mm_setzero_ps();
1635 CopyBytes<kSize>(p, &v);
1636 return Vec128<T, N>{v};
1639 CopyBytes<kSize>(p, &bits);
1640 return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
1645 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1651 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1658 #if HWY_TARGET <= HWY_AVX3
1660 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
1666 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
1669 return Vec128<T, N>{_mm_maskz_load_epi64(m.raw, aligned)};
1672 template <
size_t N, HWY_IF_LE128(
float, N)>
1679 template <
size_t N, HWY_IF_LE128(
double, N)>
1687 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE128(T, N)>
1690 return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, aligned)};
1693 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE128(T, N)>
1696 return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, aligned)};
1702 template <
class M,
class D>
1711 template <
typename T>
1713 _mm_store_si128(
reinterpret_cast<__m128i*
>(aligned), v.raw);
1717 _mm_store_ps(aligned, v.
raw);
1721 _mm_store_pd(aligned, v.
raw);
1724 template <
typename T>
1726 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(p), v.
raw);
1730 _mm_storeu_ps(p, v.raw);
1734 _mm_storeu_pd(p, v.
raw);
1737 template <
typename T>
1740 #if HWY_SAFE_PARTIAL_LOAD_STORE
1741 CopyBytes<8>(&v, p);
1743 _mm_storel_epi64(
reinterpret_cast<__m128i*
>(p), v.raw);
1748 #if HWY_SAFE_PARTIAL_LOAD_STORE
1749 CopyBytes<8>(&v, p);
1751 _mm_storel_pi(
reinterpret_cast<__m64*
>(p), v.
raw);
1756 #if HWY_SAFE_PARTIAL_LOAD_STORE
1757 CopyBytes<8>(&v, p);
1759 _mm_storel_pd(p, v.
raw);
1764 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
1766 CopyBytes<sizeof(T) * N>(&v, p);
1768 HWY_API void Store(
const Vec128<float, 1> v, Simd<float, 1> ,
1770 #if HWY_SAFE_PARTIAL_LOAD_STORE
1771 CopyBytes<4>(&v, p);
1773 _mm_store_ss(p, v.raw);
1778 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1790 const Vec128<uint8_t, N> b) {
1791 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
1795 const Vec128<uint16_t, N> b) {
1796 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
1800 const Vec128<uint32_t, N> b) {
1801 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
1812 const Vec128<int8_t, N> b) {
1813 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
1817 const Vec128<int16_t, N> b) {
1818 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
1822 const Vec128<int32_t, N> b) {
1823 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
1834 const Vec128<float, N> b) {
1835 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
1848 const Vec128<uint8_t, N> b) {
1849 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
1853 Vec128<uint16_t, N> b) {
1854 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
1858 const Vec128<uint32_t, N> b) {
1859 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
1870 const Vec128<int8_t, N> b) {
1871 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
1875 const Vec128<int16_t, N> b) {
1876 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
1880 const Vec128<int32_t, N> b) {
1881 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
1892 const Vec128<float, N> b) {
1893 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
1908 const Vec128<uint8_t, N> b) {
1909 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
1913 const Vec128<uint16_t, N> b) {
1914 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
1920 const Vec128<int8_t, N> b) {
1921 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
1925 const Vec128<int16_t, N> b) {
1926 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
1936 const Vec128<uint8_t, N> b) {
1937 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
1941 const Vec128<uint16_t, N> b) {
1942 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
1948 const Vec128<int8_t, N> b) {
1949 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
1953 const Vec128<int16_t, N> b) {
1954 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
1964 const Vec128<uint8_t, N> b) {
1965 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
1969 const Vec128<uint16_t, N> b) {
1970 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
1977 const Vec128<uint16_t, N> b) {
1978 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
1982 const Vec128<int16_t, N> b) {
1983 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
1989 const Vec128<uint16_t, N> b) {
1990 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
1994 const Vec128<int16_t, N> b) {
1995 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2001 HWY_API Vec128<uint64_t, (N + 1) / 2>
MulEven(
const Vec128<uint32_t, N> a,
2002 const Vec128<uint32_t, N> b) {
2003 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2006 #if HWY_TARGET == HWY_SSSE3
2008 template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2014 const Vec128<int32_t> b) {
2015 alignas(16) int32_t a_lanes[4];
2016 alignas(16) int32_t b_lanes[4];
2017 const Full128<int32_t> di32;
2018 Store(a, di32, a_lanes);
2019 Store(b, di32, b_lanes);
2020 alignas(16) int64_t mul[2];
2021 mul[0] = int64_t(a_lanes[0]) * b_lanes[0];
2022 mul[1] = int64_t(a_lanes[2]) * b_lanes[2];
2023 return Load(Full128<int64_t>(), mul);
2029 HWY_API Vec128<int64_t, (N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
2030 const Vec128<int32_t, N> b) {
2031 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2038 const Vec128<uint32_t, N> b) {
2039 #if HWY_TARGET == HWY_SSSE3
2043 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2044 const auto mullo_x2x0 =
MulEven(a, b);
2045 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2046 const auto mullo_x3x1 =
2047 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2050 const __m128i mul_20 =
2051 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2052 const __m128i mul_31 =
2053 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2054 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2056 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2062 const Vec128<int32_t, N> b) {
2064 const Simd<uint32_t, N> du;
2070 template <
int kBits,
size_t N>
2072 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2075 template <
int kBits,
size_t N>
2077 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2080 template <
int kBits,
size_t N>
2085 template <
int kBits,
size_t N>
2087 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2089 template <
int kBits,
size_t N>
2091 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2093 template <
int kBits,
size_t N>
2098 template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2100 const Simd<T, N> d8;
2102 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<
MakeWide<T>>{v.raw}).raw};
2105 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
2110 template <
int kBits,
size_t N>
2112 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
2114 template <
int kBits,
size_t N>
2116 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
2118 template <
int kBits,
size_t N>
2123 template <
int kBits,
size_t N>
2125 const Simd<uint8_t, N> d8;
2127 const Vec128<uint8_t, N> shifted{
2128 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
2129 return shifted &
Set(d8, 0xFF >> kBits);
2132 template <
int kBits,
size_t N>
2134 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
2136 template <
int kBits,
size_t N>
2138 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
2141 template <
int kBits,
size_t N>
2143 const Simd<int8_t, N> di;
2144 const Simd<uint8_t, N> du;
2145 const auto shifted =
BitCast(di, ShiftRight<kBits>(
BitCast(du, v)));
2146 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
2147 return (shifted ^ shifted_sign) - shifted_sign;
2161 return ShiftRight<15>(v);
2166 return ShiftRight<31>(v);
2171 #if HWY_TARGET <= HWY_AVX3
2173 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2179 const auto sign = ShiftRight<31>(
BitCast(d32, v));
2181 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2186 HWY_API Vec128<int64_t, N>
Abs(
const Vec128<int64_t, N> v) {
2187 #if HWY_TARGET <= HWY_AVX3
2188 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2190 const auto zero =
Zero(Simd<int64_t, N>());
2195 template <
int kBits,
size_t N>
2197 #if HWY_TARGET <= HWY_AVX3
2202 const auto right =
BitCast(di, ShiftRight<kBits>(
BitCast(du, v)));
2204 return right | sign;
2209 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2212 #if HWY_TARGET == HWY_SSSE3
2226 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2231 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2242 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2248 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2257 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2259 const Simd<T, N> d8;
2261 const Vec128<T, N> shifted{
2263 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
2271 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2276 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2287 const Simd<uint8_t, N> d8;
2289 const Vec128<uint8_t, N> shifted{
2291 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >> bits));
2297 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2303 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2308 #if HWY_TARGET <= HWY_AVX3
2315 return right | sign;
2321 const Simd<int8_t, N> di;
2322 const Simd<uint8_t, N> du;
2324 const auto shifted_sign =
2325 BitCast(di,
Set(du,
static_cast<uint8_t
>(0x80 >> bits)));
2326 return (shifted ^ shifted_sign) - shifted_sign;
2332 HWY_API Vec128<float, N>
operator*(Vec128<float, N> a, Vec128<float, N> b) {
2333 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2351 const Vec128<float, N> b) {
2352 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2371 return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2380 const Vec128<float, N> b) {
2389 const Vec128<float, N> x,
2390 const Vec128<float, N> add) {
2391 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2392 return mul * x + add;
2394 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2401 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2402 return mul * x + add;
2411 const Vec128<float, N> x,
2412 const Vec128<float, N> add) {
2413 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2414 return add - mul * x;
2416 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2423 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2424 return add - mul * x;
2433 const Vec128<float, N> x,
2434 const Vec128<float, N> sub) {
2435 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2436 return mul * x - sub;
2438 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2445 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2446 return mul * x - sub;
2455 const Vec128<float, N> x,
2456 const Vec128<float, N> sub) {
2457 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2458 return Neg(mul) * x - sub;
2460 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2467 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2468 return Neg(mul) * x - sub;
2478 HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N> v) {
2479 return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
2495 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
2505 template <
typename T,
size_t N>
2510 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
2519 HWY_API Vec128<uint8_t, N>
Min(
const Vec128<uint8_t, N> a,
2520 const Vec128<uint8_t, N> b) {
2521 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
2524 HWY_API Vec128<uint16_t, N>
Min(
const Vec128<uint16_t, N> a,
2525 const Vec128<uint16_t, N> b) {
2526 #if HWY_TARGET == HWY_SSSE3
2529 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
2533 HWY_API Vec128<uint32_t, N>
Min(
const Vec128<uint32_t, N> a,
2534 const Vec128<uint32_t, N> b) {
2535 #if HWY_TARGET == HWY_SSSE3
2538 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
2542 HWY_API Vec128<uint64_t, N>
Min(
const Vec128<uint64_t, N> a,
2543 const Vec128<uint64_t, N> b) {
2544 #if HWY_TARGET <= HWY_AVX3
2545 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
2553 HWY_API Vec128<int8_t, N>
Min(
const Vec128<int8_t, N> a,
2554 const Vec128<int8_t, N> b) {
2555 #if HWY_TARGET == HWY_SSSE3
2558 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
2562 HWY_API Vec128<int16_t, N>
Min(
const Vec128<int16_t, N> a,
2563 const Vec128<int16_t, N> b) {
2564 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
2567 HWY_API Vec128<int32_t, N>
Min(
const Vec128<int32_t, N> a,
2568 const Vec128<int32_t, N> b) {
2569 #if HWY_TARGET == HWY_SSSE3
2572 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
2576 HWY_API Vec128<int64_t, N>
Min(
const Vec128<int64_t, N> a,
2577 const Vec128<int64_t, N> b) {
2578 #if HWY_TARGET <= HWY_AVX3
2579 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
2587 HWY_API Vec128<float, N>
Min(
const Vec128<float, N> a,
2588 const Vec128<float, N> b) {
2589 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
2600 template <
typename T,
size_t N>
2605 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
2614 HWY_API Vec128<uint8_t, N>
Max(
const Vec128<uint8_t, N> a,
2615 const Vec128<uint8_t, N> b) {
2616 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
2619 HWY_API Vec128<uint16_t, N>
Max(
const Vec128<uint16_t, N> a,
2620 const Vec128<uint16_t, N> b) {
2621 #if HWY_TARGET == HWY_SSSE3
2624 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
2628 HWY_API Vec128<uint32_t, N>
Max(
const Vec128<uint32_t, N> a,
2629 const Vec128<uint32_t, N> b) {
2630 #if HWY_TARGET == HWY_SSSE3
2633 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
2637 HWY_API Vec128<uint64_t, N>
Max(
const Vec128<uint64_t, N> a,
2638 const Vec128<uint64_t, N> b) {
2639 #if HWY_TARGET <= HWY_AVX3
2640 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
2648 HWY_API Vec128<int8_t, N>
Max(
const Vec128<int8_t, N> a,
2649 const Vec128<int8_t, N> b) {
2650 #if HWY_TARGET == HWY_SSSE3
2653 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
2657 HWY_API Vec128<int16_t, N>
Max(
const Vec128<int16_t, N> a,
2658 const Vec128<int16_t, N> b) {
2659 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
2662 HWY_API Vec128<int32_t, N>
Max(
const Vec128<int32_t, N> a,
2663 const Vec128<int32_t, N> b) {
2664 #if HWY_TARGET == HWY_SSSE3
2667 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
2671 HWY_API Vec128<int64_t, N>
Max(
const Vec128<int64_t, N> a,
2672 const Vec128<int64_t, N> b) {
2673 #if HWY_TARGET <= HWY_AVX3
2674 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
2682 HWY_API Vec128<float, N>
Max(
const Vec128<float, N> a,
2683 const Vec128<float, N> b) {
2684 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
2698 template <
typename T,
size_t N>
2701 _mm_stream_si128(
reinterpret_cast<__m128i*
>(aligned), v.raw);
2706 _mm_stream_ps(aligned, v.
raw);
2711 _mm_stream_pd(aligned, v.
raw);
2722 static_assert(sizeof(
GatherIndex64) == 8, "Must be 64-bit type");
2724 #if HWY_TARGET <= HWY_AVX3
2727 template <
typename T,
size_t N>
2732 _mm_i32scatter_epi32(base, offset.
raw, v.
raw, 1);
2734 const __mmask8 mask = (1u << N) - 1;
2735 _mm_mask_i32scatter_epi32(base, mask, offset.
raw, v.
raw, 1);
2738 template <
typename T,
size_t N>
2743 _mm_i32scatter_epi32(base, index.
raw, v.
raw, 4);
2745 const __mmask8 mask = (1u << N) - 1;
2746 _mm_mask_i32scatter_epi32(base, mask, index.
raw, v.
raw, 4);
2750 template <
typename T,
size_t N>
2755 _mm_i64scatter_epi64(base, offset.
raw, v.
raw, 1);
2757 const __mmask8 mask = (1u << N) - 1;
2758 _mm_mask_i64scatter_epi64(base, mask, offset.
raw, v.
raw, 1);
2761 template <
typename T,
size_t N>
2766 _mm_i64scatter_epi64(base, index.
raw, v.
raw, 8);
2768 const __mmask8 mask = (1u << N) - 1;
2769 _mm_mask_i64scatter_epi64(base, mask, index.
raw, v.
raw, 8);
2775 template <
typename T,
size_t N,
typename Offset>
2778 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2781 template <
typename T,
size_t N,
typename Index>
2784 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2793 _mm_i32scatter_ps(base, offset.
raw, v.
raw, 1);
2795 const __mmask8 mask = (1u << N) - 1;
2796 _mm_mask_i32scatter_ps(base, mask, offset.
raw, v.
raw, 1);
2804 _mm_i32scatter_ps(base, index.
raw, v.
raw, 4);
2806 const __mmask8 mask = (1u << N) - 1;
2807 _mm_mask_i32scatter_ps(base, mask, index.
raw, v.
raw, 4);
2816 _mm_i64scatter_pd(base, offset.
raw, v.
raw, 1);
2818 const __mmask8 mask = (1u << N) - 1;
2819 _mm_mask_i64scatter_pd(base, mask, offset.
raw, v.
raw, 1);
2827 _mm_i64scatter_pd(base, index.
raw, v.
raw, 8);
2829 const __mmask8 mask = (1u << N) - 1;
2830 _mm_mask_i64scatter_pd(base, mask, index.
raw, v.
raw, 8);
2835 template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
2837 const Vec128<Offset, N> offset) {
2838 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2840 alignas(16) T lanes[N];
2843 alignas(16) Offset offset_lanes[N];
2844 Store(offset, Simd<Offset, N>(), offset_lanes);
2846 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
2847 for (
size_t i = 0; i < N; ++i) {
2848 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2852 template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
2854 const Vec128<Index, N> index) {
2855 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2857 alignas(16) T lanes[N];
2860 alignas(16) Index index_lanes[N];
2861 Store(index, Simd<Index, N>(), index_lanes);
2863 for (
size_t i = 0; i < N; ++i) {
2864 base[index_lanes[i]] = lanes[i];
2872 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2874 template <
typename T,
size_t N,
typename Offset>
2877 const Vec128<Offset, N> offset) {
2878 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2880 alignas(16) Offset offset_lanes[N];
2881 Store(offset, Simd<Offset, N>(), offset_lanes);
2883 alignas(16) T lanes[N];
2884 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
2885 for (
size_t i = 0; i < N; ++i) {
2886 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
2888 return Load(d, lanes);
2891 template <
typename T,
size_t N,
typename Index>
2893 const Vec128<Index, N> index) {
2894 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2896 alignas(16) Index index_lanes[N];
2897 Store(index, Simd<Index, N>(), index_lanes);
2899 alignas(16) T lanes[N];
2900 for (
size_t i = 0; i < N; ++i) {
2901 lanes[i] = base[index_lanes[i]];
2903 return Load(d, lanes);
2910 template <
typename T,
size_t N>
2914 const Vec128<int32_t, N> offset) {
2915 return Vec128<T, N>{_mm_i32gather_epi32(
2916 reinterpret_cast<const int32_t*
>(base), offset.raw, 1)};
2918 template <
typename T,
size_t N>
2922 const Vec128<int32_t, N> index) {
2923 return Vec128<T, N>{_mm_i32gather_epi32(
2924 reinterpret_cast<const int32_t*
>(base), index.raw, 4)};
2927 template <
typename T,
size_t N>
2931 const Vec128<int64_t, N> offset) {
2932 return Vec128<T, N>{_mm_i64gather_epi64(
2933 reinterpret_cast<const GatherIndex64*
>(base), offset.raw, 1)};
2935 template <
typename T,
size_t N>
2939 const Vec128<int64_t, N> index) {
2940 return Vec128<T, N>{_mm_i64gather_epi64(
2941 reinterpret_cast<const GatherIndex64*
>(base), index.raw, 8)};
2946 template <
typename T,
size_t N,
typename Offset>
2948 const Vec128<Offset, N> offset) {
2951 template <
typename T,
size_t N,
typename Index>
2953 const Vec128<Index, N> index) {
2960 const Vec128<int32_t, N> offset) {
2961 return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
2966 const Vec128<int32_t, N> index) {
2967 return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
2973 const Vec128<int64_t, N> offset) {
2974 return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
2979 const Vec128<int64_t, N> index) {
2980 return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
2992 template <
typename T,
size_t N>
2994 return Vec128<T, N / 2>{v.raw};
2997 template <
typename T,
size_t N>
3004 template <
int kBytes,
typename T,
size_t N>
3006 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3007 return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3010 template <
int kBytes,
typename T,
size_t N>
3012 return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
3017 template <
int kLanes,
typename T,
size_t N>
3023 template <
int kLanes,
typename T,
size_t N>
3025 return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
3029 template <
int kBytes,
typename T,
size_t N>
3031 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3033 if (N != 16 /
sizeof(T)) {
3034 const Vec128<T> vfull{v.raw};
3037 return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3041 template <
int kLanes,
typename T,
size_t N>
3050 template <
typename T>
3053 return Vec128<T, 8 /
sizeof(T)>{_mm_unpackhi_epi64(v.raw, v.raw)};
3056 return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3064 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3070 return Vec128<T, (N + 1) / 2>{upper.raw};
3075 template <
int kBytes,
typename T,
class V = Vec128<T>>
3078 return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3082 template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T, N),
3083 class V = Vec128<T, N>>
3085 constexpr
size_t kSize = N *
sizeof(T);
3086 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
3088 const Full128<uint8_t> d_full8;
3089 using V8 =
VFromD<decltype(d_full8)>;
3090 const V8 hi8{
BitCast(d8, hi).raw};
3094 return V{
BitCast(Full128<T>(), r).raw};
3100 template <
int kLane,
size_t N>
3102 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3104 const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3105 return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3107 const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3108 return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3111 template <
int kLane,
size_t N>
3113 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3114 return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3116 template <
int kLane,
size_t N>
3118 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3123 template <
int kLane,
size_t N>
3125 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3127 const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3128 return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3130 const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3131 return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3134 template <
int kLane,
size_t N>
3136 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3137 return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3139 template <
int kLane,
size_t N>
3141 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3146 template <
int kLane,
size_t N>
3148 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3149 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
3151 template <
int kLane,
size_t N>
3153 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3158 template <
typename T,
size_t N,
typename TI,
size_t NI>
3160 const Vec128<TI, NI> from) {
3161 return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
3166 template <
class V,
class VI>
3174 template <
typename T,
size_t N>
3179 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3181 #if HWY_IS_DEBUG_BUILD
3182 for (
size_t i = 0; i < N; ++i) {
3183 HWY_DASSERT(0 <= idx[i] && idx[i] <
static_cast<int32_t
>(N));
3188 alignas(16) uint8_t control[16] = {0};
3189 for (
size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
3190 for (
size_t idx_byte = 0; idx_byte <
sizeof(T); ++idx_byte) {
3191 control[idx_lane *
sizeof(T) + idx_byte] =
3192 static_cast<uint8_t
>(
size_t(idx[idx_lane]) *
sizeof(T) + idx_byte);
3195 return Indices128<T, N>{
Load(d8, control).raw};
3200 const Vec128<uint32_t, N> v,
const Indices128<uint32_t, N> idx) {
3205 const Indices128<int32_t, N> idx) {
3210 const Indices128<float, N> idx) {
3211 const Simd<int32_t, N> di;
3212 const Simd<float, N> df;
3219 template <
typename T>
3224 template <
typename T>
3225 HWY_API Vec128<T, 2>
Reverse(Simd<T, 2> ,
const Vec128<T, 2> v) {
3226 return Vec128<T, 2>{
Shuffle2301(Vec128<T>{v.raw}).raw};
3229 template <
typename T>
3230 HWY_API Vec128<T, 1>
Reverse(Simd<T, 1> ,
const Vec128<T, 1> v) {
3240 template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
3245 template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
3250 template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
3255 template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
3261 template <
size_t N, HWY_IF_LE128(
int8_t, N)>
3266 template <
size_t N, HWY_IF_LE128(
int16_t, N)>
3271 template <
size_t N, HWY_IF_LE128(
int32_t, N)>
3276 template <
size_t N, HWY_IF_LE128(
int64_t, N)>
3282 template <
size_t N, HWY_IF_LE128(
float, N)>
3284 const Vec128<float, N> b) {
3285 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3287 template <
size_t N, HWY_IF_LE128(
double, N)>
3294 template <
typename T,
size_t N, HWY_IF_LE128(T, N),
class V = Vec128<T, N>>
3339 const Vec128<float> b) {
3340 return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
3350 template <
typename T,
class V = Vec128<T>>
3356 template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
3358 const Half<decltype(d)> d2;
3366 template <
typename T,
size_t N,
class DW = RepartitionToW
ide<Simd<T, N>>>
3370 template <
typename T,
size_t N,
class D = Simd<T, N>,
3371 class DW = RepartitionToW
ide<D>>
3376 template <
typename T,
size_t N,
class D = Simd<T, N>,
3377 class DW = RepartitionToW
ide<D>>
3387 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3388 HWY_API Vec128<T, N>
Combine(Simd<T, N> d, Vec128<T, N / 2> hi_half,
3389 Vec128<T, N / 2> lo_half) {
3390 const Half<decltype(d)> d2;
3394 const VU lo{
BitCast(du2, lo_half).raw};
3395 const VU hi{
BitCast(du2, hi_half).raw};
3401 template <
typename T, HWY_IF_NOT_FLOAT(T)>
3403 Vec128<T, 8 /
sizeof(T)> lo) {
3404 return Vec128<T>{_mm_move_epi64(lo.raw)};
3407 template <
typename T, HWY_IF_FLOAT(T)>
3413 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3421 template <
typename T>
3428 template <
typename T>
3435 template <
typename T>
3437 const Vec128<T> lo) {
3438 return CombineShiftRightBytes<8>(d, hi, lo);
3442 template <
typename T>
3444 #if HWY_TARGET == HWY_SSSE3
3446 const __m128d concat = _mm_move_sd(
BitCast(dd, hi).raw,
BitCast(dd, lo).raw);
3466 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3469 const Half<decltype(d)> d2;
3473 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3476 const Half<decltype(d)> d2;
3480 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3482 const Vec128<T, N> lo) {
3483 const Half<decltype(d)> d2;
3487 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3490 const Half<decltype(d)> d2;
3497 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3501 d, Vec128<float>{_mm_shuffle_ps(
BitCast(df, lo).raw,
BitCast(df, hi).raw,
3502 _MM_SHUFFLE(3, 1, 3, 1))});
3507 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3511 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3512 HWY_API Vec128<T, 2>
ConcatOdd(Simd<T, 2> d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
3518 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3526 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3530 d, Vec128<float>{_mm_shuffle_ps(
BitCast(df, lo).raw,
BitCast(df, hi).raw,
3531 _MM_SHUFFLE(2, 0, 2, 0))});
3536 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3540 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3548 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3557 template <
typename T,
size_t N>
3559 const Vec128<T, N> b) {
3562 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3563 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3566 template <
typename T,
size_t N>
3568 const Vec128<T, N> b) {
3569 #if HWY_TARGET == HWY_SSSE3
3572 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
3573 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
3576 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
3579 template <
typename T,
size_t N>
3581 const Vec128<T, N> b) {
3582 #if HWY_TARGET == HWY_SSSE3
3583 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
3584 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
3585 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
3587 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
3590 template <
typename T,
size_t N>
3592 const Vec128<T, N> b) {
3593 #if HWY_TARGET == HWY_SSSE3
3594 const Full128<double> dd;
3595 const __m128d concat = _mm_move_sd(
BitCast(dd, a).raw,
BitCast(dd, b).raw);
3596 return BitCast(Full128<T>(), Vec128<double>{concat});
3598 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
3604 template <
typename T,
size_t N>
3605 HWY_API Vec128<T, N>
OddEven(
const Vec128<T, N> a,
const Vec128<T, N> b) {
3610 const Vec128<float, N> b) {
3611 #if HWY_TARGET == HWY_SSSE3
3614 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
3615 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
3616 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
3618 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
3634 #if HWY_TARGET > HWY_AVX3
3638 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3639 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(
const Vec128<T, N> v) {
3642 const Rebind<float, decltype(dw)> df;
3643 const auto zero =
Zero(d);
3646 const auto upper = exp +
Set(d, 0x3F80);
3648 const auto f0 =
ZipLower(dw, zero, upper);
3649 const auto f1 =
ZipUpper(dw, zero, upper);
3651 const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(
BitCast(df, f0).raw)};
3652 const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(
BitCast(df, f1).raw)};
3653 return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
3657 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3658 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(
const Vec128<T, N> v) {
3660 const auto exp = ShiftLeft<23>(v);
3661 const auto f = exp +
Set(d, 0x3F800000);
3665 return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
3674 #if HWY_TARGET <= HWY_AVX3
3677 return v * detail::Pow2(bits);
3688 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3689 return v * detail::Pow2(bits);
3700 const Vec128<uint64_t> bits) {
3701 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3703 const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
3704 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
3705 const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
3708 return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
3712 const Vec128<uint64_t, 1> bits) {
3713 return Vec128<uint64_t, 1>{_mm_sll_epi64(v.raw, bits.raw)};
3717 template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
3718 HWY_API Vec128<T, N>
operator<<(
const Vec128<T, N> v,
const Vec128<T, N> bits) {
3719 const Simd<T, N> di;
3720 const Simd<MakeUnsigned<T>, N> du;
3734 #if HWY_TARGET <= HWY_AVX3
3739 const auto out =
MulHigh(in, detail::Pow2(
Set(d, 16) - bits));
3752 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3759 const auto mul = detail::Pow2(
Set(d32, 32) - bits);
3760 const auto out20 = ShiftRight<32>(
MulEven(in, mul));
3777 const Vec128<uint64_t> bits) {
3778 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3780 const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
3781 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
3782 const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
3785 return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
3789 const Vec128<uint64_t, 1> bits) {
3790 return Vec128<uint64_t, 1>{_mm_srl_epi64(v.raw, bits.raw)};
3793 #if HWY_TARGET > HWY_AVX3
3797 template <
class DI,
class V>
3798 HWY_INLINE V SignedShr(
const DI di,
const V v,
const V count_i) {
3799 const RebindToUnsigned<DI> du;
3800 const auto count =
BitCast(du, count_i);
3804 const auto abs =
BitCast(du, v ^ sign);
3805 return BitCast(di, abs >> count) ^ sign;
3814 #if HWY_TARGET <= HWY_AVX3
3828 #if HWY_TARGET <= HWY_AVX3
3842 #if HWY_TARGET <= HWY_AVX3
3852 const Vec128<uint64_t> b) {
3853 alignas(16) uint64_t mul[2];
3855 return Load(Full128<uint64_t>(), mul);
3859 const Vec128<uint64_t> b) {
3860 alignas(16) uint64_t mul[2];
3861 const Half<Full128<uint64_t>> d2;
3864 return Load(Full128<uint64_t>(), mul);
3871 Vec128<bfloat16_t, 2 * N> a,
3872 Vec128<bfloat16_t, 2 * N> b,
3873 const Vec128<float, N> sum0,
3874 Vec128<float, N>& sum1) {
3878 const Vec128<uint16_t, 2 * N> zero =
Zero(du16);
3881 const Vec128<uint32_t, N> a0 =
ZipLower(du32, zero,
BitCast(du16, a));
3882 const Vec128<uint32_t, N> a1 =
ZipUpper(du32, zero,
BitCast(du16, a));
3883 const Vec128<uint32_t, N> b0 =
ZipLower(du32, zero,
BitCast(du16, b));
3884 const Vec128<uint32_t, N> b1 =
ZipUpper(du32, zero,
BitCast(du16, b));
3896 const Vec128<uint8_t, N> v) {
3897 #if HWY_TARGET == HWY_SSSE3
3898 const __m128i zero = _mm_setzero_si128();
3899 return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
3901 return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
3906 const Vec128<uint16_t, N> v) {
3907 #if HWY_TARGET == HWY_SSSE3
3908 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
3910 return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
3916 #if HWY_TARGET == HWY_SSSE3
3924 const Vec128<uint8_t, N> v) {
3925 #if HWY_TARGET == HWY_SSSE3
3926 const __m128i zero = _mm_setzero_si128();
3927 const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
3928 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
3930 return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
3937 const Vec128<uint8_t, N> v) {
3942 const Vec128<uint16_t, N> v) {
3947 const Vec128<uint8_t, N> v) {
3954 const Vec128<int8_t, N> v) {
3955 #if HWY_TARGET == HWY_SSSE3
3956 return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
3958 return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
3963 const Vec128<int16_t, N> v) {
3964 #if HWY_TARGET == HWY_SSSE3
3965 return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
3967 return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
3972 const Vec128<int32_t, N> v) {
3973 #if HWY_TARGET == HWY_SSSE3
3974 return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
3976 return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
3981 const Vec128<int8_t, N> v) {
3982 #if HWY_TARGET == HWY_SSSE3
3983 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
3984 const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
3985 return ShiftRight<24>(Vec128<int32_t, N>{x4});
3987 return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
3993 #if defined(MEMORY_SANITIZER) && \
3994 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
3995 #define HWY_INLINE_F16 HWY_NOINLINE
3997 #define HWY_INLINE_F16 HWY_INLINE
4002 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4007 const auto sign = ShiftRight<15>(bits16);
4008 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
4009 const auto mantissa = bits16 &
Set(du32, 0x3FF);
4010 const auto subnormal =
4012 Set(df32, 1.0f / 16384 / 1024));
4014 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
4015 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
4016 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4017 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
4018 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4027 const Vec128<bfloat16_t, N> v) {
4028 const Rebind<uint16_t, decltype(df32)> du16;
4041 const Vec128<int32_t, N> v) {
4042 return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
4049 const Vec128<int32_t, N> v) {
4050 #if HWY_TARGET == HWY_SSSE3
4051 const Simd<int32_t, N> di32;
4052 const Simd<uint16_t, N * 2> du16;
4053 const auto zero_if_neg =
AndNot(ShiftRight<31>(v), v);
4055 const auto clamped =
Or(zero_if_neg, too_big);
4057 alignas(16) constexpr uint16_t kLower2Bytes[16] = {
4058 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
4059 const auto lo2 =
Load(du16, kLower2Bytes);
4062 return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
4068 const Vec128<int32_t, N> v) {
4069 return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
4074 const Vec128<int32_t, N> v) {
4075 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4076 return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
4081 const Vec128<int16_t, N> v) {
4082 return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
4087 const Vec128<int32_t, N> v) {
4088 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4089 return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
4094 const Vec128<int16_t, N> v) {
4095 return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
4100 const Vec128<float, N> v) {
4101 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4103 const Rebind<uint32_t, decltype(df16)> du;
4105 const auto bits32 = BitCast(du, v);
4106 const auto sign = ShiftRight<31>(bits32);
4107 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
4108 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
4110 const auto k15 = Set(di, 15);
4111 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
4112 const auto is_tiny = exp < Set(di, -24);
4114 const auto is_subnormal = exp < Set(di, -14);
4115 const auto biased_exp16 =
4117 const auto sub_exp = BitCast(du, Set(di, -14) - exp);
4118 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
4119             (mantissa32 >> (Set(du, 13) + sub_exp));
4121 ShiftRight<13>(mantissa32));
4123 const auto sign16 = ShiftLeft<15>(sign);
4124 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
4129 return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
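// [Editor's note] Sketch (not in the original source): float -> float16
// demotion. The manual path above handles tiny, subnormal and normal values
// when F16C is disabled; otherwise _mm_cvtps_ph is used directly:
//   const Simd<float, 4> df;
//   const Simd<float16_t, 4> df16;
//   const auto h = DemoteTo(df16, Set(df, 1.5f));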
4135 const Vec128<float, N> v) {
4137 const Rebind<int32_t, decltype(dbf16)> di32;
4138 const Rebind<uint32_t, decltype(dbf16)> du32;
4139 const Rebind<uint16_t, decltype(dbf16)> du16;
4140 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4146 Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4149 const Repartition<uint32_t, decltype(dbf16)> du32;
4150 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4166 -> decltype(Zero(d)) {
4169 return Min(v, Set(d, 2147483647.0));
4175 template <typename TI, size_t N, class DF = Simd<MakeFloat<TI>, N>>
4177 decltype(Zero(DF())) original,
4178 decltype(Zero(di).raw) converted_raw)
4179 -> decltype(Zero(di)) {
4185 const auto converted = decltype(Zero(di)){converted_raw};
4186 const auto sign_wrong = AndNot(BitCast(di, original), converted);
4194 const Vec128<double, N> v) {
4196 return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
4202 const Simd<uint32_t, N> d32;
4203 const Simd<uint8_t, N * 4> d8;
4204 alignas(16) static constexpr uint32_t k8From32[4] = {
4205 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
4215 const Vec128<int32_t, N> v) {
4216 return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
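// [Editor's note] Usage sketch (editor addition): int32 <-> float conversion
// round trip using the ConvertTo overloads in this section:
//   const Simd<int32_t, 4> di;
//   const Simd<float, 4> df;
//   const auto f = ConvertTo(df, Iota(di, 0));  // exact for small integers
//   const auto i = ConvertTo(di, f);            // truncates toward zero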
4222 #if HWY_TARGET <= HWY_AVX3
4231 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4232 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4235 const auto k52 = Set(d32, 0x43300000);
4238 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4239 return (v_upper - k84_63_52) + v_lower;
4246 const Vec128<float, N> v) {
4252 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
4254 #elif HWY_ARCH_X86_64
4255 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
4257 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
4260 using VI = decltype(Zero(di));
4261 const VI k0 = Zero(di);
4262 const VI k1 = Set(di, 1);
4263 const VI k51 = Set(di, 51);
4266 const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4267 const VI exp = biased_exp - Set(di, 0x3FF);
4268 const auto in_range = exp < Set(di, 63);
4276 const VI shift_mnt = Max(k51 - exp, k0);
4277 const VI shift_int = Max(exp - k51, k0);
4278 const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4280 const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4282 const VI shifted = int52 << shift_int;
4284 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4288 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4289 const VI magnitude = IfThenElse(in_range, restored, limit);
4292 return (magnitude ^ sign_mask) - sign_mask;
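// [Editor's note] Sketch (not in the original source): the scalar and
// bit-twiddling paths above implement double -> int64 truncation when the
// AVX-512 instruction is not available; lanes whose exponent is out of range
// take a limit value before the sign is restored. Typical call:
//   const Simd<double, 2> dd;
//   const Simd<int64_t, 2> di;
//   const auto i = ConvertTo(di, Set(dd, 1.5e3));  // {1500, 1500}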
4298 #if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
4310 const Simd<int32_t, N> di;
4316 #if HWY_TARGET == HWY_SSSE3
4319 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4325 const auto max = Set(df, MantissaEnd<T>());
4327 const auto added = large + v;
4328 const auto rounded = added - large;
4338 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4346 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4352 const auto int_f = ConvertTo(df, integer);
4358 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4364 const auto int_f = ConvertTo(df, integer);
4373 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4379 const auto int_f = ConvertTo(df, integer);
4391 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
4392 return Vec128<float, N>{
4393     _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
4396 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
4397 return Vec128<double, N>{
4398     _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
4403 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
4404 return Vec128<float, N>{
4405     _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
4408 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
4409 return Vec128<double, N>{
4410     _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
4415 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
4416 return Vec128<float, N>{
4417     _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
4420 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
4421 return Vec128<double, N>{
4422     _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
4427 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
4428 return Vec128<float, N>{
4429     _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
4432 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
4433 return Vec128<double, N>{
4434     _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
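// [Editor's note] Usage sketch for the rounding ops above (editor addition):
//   const Simd<float, 4> df;
//   const auto v = Set(df, -1.5f);
//   Round(v);  // -2 (ties to even)
//   Trunc(v);  // -1
//   Ceil(v);   // -1
//   Floor(v);  // -2
// On SSSE3 the fallback earlier in this section adds and subtracts a large
// power of two so the FPU itself performs the rounding.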
4441 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
4444 #ifdef HWY_NATIVE_AES
4445 #undef HWY_NATIVE_AES
4447 #define HWY_NATIVE_AES
4451 Vec128<uint8_t> round_key) {
4452 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
4455 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4457 Vec128<uint64_t, N> b) {
4458 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
4461 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4463 Vec128<uint64_t, N> b) {
4464 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
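// [Editor's note] Sketch (not in the original source): one AES round and a
// carry-less multiply using the wrappers above (requires AES/PCLMUL support):
//   const Full128<uint8_t> d8;
//   const Full128<uint64_t> d64;
//   const auto state = AESRound(Set(d8, 1), Set(d8, 2));
//   const auto prod = CLMulLower(Set(d64, 3), Set(d64, 5));  // low 64x64 bits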
4472 template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
4475 for (size_t i = 0; i < 16 / sizeof(T); ++i) {
4476 lanes[i] = static_cast<T>(first + static_cast<T2>(i));
4478 return Load(d, lanes);
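// [Editor's note] Usage sketch (editor addition): Iota fills lanes with
// first, first + 1, ... and is handy for building index vectors:
//   const Full128<int32_t> d;
//   const auto idx = Iota(d, 0);  // {0, 1, 2, 3}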
4481 #if HWY_TARGET <= HWY_AVX3
4486 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4489 uint64_t mask_bits = 0;
4490 constexpr size_t kNumBytes = (N + 7) / 8;
4491 CopyBytes<kNumBytes>(bits, &mask_bits);
4493 mask_bits &= (1ull << N) - 1;
4502 template <typename T, size_t N>
4504 const Mask128<T, N> mask, uint8_t* bits) {
4505 constexpr size_t kNumBytes = (N + 7) / 8;
4506 CopyBytes<kNumBytes>(&mask.raw, bits);
4510 const int mask = (1 << N) - 1;
4511 bits[0] = static_cast<uint8_t>(bits[0] & mask);
4521 template <typename T, size_t N>
4523 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4527 template <typename T, size_t N>
4529 const Mask128<T, N> mask) {
4530 const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
4534 template <typename T, size_t N>
4536 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4537 return mask_bits == 0;
4540 template <typename T, size_t N>
4541 HWY_API bool AllTrue(const Simd<T, N> , const Mask128<T, N> mask) {
4542 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4544 return mask_bits == (1u << N) - 1;
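// [Editor's note] Sketch (not in the original source): round-tripping masks
// through packed bit arrays with the AVX-512 overloads above:
//   const Full128<int32_t> d;
//   const auto m = FirstN(d, 2);            // first two lanes active
//   uint8_t bits[1];
//   StoreMaskBits(d, m, bits);              // bits[0] == 0b0011
//   const auto m2 = LoadMaskBits(d, bits);  // same mask again
//   AllTrue(d, m2);                         // false: only 2 of 4 lanes set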
4549 #if HWY_TARGET != HWY_AVX3_DL
4553 HWY_INLINE Vec128<uint16_t, 8> IndicesForCompress16(uint64_t mask_bits) {
4554 Full128<uint16_t> du16;
4558 Rebind<uint8_t, decltype(du16)> du8;
4559 alignas(16) constexpr uint8_t tbl[2048] = {
4560 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
4561 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
4562 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
4563 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
4564 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
4565 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
4566 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4567 0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
4568 0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
4569 3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
4570 2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
4571 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
4572 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
4573 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
4574 0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
4575 0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
4576 1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
4577 2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
4578 5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
4579 4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
4580 5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
4581 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
4582 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
4583 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
4584 0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
4585 2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
4586 6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
4587 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
4588 6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
4589 0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
4590 0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
4591 0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
4592 2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
4593 1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
4594 5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
4595 5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
4596 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
4597 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
4598 0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
4599 0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
4600 0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
4601 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
4602 7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
4603 0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
4604 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
4605 0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
4606 0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
4607 0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
4608 1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
4609 3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
4610 4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
4611 3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
4612 0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
4613 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
4614 0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
4615 0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
4616 0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
4617 4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
4618 4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
4619 7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
4620 5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
4621 7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
4622 0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
4623 0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
4624 3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
4625 1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
4626 3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
4627 7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
4628 0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
4629 7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
4630 0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
4631 0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
4632 0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
4633 5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
4634 2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
4635 6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
4636 6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
4637 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
4638 0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
4639 0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
4640 1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
4641 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
4648 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4651 const Rebind<uint16_t, decltype(d)> du;
4652 const auto vu = BitCast(du, v);
4654 #if HWY_TARGET == HWY_AVX3_DL
4657 const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
4663 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4665 return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
4668 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4670 return Vec128<T, N>{_mm_maskz_compress_epi64(mask.raw, v.raw)};
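// [Editor's note] Usage sketch for Compress (editor addition): gathers the
// lanes whose mask bit is set into the low lanes, preserving their order:
//   const Full128<float> d;
//   const auto v = Iota(d, 0.0f);         // {0, 1, 2, 3}
//   const auto m = v > Set(d, 1.0f);      // lanes 2 and 3
//   const auto packed = Compress(v, m);   // {2, 3, ?, ?}
// The 16-bit overload uses the index table above unless AVX3_DL provides a
// native compress instruction.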
4686 template <typename T, size_t N>
4694 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4697 const Rebind<uint16_t, decltype(d)> du;
4698 const auto vu = BitCast(du, v);
4700 const uint64_t mask_bits{mask.raw};
4702 #if HWY_TARGET == HWY_AVX3_DL
4703 _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
4705 const auto idx = detail::IndicesForCompress16(mask_bits);
4709 return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4712 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4715 _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4716 return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4719 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4722 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4723 return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4726 template <size_t N, HWY_IF_LE128(float, N)>
4730 _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4731 return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4734 template <size_t N, HWY_IF_LE128(double, N)>
4738 _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4739 return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
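// [Editor's note] Sketch (not in the original source): CompressStore writes
// only the selected lanes and returns how many were written, the usual
// building block for stream filtering:
//   const Full128<int32_t> d;
//   int32_t out[4];
//   const auto v = Iota(d, 0);
//   const size_t n = CompressStore(v, FirstN(d, 3), d, out);  // n == 3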
4744 template <typename T, size_t N>
4757 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4762 const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
4765 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4766                                            1, 1, 1, 1, 1, 1, 1, 1};
4769 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4770                                           1, 2, 4, 8, 16, 32, 64, 128};
4774 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4777 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4778 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4782 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4785 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4786 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4790 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4793 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
4800 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4803 uint64_t mask_bits = 0;
4804 constexpr size_t kNumBytes = (N + 7) / 8;
4805 CopyBytes<kNumBytes>(bits, &mask_bits);
4807 mask_bits &= (1ull << N) - 1;
4817 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
4818 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
4821 template <typename T, size_t N>
4823 const Mask128<T, N> mask) {
4826 return U64FromInt(_mm_movemask_epi8(sign_bits));
4829 template <typename T, size_t N>
4831 const Mask128<T, N> mask) {
4833 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
4834 return U64FromInt(_mm_movemask_epi8(sign_bits));
4837 template <typename T, size_t N>
4839 const Mask128<T, N> mask) {
4841 const Simd<float, N> df;
4843 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
4846 template <typename T, size_t N>
4848 const Mask128<T, N> mask) {
4850 const Simd<double, N> df;
4852 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
4856 template <typename T, size_t N>
4857 constexpr uint64_t OnlyActive(uint64_t mask_bits) {
4858 return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
4861 template <typename T, size_t N>
4869 template <typename T, size_t N>
4871 const Mask128<T, N> mask, uint8_t* bits) {
4872 constexpr size_t kNumBytes = (N + 7) / 8;
4874 CopyBytes<kNumBytes>(&mask_bits, bits);
4880 template <typename T, size_t N>
4886 template <typename T, size_t N>
4887 HWY_API bool AllTrue(const Simd<T, N> , const Mask128<T, N> mask) {
4888 constexpr uint64_t kAllBits =
4889     detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
4893 template <typename T, size_t N>
4898 template <typename T, size_t N>
4900 const Mask128<T, N> mask) {
4909 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4910 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
4912 const Rebind<uint8_t, decltype(d)> d8;
4913 const Simd<uint16_t, N> du;
4923 alignas(16) constexpr uint8_t table[2048] = {
4924 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
4925 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
4926 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4927 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
4928 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
4929 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
4930 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
4931 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
4932 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
4933 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
4934 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
4935 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
4936 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
4937 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
4938 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
4939 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
4940 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
4941 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
4942 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
4943 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
4944 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
4945 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
4946 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
4947 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
4948 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
4949 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
4950 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
4951 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
4952 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
4953 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
4954 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
4955 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
4956 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
4957 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
4958 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
4959 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
4960 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
4961 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
4962 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
4963 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
4964 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
4965 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
4966 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
4967 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
4968 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
4969 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
4970 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
4971 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
4972 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
4973 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
4974 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
4975 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
4976 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
4977 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
4978 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
4979 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
4980 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
4981 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
4982 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
4983 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
4984 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
4985 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
4986 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
4987 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
4988 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
4989 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
4990 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
4991 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
4992 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
4993 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
4994 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
4995 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
4996 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
4997 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
4998 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
4999 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
5000 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
5001 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
5002 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
5003 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
5004 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
5005 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
5006 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
5007 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
5008 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
5009 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
5010 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
5011 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
5012 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
5013 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
5014 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
5015 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
5016 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
5017 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
5018 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
5019 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
5020 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
5021 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
5022 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
5023 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
5024 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
5025 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
5026 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
5027 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
5028 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
5029 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
5030 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
5031 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
5032 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
5033 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
5034 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
5035 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
5036 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5037 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5039 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
5040 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5044 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5045 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5049 alignas(16) constexpr uint8_t packed_array[256] = {
5050 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
5051 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
5052 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
5053 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3,
5054 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
5055 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
5056 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
5057 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
5058 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
5059 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5060 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5061 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3,
5062 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5063 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5064 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5065 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5068 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5071 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5072 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5076 alignas(16) constexpr uint8_t packed_array[64] = {
5077 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5078 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5079 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5080 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5083 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5088 template <typename T, size_t N>
5096 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5100 template <typename T, size_t N>
5106 uint64_t mask_bits = 0;
5107 constexpr size_t kNumBytes = (N + 7) / 8;
5108 CopyBytes<kNumBytes>(bits, &mask_bits);
5110 mask_bits &= (1ull << N) - 1;
5113 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5119 template <typename T, size_t N>
5128 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5130 StoreU(compressed, d, unaligned);
5134 template <typename T, size_t N>
5140 uint64_t mask_bits = 0;
5141 constexpr size_t kNumBytes = (N + 7) / 8;
5142 CopyBytes<kNumBytes>(bits, &mask_bits);
5144 mask_bits &= (1ull << N) - 1;
5148 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5150 StoreU(compressed, d, unaligned);
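// [Editor's note] Usage sketch (editor addition): the table-driven fallback
// above also powers CompressBitsStore, which takes a packed bit array
// instead of a Mask128:
//   const Full128<uint16_t> d;
//   uint16_t out[8];
//   const uint8_t bits[1] = {0x0F};  // keep the low four lanes
//   const size_t n = CompressBitsStore(Iota(d, 0), bits, d, out);  // n == 4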
5161 const Vec128<uint8_t> v1,
5162 const Vec128<uint8_t> v2, Full128<uint8_t> d,
5164 const auto k5 = Set(d, 5);
5165 const auto k6 = Set(d, 6);
5169 alignas(16) static constexpr uint8_t tbl_r0[16] = {
5170     0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
5171     3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5172 alignas(16) static constexpr uint8_t tbl_g0[16] = {
5173     0x80, 0, 0x80, 0x80, 1, 0x80,
5174     0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5175 const auto shuf_r0 = Load(d, tbl_r0);
5176 const auto shuf_g0 = Load(d, tbl_g0);
5177 const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
5181 const auto int0 = r0 | g0 | b0;
5182 StoreU(int0, d, unaligned + 0 * 16);
5185 const auto shuf_r1 = shuf_b0 + k6;
5186 const auto shuf_g1 = shuf_r0 + k5;
5187 const auto shuf_b1 = shuf_g0 + k5;
5191 const auto int1 = r1 | g1 | b1;
5192 StoreU(int1, d, unaligned + 1 * 16);
5195 const auto shuf_r2 = shuf_b1 + k6;
5196 const auto shuf_g2 = shuf_r1 + k5;
5197 const auto shuf_b2 = shuf_g1 + k5;
5201 const auto int2 = r2 | g2 | b2;
5202 StoreU(int2, d, unaligned + 2 * 16);
5207 const Vec128<uint8_t, 8> v1,
5208 const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
5211 const Full128<uint8_t> d_full;
5212 const auto k5 = Set(d_full, 5);
5213 const auto k6 = Set(d_full, 6);
5215 const Vec128<uint8_t> full_a{v0.raw};
5216 const Vec128<uint8_t> full_b{v1.raw};
5217 const Vec128<uint8_t> full_c{v2.raw};
5221 alignas(16) static constexpr uint8_t tbl_r0[16] = {
5222     0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
5223     3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5224 alignas(16) static constexpr uint8_t tbl_g0[16] = {
5225     0x80, 0, 0x80, 0x80, 1, 0x80,
5226     0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5227 const auto shuf_r0 = Load(d_full, tbl_r0);
5228 const auto shuf_g0 = Load(d_full, tbl_g0);
5229 const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
5233 const auto int0 = r0 | g0 | b0;
5234 StoreU(int0, d_full, unaligned + 0 * 16);
5237 const auto shuf_r1 = shuf_b0 + k6;
5238 const auto shuf_g1 = shuf_r0 + k5;
5239 const auto shuf_b1 = shuf_g0 + k5;
5243 const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
5244 StoreU(int1, d, unaligned + 1 * 16);
5248 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5250 const Vec128<uint8_t, N> v1,
5251 const Vec128<uint8_t, N> v2,
5255 const Full128<uint8_t> d_full;
5257 const Vec128<uint8_t> full_a{v0.raw};
5258 const Vec128<uint8_t> full_b{v1.raw};
5259 const Vec128<uint8_t> full_c{v2.raw};
5263 alignas(16) static constexpr uint8_t tbl_r0[16] = {
5264     0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,
5265     0x80, 0x80, 0x80, 0x80};
5266 const auto shuf_r0 = Load(d_full, tbl_r0);
5267 const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
5268 const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
5272 const auto int0 = r0 | g0 | b0;
5273 alignas(16) uint8_t buf[16];
5274 StoreU(int0, d_full, buf);
5275 CopyBytes<N * 3>(buf, unaligned);
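// [Editor's note] Sketch (not in the original source): StoreInterleaved3
// interleaves three planar vectors into byte triplets, e.g. RGB pixels:
//   const Full128<uint8_t> d;
//   uint8_t rgb[16 * 3];
//   StoreInterleaved3(Set(d, 'r'), Set(d, 'g'), Set(d, 'b'), d, rgb);
//   // rgb = r,g,b, r,g,b, ... for 16 pixels (48 bytes).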
5282 const Vec128<uint8_t> v1,
5283 const Vec128<uint8_t> v2,
5284 const Vec128<uint8_t> v3, Full128<uint8_t> d8,
5289 const auto ba0 = ZipLower(d16, v0, v1);
5290 const auto dc0 = ZipLower(d16, v2, v3);
5291 const auto ba8 = ZipUpper(d16, v0, v1);
5292 const auto dc8 = ZipUpper(d16, v2, v3);
5293 const auto dcba_0 = ZipLower(d32, ba0, dc0);
5294 const auto dcba_4 = ZipUpper(d32, ba0, dc0);
5295 const auto dcba_8 = ZipLower(d32, ba8, dc8);
5296 const auto dcba_C = ZipUpper(d32, ba8, dc8);
5305 const Vec128<uint8_t, 8> in1,
5306 const Vec128<uint8_t, 8> in2,
5307 const Vec128<uint8_t, 8> in3,
5311 const Full128<uint8_t> d_full8;
5314 const Vec128<uint8_t> v0{in0.raw};
5315 const Vec128<uint8_t> v1{in1.raw};
5316 const Vec128<uint8_t> v2{in2.raw};
5317 const Vec128<uint8_t> v3{in3.raw};
5319 const auto ba0 = ZipLower(d16, v0, v1);
5320 const auto dc0 = ZipLower(d16, v2, v3);
5321 const auto dcba_0 = ZipLower(d32, ba0, dc0);
5322 const auto dcba_4 = ZipUpper(d32, ba0, dc0);
5323 StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
5324 StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
5328 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5330 const Vec128<uint8_t, N> in1,
5331 const Vec128<uint8_t, N> in2,
5332 const Vec128<uint8_t, N> in3,
5336 const Full128<uint8_t> d_full8;
5339 const Vec128<uint8_t> v0{in0.raw};
5340 const Vec128<uint8_t> v1{in1.raw};
5341 const Vec128<uint8_t> v2{in2.raw};
5342 const Vec128<uint8_t> v3{in3.raw};
5344 const auto ba0 = ZipLower(d16, v0, v1);
5345 const auto dc0 = ZipLower(d16, v2, v3);
5346 const auto dcba_0 = ZipLower(d32, ba0, dc0);
5347 alignas(16) uint8_t buf[16];
5349 CopyBytes<4 * N>(buf, unaligned);
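// [Editor's note] Usage sketch (editor addition): StoreInterleaved4 writes
// four-byte quads (e.g. RGBA) by zipping pairs of vectors twice, as above:
//   const Full128<uint8_t> d;
//   uint8_t rgba[16 * 4];
//   StoreInterleaved4(Set(d, 'r'), Set(d, 'g'), Set(d, 'b'), Set(d, 'a'), d,
//                     rgba);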
5357 template <typename T>
5359 const Vec128<T, 1> v) {
5362 template <typename T>
5364 const Vec128<T, 1> v) {
5367 template <typename T>
5369 const Vec128<T, 1> v) {
5376 template <typename T>
5378 const Vec128<T, 2> v10) {
5381 template <typename T>
5383 const Vec128<T, 2> v10) {
5386 template <typename T>
5388 const Vec128<T, 2> v10) {
5393 template <typename T>
5395 const Vec128<T> v3210) {
5397 const Vec128<T> v31_20_31_20 = v3210 + v1032;
5398 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5399 return v20_31_20_31 + v31_20_31_20;
5401 template <typename T>
5403 const Vec128<T> v3210) {
5405 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
5406 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5407 return Min(v20_31_20_31, v31_20_31_20);
5409 template <typename T>
5411 const Vec128<T> v3210) {
5413 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
5414 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5415 return Max(v20_31_20_31, v31_20_31_20);
5421 template <typename T>
5423 const Vec128<T> v10) {
5427 template <typename T>
5429 const Vec128<T> v10) {
5431 return Min(v10, v01);
5433 template <typename T>
5435 const Vec128<T> v10) {
5437 return Max(v10, v01);
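// [Editor's note] Sketch (not in the original source): the shuffle/accumulate
// reductions above broadcast the result to every lane; extract it with
// GetLane:
//   const Full128<float> d;
//   const auto v = Iota(d, 1.0f);                 // {1, 2, 3, 4}
//   const float sum = GetLane(SumOfLanes(d, v));  // 10
//   const float mn = GetLane(MinOfLanes(d, v));   // 1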
5443 template <typename T, size_t N>
5447 template <typename T, size_t N>
5451 template <typename T, size_t N>
5458 template <typename T, size_t N>
5463 template <typename T, size_t N>
5465 return AllTrue(Simd<T, N>(), mask);
5468 template <typename T, size_t N>
5470 return AllFalse(Simd<T, N>(), mask);
5473 template <typename T, size_t N>
5478 template <typename T, size_t N>
5482 template <typename T, size_t N>
5486 template <typename T, size_t N>
5491 template <typename T, size_t N>
5496 template <int kBytes, typename T, size_t N>
5498 return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
5501 template <int kLanes, typename T, size_t N>
5503 return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
5506 template <size_t kBytes, typename T, size_t N>
5508 return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
5511 template <typename T, size_t N>
5516 template <typename T, size_t N, class D = Simd<T, N>>
5517 HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
5521 template <typename T, size_t N2>
5522 HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
5523 return Combine(Simd<T, N2 * 2>(), hi2, lo2);
5526 template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
5531 template <typename T, size_t N>
5536 template <typename T, size_t N>
5541 template <typename T, size_t N>
5543 const Vec128<T, N> lo) {
5547 template <typename T, size_t N>
5584 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
5588 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
5592 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
5597 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
5601 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
5606 HWY_API auto Le(V a, V b) -> decltype(a == b) {