#if HWY_TARGET == HWY_SSSE3
#include <sanitizer/msan_interface.h>
#if HWY_TARGET <= HWY_AVX2
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N = 16 / sizeof(T)>
    return *this = (*this * other);
    return *this = (*this / other);
    return *this = (*this + other);
    return *this = (*this - other);
    return *this = (*this & other);
    return *this = (*this | other);
    return *this = (*this ^ other);
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;
template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;
#if HWY_TARGET <= HWY_AVX3
template <size_t size>
template <typename T, size_t N = 16 / sizeof(T)>
template <typename T, size_t N = 16 / sizeof(T)>
#if HWY_TARGET <= HWY_AVX2
  template <typename T, size_t N>
#if HWY_TARGET <= HWY_AVX2
  template <typename T>
#if HWY_TARGET <= HWY_AVX3
  template <typename T>
template <typename T, size_t N>
struct BitCastFromInteger128 {
template <>
struct BitCastFromInteger128<float> {
template <typename T, size_t N>
template <typename T, size_t N, typename FromT>
    Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
template <typename T, size_t N, HWY_IF_LE128(T, N)>
  return Vec128<T, N>{_mm_setzero_si128()};
template <size_t N, HWY_IF_LE128(float, N)>
  return Vec128<float, N>{_mm_setzero_ps()};
template <size_t N, HWY_IF_LE128(double, N)>
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
  return Vec128<int64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{_mm_set1_ps(t)};
template <size_t N, HWY_IF_LE128(double, N)>
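// Usage sketch (illustration only; assumes the usual HWY_NAMESPACE setup):
//   const Simd<int32_t, 4, 0> d;
//   const auto v = Set(d, -1);  // all four int32 lanes = -1
//   const auto z = Zero(d);     // all four lanes = 0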
template <typename T, size_t N, HWY_IF_LE128(T, N)>
  return Vec128<T, N>{_mm_undefined_si128()};
template <size_t N, HWY_IF_LE128(float, N)>
template <size_t N, HWY_IF_LE128(double, N)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
  return _mm_cvtss_f32(v.raw);
  alignas(16) uint64_t lanes[2];
  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
  alignas(16) int64_t lanes[2];
  return _mm_cvtsi128_si64(v.raw);
  return _mm_cvtsd_f64(v.raw);
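// Usage sketch (illustration only): GetLane returns lane 0 as a scalar.
//   const Simd<float, 4, 0> d;
//   const float x = GetLane(Set(d, 2.5f));  // 2.5f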
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
template <typename T, size_t N>
  using VU = VFromD<decltype(du)>;
#if HWY_TARGET <= HWY_AVX3
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
  return Or(o1, Or(o2, o3));
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
  return Or(o, And(a1, a2));
template <typename T, size_t N>
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
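// Reading the ternary-logic immediates (added note; the usual convention, not
// stated in the original header): with truth-table constants A=0xF0, B=0xCC,
// C=0xAA, the immediate encodes the desired Boolean function of (A, B, C).
// Thus 0x55 = ~C (passing v three times yields Not), 0xFE = A | B | C (Or3),
// and 0xF8 = A | (B & C) (OrAnd).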
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
#if HWY_TARGET == HWY_AVX3_DL
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_FLOAT(T)>
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
template <typename T, size_t N>
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
#if HWY_TARGET <= HWY_AVX3
  const __m128i out = _mm_ternarylogic_epi32(
template <typename T, size_t N>
                                    const Vec128<T, N> sign) {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
    HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
template <typename T, size_t N>
  return Mask128<T, N>{v.raw};
template <typename T, size_t N>
  return Vec128<T, N>{v.raw};
template <typename T, size_t N>
                                 const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
#if HWY_TARGET == HWY_SSSE3
template <typename T, size_t N>
template <typename T, size_t N>
  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
                                  const Vec128<float, N> yes,
                                  const Vec128<float, N> no) {
  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
                                   const Vec128<double, N> yes,
                                   const Vec128<double, N> no) {
  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
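// Usage sketch (illustration only): per-lane select via a comparison mask.
//   const Simd<float, 4, 0> d;
//   const auto m = v > Zero(d);                // lanes where v > 0
//   const auto r = IfThenElse(m, v, Zero(d));  // v if positive, else 0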
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
template <int kBits, size_t N>
  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
template <int kBits, size_t N>
  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
template <int kBits, size_t N>
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
template <int kBits, size_t N>
  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
template <int kBits, size_t N>
  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
template <int kBits, size_t N>
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
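// Why (shifted ^ shifted_sign) - shifted_sign sign-extends (added note): x86
// lacks an 8-bit arithmetic shift, so bytes are shifted logically and the
// shifted-down sign bit s = 0x80 >> kBits is then re-extended. Scalar sketch:
//   uint8_t x = u8 >> kBits;            // logical shift
//   uint8_t s = 0x80 >> kBits;          // new position of the old sign bit
//   int8_t  r = (int8_t)((x ^ s) - s);  // equals (int8_t)u8 >> kBits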
template <typename T, size_t N, typename TI, size_t NI>
                                          const Vec128<TI, NI> from) {
  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
template <class V, class VI>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
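// Note on the shuffle immediates (added illustration): _mm_shuffle_epi32 reads
// 2-bit source indices from the control byte, LSB first. 0x4E =
// _MM_SHUFFLE(1, 0, 3, 2) swaps the 64-bit halves, 0x39 =
// _MM_SHUFFLE(0, 3, 2, 1) rotates lanes right by one, 0x93 rotates left by
// one, and 0x1B = _MM_SHUFFLE(0, 1, 2, 3) reverses all four lanes.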
#if HWY_TARGET <= HWY_AVX3
template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
                                      Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
                                       Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
                                        Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
                                        Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
                                        Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
template <typename T, size_t N>
                                const Mask128<T, N> v) {
template <typename TFrom, typename TTo, size_t N>
                                   Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  const Simd<TFrom, N, 0> d;
template <typename T, size_t N>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
                                        const Vec128<uint64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;
  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
                                       const Vec128<int64_t, N> b) {
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
                                       Vec128<uint8_t, N> b) {
                                        Vec128<uint16_t, N> b) {
                                        Vec128<uint32_t, N> b) {
                                        Vec128<uint64_t, N> b) {
                                      Vec128<int8_t, N> b) {
                                       Vec128<int16_t, N> b) {
                                       Vec128<int32_t, N> b) {
                                       Vec128<int64_t, N> b) {
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
  const DFromV<decltype(a)> du;
  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
                                      const Vec128<int64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<int64_t, N, 0> d;
  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};
                                      const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
                                       const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
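// Why flipping the MSB yields unsigned compares (added note): pre-AVX-512 SSE
// has only signed integer compares, and x ^ msb maps [0, 2^n) monotonically
// onto [-2^(n-1), 2^(n-1)), so the signed compare then gives the unsigned
// ordering. Scalar sketch:
//   uint8_t a, b;
//   bool gt = (int8_t)(a ^ 0x80) > (int8_t)(b ^ 0x80);  // same as a > b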
template <typename T, size_t N>
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
#if HWY_TARGET <= HWY_AVX3
  const uint64_t all = (1ull << N) - 1;
  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
#if defined(__clang_analyzer__) || \
    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_SAFE_PARTIAL_LOAD_STORE 1
#else
#define HWY_SAFE_PARTIAL_LOAD_STORE 0
template <typename T>
  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
template <typename T>
  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
  return Vec128<float>{_mm_loadu_ps(p)};
template <typename T>
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128i v = _mm_setzero_si128();
  CopyBytes<8>(p, &v);
  return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<8>(p, &v);
  const __m128 hi = _mm_setzero_ps();
  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128d v = _mm_setzero_pd();
  CopyBytes<8>(p, &v);
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<4>(p, &v);
template <typename T, size_t N, HWY_IF_LE32(T, N)>
  constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<kSize>(p, &v);
  return Vec128<T, N>{v};
  CopyBytes<kSize>(p, &bits);
  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
template <typename T, size_t N, HWY_IF_LE64(T, N)>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  return Load(d, lanes);
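// Usage sketch (illustration only):
//   const Simd<int32_t, 4, 0> d;
//   const auto v = Iota(d, 10);  // lanes = {10, 11, 12, 13}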
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
#elif HWY_TARGET == HWY_AVX2
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  auto p_p = reinterpret_cast<const int*>(p);
  return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  auto p_p = reinterpret_cast<const long long*>(p);
  return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
  const Vec128<int32_t, N> mi =
  return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
  const Vec128<int64_t, N> mi =
  return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, Load(d, p));
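// Usage sketch (illustration only): loading a loop remainder with FirstN.
//   const Simd<float, 4, 0> d;
//   const auto m = FirstN(d, count - i);       // < 4 lanes on last iteration
//   const auto v = MaskedLoad(m, d, ptr + i);  // masked-off lanes are zero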
template <typename T, size_t N>
template <typename T>
  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
  _mm_store_ps(aligned, v.raw);
  _mm_store_pd(aligned, v.raw);
template <typename T>
  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
  _mm_storeu_ps(p, v.raw);
  _mm_storeu_pd(p, v.raw);
template <typename T>
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<8>(&v, p);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<8>(&v, p);
  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<8>(&v, p);
  _mm_storel_pd(p, v.raw);
template <typename T, size_t N, HWY_IF_LE32(T, N)>
  CopyBytes<sizeof(T) * N>(&v, p);
#if HWY_SAFE_PARTIAL_LOAD_STORE
  CopyBytes<4>(&v, p);
  _mm_store_ss(p, v.raw);
template <typename T, size_t N, HWY_IF_LE64(T, N)>
template <typename T, size_t N>
  using TI = TFromD<decltype(di)>;
  alignas(16) TI buf[N];
  alignas(16) TI mask[N];
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(buf + i, p + i);
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  _mm_mask_storeu_epi8(p, m.raw, v.raw);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  _mm_mask_storeu_epi16(p, m.raw, v.raw);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  auto pi = reinterpret_cast<int*>(p);
  _mm_mask_storeu_epi32(pi, m.raw, v.raw);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  auto pi = reinterpret_cast<long long*>(p);
  _mm_mask_storeu_epi64(pi, m.raw, v.raw);
  _mm_mask_storeu_ps(p, m.raw, v.raw);
  _mm_mask_storeu_pd(p, m.raw, v.raw);
#elif HWY_TARGET == HWY_AVX2
template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
                          T* HWY_RESTRICT p) {
  detail::ScalarMaskedStore(v, m, d, p);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  const Full128<T> df;
  const Mask128<T> mf{m.raw};
  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
  auto pi = reinterpret_cast<int*>(p);
  _mm_maskstore_epi32(pi, m.raw, v.raw);
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  const Full128<T> df;
  const Mask128<T> mf{m.raw};
  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
  auto pi = reinterpret_cast<long long*>(p);
  _mm_maskstore_epi64(pi, m.raw, v.raw);
  const Full128<T> df;
  const Mask128<T> mf{m.raw};
  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
  const Vec128<MakeSigned<T>, N> mi =
  _mm_maskstore_ps(p, mi.raw, v.raw);
  const Full128<T> df;
  const Mask128<T> mf{m.raw};
  m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
  const Vec128<MakeSigned<T>, N> mi =
  _mm_maskstore_pd(p, mi.raw, v.raw);
template <typename T, size_t N>
                                       const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
                                        const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
                                        const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
                                        const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
                                      const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
                                       const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
                                       const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
                                       const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
                                     const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
                                       const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
                                        Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
                                        const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
                                        const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
                                      const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
                                       const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
                                       const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
                                       const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
                                     const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
                                          const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
                                           const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
                                         const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
                                          const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
                                          const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
                                           const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
                                         const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
                                          const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
                                            const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
                                             const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
                                          const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
                                         const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
                                     const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
                                    const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
                                                const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
#if HWY_TARGET == HWY_SSSE3
template <size_t N, HWY_IF_LE64(int32_t, N)>
  return Set(Simd<int64_t, (N + 1) / 2, 0>(),
                                     const Vec128<int32_t> b) {
  alignas(16) int32_t a_lanes[4];
  alignas(16) int32_t b_lanes[4];
  const Full128<int32_t> di32;
  Store(a, di32, a_lanes);
  Store(b, di32, b_lanes);
  alignas(16) int64_t mul[2];
  mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
  mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
  return Load(Full128<int64_t>(), mul);
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
                                        const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
  const auto mullo_x2x0 = MulEven(a, b);
  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
  const auto mullo_x3x1 =
      MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
  const __m128i mul_20 =
      _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128i mul_31 =
      _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
                                       const Vec128<int32_t, N> b) {
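// MulEven semantics, scalar sketch (illustration only): for 32-bit lanes
// {a0, a1, a2, a3} and {b0, b1, b2, b3}, the result holds the two full-width
// products of the even lanes: {(uint64_t)a0 * b0, (uint64_t)a2 * b2}.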
template <int kBits, size_t N>
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
  if (kBits == 0) return v;
template <int kBits, size_t N>
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
  if (kBits == 0) return v;
  return ShiftRight<15>(v);
  return ShiftRight<31>(v);
#if HWY_TARGET <= HWY_AVX3
#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
  const auto sign = ShiftRight<31>(BitCast(d32, v));
      _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
template <int kBits, size_t N>
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
  return right | sign;
template <typename T, size_t N, HWY_IF_FLOAT(T)>
#if HWY_TARGET == HWY_SSSE3
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");
  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  const Vec128<T, N> shifted{
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
  const Vec128<uint8_t, N> shifted{
  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
  return right | sign;
  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
                                     const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
                                        const Vec128<float, N> b) {
                                    const Vec128<float, N> x,
                                    const Vec128<float, N> add) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x + add;
  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x + add;
                                       const Vec128<float, N> x,
                                       const Vec128<float, N> add) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return add - mul * x;
  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return add - mul * x;
                                    const Vec128<float, N> x,
                                    const Vec128<float, N> sub) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x - sub;
  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return mul * x - sub;
                                       const Vec128<float, N> x,
                                       const Vec128<float, N> sub) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return Neg(mul) * x - sub;
  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  return Neg(mul) * x - sub;
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
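// Callers can refine the low-precision rcp/rsqrt estimates with one
// Newton-Raphson step; a sketch under that assumption (not part of this
// header):
//   const auto r = ApproximateReciprocal(x);
//   const auto refined = r * NegMulAdd(r, x, Set(d, 2.0f));  // r * (2 - r*x)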
template <typename T, size_t N>
  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
                                const Vec128<uint16_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
                                const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
                                const Vec128<uint64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
                              const Vec128<int8_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
                               const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
                               const Vec128<int32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
                               const Vec128<int64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
template <typename T, size_t N>
  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
                                const Vec128<uint16_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
                                const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
                                const Vec128<uint64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
                              const Vec128<int8_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
                               const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
                               const Vec128<int32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
                               const Vec128<int64_t, N> b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
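// Usage sketch (illustration only): clamping composes Max and Min.
//   const auto clamped = Min(Max(v, Set(d, lo)), Set(d, hi));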
template <typename T, size_t N>
  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
  _mm_stream_ps(aligned, v.raw);
  _mm_stream_pd(aligned, v.raw);
static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N>
  _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
template <typename T, size_t N>
  _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
template <typename T, size_t N>
  _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
template <typename T, size_t N>
  _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
template <typename T, size_t N, typename Offset>
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, size_t N, typename Index>
                           const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
  _mm_i32scatter_ps(base, index.raw, v.raw, 4);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
  _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
  _mm_i64scatter_pd(base, index.raw, v.raw, 8);
  const __mmask8 mask = (1u << N) - 1;
  _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
                          const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  alignas(16) T lanes[N];
  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
                         const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  alignas(16) T lanes[N];
  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
template <typename T, size_t N, typename Offset>
                                 const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  return Load(d, lanes);
template <typename T, size_t N, typename Index>
                                const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  return Load(d, lanes);
template <typename T, size_t N>
                                    const Vec128<int32_t, N> offset) {
  return Vec128<T, N>{_mm_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
template <typename T, size_t N>
                                   const Vec128<int32_t, N> index) {
  return Vec128<T, N>{_mm_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), index.raw, 4)};
template <typename T, size_t N>
                                    const Vec128<int64_t, N> offset) {
  return Vec128<T, N>{_mm_i64gather_epi64(
      reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
template <typename T, size_t N>
                                   const Vec128<int64_t, N> index) {
  return Vec128<T, N>{_mm_i64gather_epi64(
      reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
template <typename T, size_t N, typename Offset>
                                 const Vec128<Offset, N> offset) {
template <typename T, size_t N, typename Index>
                                const Vec128<Index, N> index) {
                                          const Vec128<int32_t, N> offset) {
  return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
                                         const Vec128<int32_t, N> index) {
  return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
                                           const Vec128<int64_t, N> offset) {
  return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
                                          const Vec128<int64_t, N> index) {
  return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
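// Usage sketch (illustration only): GatherIndex loads base[index[i]] per lane.
//   const Simd<float, 4, 0> d;
//   const Rebind<int32_t, decltype(d)> di;
//   const auto v = GatherIndex(d, base, Iota(di, 0));  // {base[0..3]}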
template <typename T, size_t N>
  return Vec128<T, N / 2>{v.raw};
template <typename T, size_t N>
template <int kBytes, typename T, size_t N>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
template <int kBytes, typename T, size_t N>
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
template <int kLanes, typename T, size_t N>
template <int kLanes, typename T, size_t N>
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
template <int kBytes, typename T, size_t N>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (N != 16 / sizeof(T)) {
    const Vec128<T> vfull{v.raw};
  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
template <int kLanes, typename T, size_t N>
template <typename T>
  return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
  return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
template <typename T, size_t N, HWY_IF_LE64(T, N)>
  return Vec128<T, (N + 1) / 2>{upper.raw};
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  const int pair = _mm_extract_epi16(v.raw, kLane / 2);
  constexpr int kShift = kLane & 1 ? 8 : 0;
  return static_cast<T>((pair >> kShift) & 0xFF);
  return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  static_assert(kLane < N, "Lane index out of bounds");
  return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF);
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) T lanes[4];
  return lanes[kLane];
  return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
  alignas(16) T lanes[2];
  return lanes[kLane];
  return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
template <size_t kLane, size_t N>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) float lanes[4];
  return lanes[kLane];
  const int bits = _mm_extract_ps(v.raw, kLane);
  CopyBytes<4>(&bits, &ret);
template <size_t kLane>
  static_assert(kLane == 0, "Lane index out of bounds");
template <size_t kLane>
  static_assert(kLane < 2, "Lane index out of bounds");
template <typename T>
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
  alignas(16) T lanes[2];
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
        return detail::ExtractLane<2>(v);
        return detail::ExtractLane<3>(v);
  alignas(16) T lanes[4];
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
        return detail::ExtractLane<2>(v);
        return detail::ExtractLane<3>(v);
        return detail::ExtractLane<4>(v);
        return detail::ExtractLane<5>(v);
        return detail::ExtractLane<6>(v);
        return detail::ExtractLane<7>(v);
  alignas(16) T lanes[8];
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::ExtractLane<0>(v);
        return detail::ExtractLane<1>(v);
        return detail::ExtractLane<2>(v);
        return detail::ExtractLane<3>(v);
        return detail::ExtractLane<4>(v);
        return detail::ExtractLane<5>(v);
        return detail::ExtractLane<6>(v);
        return detail::ExtractLane<7>(v);
        return detail::ExtractLane<8>(v);
        return detail::ExtractLane<9>(v);
        return detail::ExtractLane<10>(v);
        return detail::ExtractLane<11>(v);
        return detail::ExtractLane<12>(v);
        return detail::ExtractLane<13>(v);
        return detail::ExtractLane<14>(v);
        return detail::ExtractLane<15>(v);
  alignas(16) T lanes[16];
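// Usage sketch (illustration only): i may be a runtime value; for constants,
// GCC/Clang fold the __builtin_constant_p dispatch above to a single extract.
//   const Simd<int32_t, 4, 0> d;
//   const int32_t third = ExtractLane(Iota(d, 0), 2);  // 2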
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) T lanes[16];
  return Load(d, lanes);
  return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)};
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) T lanes[4];
  return Load(d, lanes);
  CopyBytes<sizeof(T)>(&t, &ti);
  return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
  alignas(16) T lanes[2];
  return Load(d, lanes);
  CopyBytes<sizeof(T)>(&t, &ti);
  return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
template <size_t kLane, size_t N>
  static_assert(kLane < N, "Lane index out of bounds");
#if HWY_TARGET == HWY_SSSE3
  alignas(16) float lanes[4];
  return Load(d, lanes);
  return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
template <size_t kLane>
  static_assert(kLane == 0, "Lane index out of bounds");
template <size_t kLane>
  static_assert(kLane < 2, "Lane index out of bounds");
template <typename T>
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
  alignas(16) T lanes[2];
  return Load(d, lanes);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
        return detail::InsertLane<2>(v, t);
        return detail::InsertLane<3>(v, t);
  alignas(16) T lanes[4];
  return Load(d, lanes);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
        return detail::InsertLane<2>(v, t);
        return detail::InsertLane<3>(v, t);
        return detail::InsertLane<4>(v, t);
        return detail::InsertLane<5>(v, t);
        return detail::InsertLane<6>(v, t);
        return detail::InsertLane<7>(v, t);
  alignas(16) T lanes[8];
  return Load(d, lanes);
template <typename T>
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
  if (__builtin_constant_p(i)) {
        return detail::InsertLane<0>(v, t);
        return detail::InsertLane<1>(v, t);
        return detail::InsertLane<2>(v, t);
        return detail::InsertLane<3>(v, t);
        return detail::InsertLane<4>(v, t);
        return detail::InsertLane<5>(v, t);
        return detail::InsertLane<6>(v, t);
        return detail::InsertLane<7>(v, t);
        return detail::InsertLane<8>(v, t);
        return detail::InsertLane<9>(v, t);
        return detail::InsertLane<10>(v, t);
        return detail::InsertLane<11>(v, t);
        return detail::InsertLane<12>(v, t);
        return detail::InsertLane<13>(v, t);
        return detail::InsertLane<14>(v, t);
        return detail::InsertLane<15>(v, t);
  alignas(16) T lanes[16];
  return Load(d, lanes);
template <int kBytes, typename T, class V = Vec128<T>>
  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
          class V = Vec128<T, N>>
  constexpr size_t kSize = N * sizeof(T);
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Full128<uint8_t> d_full8;
  using V8 = VFromD<decltype(d_full8)>;
  const V8 hi8{BitCast(d8, hi).raw};
  return V{BitCast(Full128<T>(), r).raw};
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <int kLane, size_t N>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
template <typename T, size_t N = 16 / sizeof(T)>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Rebind<TI, decltype(d)> di;
#if HWY_TARGET <= HWY_AVX2
  using V8 = VFromD<decltype(d8)>;
  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
                                                    0, 1, 2, 3, 0, 1, 2, 3};
  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Rebind<TI, decltype(d)> di;
  return Indices128<T, N>{vec.raw};
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
  const Rebind<TI, decltype(d)> di;
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX2
template <size_t N, HWY_IF_GE64(float, N)>
#if HWY_TARGET <= HWY_AVX2
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX2
#if HWY_TARGET <= HWY_AVX2
template <typename T>
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  if (N == 1) return v;
  alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
  const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
  return BitCast(d, Vec128<int16_t, N>{
                        _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4247template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4253template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4258template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4265template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4270  return BitCast(d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
4271      BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
4274#if HWY_TARGET <= HWY_AVX3
4275  alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
4276  const Vec128<int16_t, N> idx = Load(di, kReverse4);
4277  return BitCast(d, Vec128<int16_t, N>{
4278      _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4286template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4291template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4298template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4300#if HWY_TARGET <= HWY_AVX3
4302  alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
4303                                                 15, 14, 13, 12, 11, 10, 9, 8};
4304  const Vec128<int16_t, N> idx = Load(di, kReverse8);
4305  return BitCast(d, Vec128<int16_t, N>{
4306      _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4313template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4324template <size_t N, HWY_IF_LE128(uint8_t, N)>
4329template <size_t N, HWY_IF_LE128(uint16_t, N)>
4334template <size_t N, HWY_IF_LE128(uint32_t, N)>
4339template <size_t N, HWY_IF_LE128(uint64_t, N)>
4345template <size_t N, HWY_IF_LE128(int8_t, N)>
4350template <size_t N, HWY_IF_LE128(int16_t, N)>
4355template <size_t N, HWY_IF_LE128(int32_t, N)>
4360template <size_t N, HWY_IF_LE128(int64_t, N)>
4366template <size_t N, HWY_IF_LE128(float, N)>
4367HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4368                                         const Vec128<float, N> b) {
4369  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
4370}
4371template <size_t N, HWY_IF_LE128(double, N)>
4422HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
4423                                      const Vec128<float> b) {
4424  return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
4425}
4434template <typename T, class V = Vec128<T>>
4440template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
4442  const Half<decltype(d)> d2;
4450template <class V, class DW = RepartitionToWide<DFromV<V>>>
4454template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4459template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4469template <typename T, size_t N, HWY_IF_LE128(T, N)>
4470HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
4471                             Vec128<T, N / 2> lo_half) {
4472  const Half<decltype(d)> d2;
4476  const VU lo{BitCast(du2, lo_half).raw};
4477  const VU hi{BitCast(du2, hi_half).raw};
4483template <typename T, HWY_IF_NOT_FLOAT(T)>
4488template <typename T, HWY_IF_FLOAT(T)>
4494template <typename T, size_t N, HWY_IF_LE64(T, N)>
4502template <typename T>
4509template <typename T>
4516template <typename T>
4518                                 const Vec128<T> lo) {
4519  return CombineShiftRightBytes<8>(d, hi, lo);
4523template <typename T>
4526#if HWY_TARGET == HWY_SSSE3
4529      _MM_SHUFFLE2(1, 0))});
4538#if HWY_TARGET == HWY_SSSE3
4550#if HWY_TARGET == HWY_SSSE3
4560template <typename T, size_t N, HWY_IF_LE64(T, N)>
4563  const Half<decltype(d)> d2;
4567template <typename T, size_t N, HWY_IF_LE64(T, N)>
4570  const Half<decltype(d)> d2;
4574template <typename T, size_t N, HWY_IF_LE64(T, N)>
4576                                          const Vec128<T, N> lo) {
4577  const Half<decltype(d)> d2;
4581template <typename T, size_t N, HWY_IF_LE64(T, N)>
4584  const Half<decltype(d)> d2;
4591template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4595  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
4596  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
4597  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4601template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4605  alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
4613template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4617  alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
4625template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4629  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
4630  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
4631  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4635template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4639  alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
4647template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4651      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4652                                      _MM_SHUFFLE(3, 1, 3, 1))});
4661template <typename T>
4670template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4674  const Vec128<uint16_t> mask = Set(dw, 0x00FF);
4675  const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
4676  const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
4677  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4681template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4685  alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
4693template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4697  alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
4705template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4709  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
4710  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
4711  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
4712  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4716template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4720  alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
4728template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4732      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4733                                      _MM_SHUFFLE(2, 0, 2, 0))});
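// [Editorial example, not in the original header] ConcatOdd/ConcatEven keep
// every other lane from two vectors; for 32-bit lanes the shuffles above use
// _MM_SHUFFLE(3, 1, 3, 1) for odd and _MM_SHUFFLE(2, 0, 2, 0) for even
// lanes. Hedged usage sketch (alias hn = hwy::HWY_NAMESPACE assumed):
//
//   const hn::Full128<uint32_t> d;
//   const auto lo = hn::Iota(d, 0);               // 0,1,2,3
//   const auto hi = hn::Iota(d, 4);               // 4,5,6,7
//   const auto odd  = hn::ConcatOdd(d, hi, lo);   // 1,3,5,7
//   const auto even = hn::ConcatEven(d, hi, lo);  // 0,2,4,6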
4741template <typename T>
4749template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4751  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4756      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4759template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4766template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4768  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4773      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4776template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4783template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4787  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4788                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4792template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4794#if HWY_TARGET == HWY_SSSE3
4797  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4798                                            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4801  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4805template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4807#if HWY_TARGET == HWY_SSSE3
4808  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4809  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4810  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4820template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4826#if HWY_TARGET == HWY_SSSE3
4828      d, Vec128<double, N>{_mm_shuffle_pd(
4838HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
4839#if HWY_TARGET == HWY_SSSE3
4842  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4843  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4844  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4846  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
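// [Editorial example, not in the original header] OddEven(a, b) takes
// odd-indexed lanes from a and even-indexed lanes from b; on SSE4+ a single
// blend suffices (immediate 0x55/5 selects lanes 0, 2, ... from b). Sketch:
//
//   const hn::Full128<int32_t> d;
//   const auto a = hn::Set(d, 1);
//   const auto b = hn::Set(d, 2);
//   const auto r = hn::OddEven(a, b);  // lanes: 2,1,2,1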
4851template <typename T, size_t N>
4858template <typename T, size_t N>
4869#if HWY_TARGET > HWY_AVX3
4873template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4877  const Rebind<float, decltype(dw)> df;
4878  const auto zero = Zero(d);
4881  const auto upper = exp + Set(d, 0x3F80);
4883  const auto f0 = ZipLower(dw, zero, upper);
4884  const auto f1 = ZipUpper(dw, zero, upper);
4886  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
4887  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
4888  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
4892template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4895  const auto exp = ShiftLeft<23>(v);
4896  const auto f = exp + Set(d, 0x3F800000);
4900  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
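// [Editorial note, not in the original header] Why adding 0x3F800000 works:
// a binary32 with biased exponent B and zero mantissa encodes 2^(B - 127),
// and 0x3F800000 is 1.0f (B = 127). Placing an integer e in the exponent
// field via ShiftLeft<23> and adding 1.0f's bits therefore yields the
// encoding of 2^e, which _mm_cvtps_epi32 converts back to an integer.
// Worked example for e = 3: (3 << 23) + 0x3F800000 = 0x41000000 = 8.0f.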
4909#if HWY_TARGET <= HWY_AVX3
4923#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4935                                   const Vec128<uint64_t> bits) {
4936#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4938  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
4939  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4940  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
4943  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
4947                                 const Vec64<uint64_t> bits) {
4948  return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
4952template <typename T, size_t N, HWY_IF_SIGNED(T)>
4969#if HWY_TARGET <= HWY_AVX3
4987#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4995  const auto out20 = ShiftRight<32>(MulEven(in, mul));
5012                                    const Vec128<uint64_t> bits) {
5013#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5015  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
5016  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
5017  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
5020  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
5024                                  const Vec64<uint64_t> bits) {
5025  return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
5028#if HWY_TARGET > HWY_AVX3
5032template <class DI, class V>
5033HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
5034  const RebindToUnsigned<DI> du;
5035  const auto count = BitCast(du, count_i);
5039  const auto abs = BitCast(du, v ^ sign);
5040  return BitCast(di, abs >> count) ^ sign;
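// [Editorial example, not in the original header] Arithmetic shift built from
// a logical shift: XORing with the broadcast sign (all-ones for negative
// lanes) maps v to its one's complement, whose logical shift equals the
// arithmetic shift of v with bits inverted; XORing again restores them.
// Scalar sketch of the same identity:
//
//   int32_t SignedShrScalar(int32_t v, int count) {
//     const uint32_t sign = v < 0 ? ~0u : 0u;            // BroadcastSignBit
//     const uint32_t abs = static_cast<uint32_t>(v) ^ sign;
//     return static_cast<int32_t>((abs >> count) ^ sign);
//   }
//   // e.g. SignedShrScalar(-5, 1) == -3, matching -5 >> 1.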
5049#if HWY_TARGET <= HWY_AVX3
5063#if HWY_TARGET <= HWY_AVX3
5077#if HWY_TARGET <= HWY_AVX3
5087                                   const Vec128<uint64_t> b) {
5088  alignas(16) uint64_t mul[2];
5090  return Load(Full128<uint64_t>(), mul);
5094                                  const Vec128<uint64_t> b) {
5095  alignas(16) uint64_t mul[2];
5096  const Half<Full128<uint64_t>> d2;
5099  return Load(Full128<uint64_t>(), mul);
5106    Vec128<bfloat16_t, 2 * N> a,
5107    Vec128<bfloat16_t, 2 * N> b,
5108    const Vec128<float, N> sum0,
5109    Vec128<float, N>& sum1) {
5113  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
5116  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
5117  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
5118  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
5119  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
5131                                         const Vec128<uint8_t, N> v) {
5132#if HWY_TARGET == HWY_SSSE3
5133  const __m128i zero = _mm_setzero_si128();
5134  return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
5136  return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
5141                                         const Vec128<uint16_t, N> v) {
5142#if HWY_TARGET == HWY_SSSE3
5143  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
5145  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
5150                                         const Vec128<uint32_t, N> v) {
5151#if HWY_TARGET == HWY_SSSE3
5152  return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
5154  return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
5159                                         const Vec128<uint8_t, N> v) {
5160#if HWY_TARGET == HWY_SSSE3
5161  const __m128i zero = _mm_setzero_si128();
5162  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
5163  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
5165  return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
5172                                        const Vec128<uint8_t, N> v) {
5177                                        const Vec128<uint16_t, N> v) {
5182                                        const Vec128<uint8_t, N> v) {
5189                                        const Vec128<int8_t, N> v) {
5190#if HWY_TARGET == HWY_SSSE3
5191  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
5193  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
5198                                        const Vec128<int16_t, N> v) {
5199#if HWY_TARGET == HWY_SSSE3
5200  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
5202  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
5207                                        const Vec128<int32_t, N> v) {
5208#if HWY_TARGET == HWY_SSSE3
5209  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
5211  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
5216                                        const Vec128<int8_t, N> v) {
5217#if HWY_TARGET == HWY_SSSE3
5218  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
5219  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
5220  return ShiftRight<24>(Vec128<int32_t, N>{x4});
5222  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
5228#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
5229#define HWY_INLINE_F16 HWY_NOINLINE
5231#define HWY_INLINE_F16 HWY_INLINE
5236#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5241  const auto sign = ShiftRight<15>(bits16);
5242  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
5243  const auto mantissa = bits16 & Set(du32, 0x3FF);
5244  const auto subnormal =
5245      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
5246                        Set(df32, 1.0f / 16384 / 1024));
5248  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
5249  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
5250  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
5251  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
5252  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
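// [Editorial example, not in the original header] Scalar reference for the
// bit manipulation above (hedged sketch; like the vector code, Inf/NaN
// inputs are not given special treatment):
//
//   float F16ToF32Scalar(uint16_t bits16) {  // 1 sign, 5 exp, 10 mantissa
//     const uint32_t sign = bits16 >> 15;
//     const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
//     const uint32_t mantissa = bits16 & 0x3FF;
//     uint32_t bits32;
//     if (biased_exp == 0) {  // zero/subnormal: mantissa * 2^-24
//       const float sub = mantissa * (1.0f / 16384 / 1024);
//       memcpy(&bits32, &sub, 4);
//     } else {  // normal: rebias exponent (127 - 15), widen mantissa 13 bits
//       bits32 = ((biased_exp + 127 - 15) << 23) | (mantissa << 13);
//     }
//     bits32 |= sign << 31;
//     float out;
//     memcpy(&out, &bits32, 4);
//     return out;
//   }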
5261                                       const Vec128<bfloat16_t, N> v) {
5262  const Rebind<uint16_t, decltype(df32)> du16;
5275                                      const Vec128<int32_t, N> v) {
5276  return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
5283                                        const Vec128<int32_t, N> v) {
5284#if HWY_TARGET == HWY_SSSE3
5285  const Simd<int32_t, N, 0> di32;
5286  const Simd<uint16_t, N * 2, 0> du16;
5287  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
5289  const auto clamped = Or(zero_if_neg, too_big);
5291  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
5292      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
5293  const auto lo2 = Load(du16, kLower2Bytes);
5296  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
5302                                       const Vec128<int32_t, N> v) {
5303  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
5308                                       const Vec128<int32_t, N> v) {
5309  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5310  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
5315                                       const Vec128<int16_t, N> v) {
5316  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
5321                                      const Vec128<int32_t, N> v) {
5322  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5323  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
5328                                      const Vec128<int16_t, N> v) {
5329  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
5339                                        const Vec128<float, N> v) {
5340#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5342  const Rebind<uint32_t, decltype(df16)> du;
5344  const auto bits32 = BitCast(du, v);
5345  const auto sign = ShiftRight<31>(bits32);
5346  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
5347  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
5349  const auto k15 = Set(di, 15);
5350  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
5351  const auto is_tiny = exp < Set(di, -24);
5353  const auto is_subnormal = exp < Set(di, -14);
5354  const auto biased_exp16 =
5355      BitCast(du, IfThenZeroElse(is_subnormal, exp + Set(di, 15)));
5356  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
5357  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
5358                     (mantissa32 >> (Set(du, 13) + sub_exp));
5359  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
5360                                     ShiftRight<13>(mantissa32));
5362  const auto sign16 = ShiftLeft<15>(sign);
5363  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
5368  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
5376                                           const Vec128<float, N> v) {
5378  const Rebind<int32_t, decltype(dbf16)> di32;
5379  const Rebind<uint32_t, decltype(dbf16)> du32;
5380  const Rebind<uint16_t, decltype(dbf16)> du16;
5381  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
5387    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
5390  const Repartition<uint32_t, decltype(dbf16)> du32;
5391  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
5397                                      const Vec128<double, N> v) {
5398  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
5407    -> decltype(Zero(d)) {
5410  return Min(v, Set(d, 2147483647.0));
5416template <class DI, class DF = RebindToFloat<DI>>
5418                                       decltype(Zero(di).raw) converted_raw)
5425  const auto converted = VFromD<DI>{converted_raw};
5426  const auto sign_wrong = AndNot(BitCast(di, original), converted);
5427#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
5443                                      const Vec128<double, N> v) {
5445  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
5451  const Simd<uint32_t, N, 0> d32;
5452  const Simd<uint8_t, N * 4, 0> d8;
5453  alignas(16) static constexpr uint32_t k8From32[4] = {
5454      0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
5464                                      const Vec128<int32_t, N> v) {
5465  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
5471#if HWY_TARGET <= HWY_AVX3
5480  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
5481  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
5484  const auto k52 = Set(d32, 0x43300000);
5487  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
5488  return (v_upper - k84_63_52) + v_lower;
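// [Editorial note, not in the original header] The magic constants implement
// u64 -> f64 without AVX3. 0x4530000080000000 viewed as a double is
// 2^84 + 2^63, so folding the upper 32 input bits into its mantissa yields
// 2^84 + 2^63 + hi32 * 2^32; likewise 0x43300000 in the upper dword anchors
// the lower 32 bits at 2^52 + lo32. Subtracting 2^84 + 2^63 + 2^52
// (0x4530000080100000) cancels the anchors exactly:
//   (v_upper - k84_63_52) + v_lower = hi32 * 2^32 + lo32 = v.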
5495                                      const Vec128<float, N> v) {
5501#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
5503#elif HWY_ARCH_X86_64
5504  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
5506  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
5509  using VI = VFromD<decltype(di)>;
5510  const VI k0 = Zero(di);
5511  const VI k1 = Set(di, 1);
5512  const VI k51 = Set(di, 51);
5515  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
5516  const VI exp = biased_exp - Set(di, 0x3FF);
5517  const auto in_range = exp < Set(di, 63);
5525  const VI shift_mnt = Max(k51 - exp, k0);
5526  const VI shift_int = Max(exp - k51, k0);
5527  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
5529  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
5531  const VI shifted = int52 << shift_int;
5533  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
5537  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
5538  const VI magnitude = IfThenElse(in_range, restored, limit);
5541  return (magnitude ^ sign_mask) - sign_mask;
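// [Editorial example, not in the original header] Hedged scalar sketch of the
// mantissa-shifting path above (truncation toward zero; the saturation
// behavior here is simplified relative to the vector code):
//
//   int64_t F64ToI64Scalar(double x) {
//     uint64_t bits;
//     memcpy(&bits, &x, 8);
//     const int64_t exp = static_cast<int64_t>((bits >> 52) & 0x7FF) - 0x3FF;
//     if (exp < 0) return 0;  // |x| < 1 truncates to 0
//     const uint64_t mnt = (bits & ((1ull << 52) - 1)) | (1ull << 52);
//     const uint64_t mag = exp <= 52  ? mnt >> (52 - exp)
//                          : exp < 63 ? mnt << (exp - 52)
//                                     : ~0ull >> 1;  // saturate
//     const int64_t sign = static_cast<int64_t>(bits) >> 63;  // 0 or -1
//     return (static_cast<int64_t>(mag) ^ sign) - sign;  // conditional negate
//   }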
5546#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
5558  const Simd<int32_t, N, 0> di;
5564#if HWY_TARGET == HWY_SSSE3
5567template <typename T, size_t N, HWY_IF_FLOAT(T)>
5573  const auto max = Set(df, MantissaEnd<T>());
5575  const auto added = large + v;
5576  const auto rounded = added - large;
5586template <typename T, size_t N, HWY_IF_FLOAT(T)>
5594template <typename T, size_t N, HWY_IF_FLOAT(T)>
5600  const auto int_f = ConvertTo(df, integer);
5606template <typename T, size_t N, HWY_IF_FLOAT(T)>
5612  const auto int_f = ConvertTo(df, integer);
5621template <typename T, size_t N, HWY_IF_FLOAT(T)>
5627  const auto int_f = ConvertTo(df, integer);
5640  return Vec128<float, N>{
5641      _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5645  return Vec128<double, N>{
5646      _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5652  return Vec128<float, N>{
5653      _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5657  return Vec128<double, N>{
5658      _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5663HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5664  return Vec128<float, N>{
5665      _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5668HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
5669  return Vec128<double, N>{
5670      _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5676  return Vec128<float, N>{
5677      _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5681  return Vec128<double, N>{
5682      _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5691#if HWY_TARGET <= HWY_AVX3
5699#if HWY_TARGET <= HWY_AVX3
5706#if HWY_TARGET <= HWY_AVX3
5731template <typename T, size_t N, HWY_IF_FLOAT(T)>
5733  const Simd<T, N, 0> d;
5741template <typename T, size_t N, HWY_IF_FLOAT(T)>
5743  const Simd<T, N, 0> d;
5751  const VFromD<decltype(di)> exp =
5760#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5763#ifdef HWY_NATIVE_AES
5764#undef HWY_NATIVE_AES
5766#define HWY_NATIVE_AES
5770                                  Vec128<uint8_t> round_key) {
5771  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
5775                                      Vec128<uint8_t> round_key) {
5776  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
5779template <size_t N, HWY_IF_LE128(uint64_t, N)>
5781                                          Vec128<uint64_t, N> b) {
5782  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
5785template <size_t N, HWY_IF_LE128(uint64_t, N)>
5787                                          Vec128<uint64_t, N> b) {
5788  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
5795template <typename T>
5796struct CompressIsPartition {
5797#if HWY_TARGET <= HWY_AVX3
5803  enum { value = (sizeof(T) == 8) };
5809#if HWY_TARGET <= HWY_AVX3
5814template <typename T, size_t N, HWY_IF_LE128(T, N)>
5817  uint64_t mask_bits = 0;
5818  constexpr size_t kNumBytes = (N + 7) / 8;
5819  CopyBytes<kNumBytes>(bits, &mask_bits);
5821    mask_bits &= (1ull << N) - 1;
5830template <typename T, size_t N>
5832                              const Mask128<T, N> mask, uint8_t* bits) {
5833  constexpr size_t kNumBytes = (N + 7) / 8;
5834  CopyBytes<kNumBytes>(&mask.raw, bits);
5838    const int mask = (1 << N) - 1;
5839    bits[0] = static_cast<uint8_t>(bits[0] & mask);
5849template <typename T, size_t N>
5851                          const Mask128<T, N> mask) {
5852  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5856template <typename T, size_t N>
5858                                    const Mask128<T, N> mask) {
5859  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
5863template <typename T, size_t N>
5864HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5865  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5866  return mask_bits == 0;
5869template <typename T, size_t N>
5870HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5871  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5873  return mask_bits == (1u << N) - 1;
5878#if HWY_TARGET != HWY_AVX3_DL
5882HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
5883  Full128<uint16_t> du16;
5887  Rebind<uint8_t, decltype(du16)> du8;
5888  alignas(16) constexpr uint8_t tbl[2048] = {
5889 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
5890 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
5891 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
5892 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
5893 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
5894 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
5895 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
5896 0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
5897 0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
5898 3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
5899 2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
5900 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
5901 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
5902 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
5903 0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
5904 0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
5905 1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
5906 2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
5907 5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
5908 4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
5909 5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
5910 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
5911 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
5912 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
5913 0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
5914 2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
5915 6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
5916 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
5917 6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
5918 0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
5919 0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
5920 0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
5921 2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
5922 1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
5923 5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
5924 5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
5925 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
5926 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
5927 0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
5928 0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
5929 0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
5930 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
5931 7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
5932 0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
5933 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
5934 0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
5935 0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
5936 0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
5937 1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
5938 3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
5939 4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
5940 3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
5941 0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
5942 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
5943 0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
5944 0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
5945 0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
5946 4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
5947 4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
5948 7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
5949 5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
5950 7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
5951 0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
5952 0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
5953 3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
5954 1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
5955 3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
5956 7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
5957 0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
5958 7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
5959 0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
5960 0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
5961 0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
5962 5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
5963 2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
5964 6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
5965 6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
5966 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
5967 0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
5968 0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
5969 1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
5970 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
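// [Editorial note, not in the original header] Layout of tbl: 256 rows of
// 8 bytes, indexed by the 8-bit compress mask. Row m lists, in ascending
// order, the lane indices whose mask bit is set, padded with zeros; e.g.
// for m = 0b00100110 the row begins 1, 2, 5. IndicesForCompress16 widens
// the row to u16 lane indices for the _mm_permutexvar_epi16 fallback.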
5978template <typename T>
5983template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5985  const Simd<T, N, 0> d;
5986  const Rebind<uint16_t, decltype(d)> du;
5989#if HWY_TARGET == HWY_AVX3_DL
5990  const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
5992  const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
5993  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
5998template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6000  return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
6003template <size_t N, HWY_IF_GE64(float, N)>
6008template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6013  alignas(16) constexpr uint8_t u8_indices[64] = {
6014      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6015      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6016      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6017      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6021  const auto index = Load(d8, u8_indices + 16 * mask.raw);
6028template <typename T>
6033template <typename T, size_t N>
6040                                 Mask128<uint64_t> /* mask */) {
6046template <typename T, size_t N>
6054template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6057  const Rebind<uint16_t, decltype(d)> du;
6060  const uint64_t mask_bits{mask.raw};
6062#if HWY_TARGET == HWY_AVX3_DL
6063  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
6065  const auto idx = detail::IndicesForCompress16(mask_bits);
6070  const size_t count = PopCount(mask_bits & ((1ull << N) - 1));
6073  __msan_unpoison(unaligned, count * sizeof(T));
6078template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6082  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6083  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6086  __msan_unpoison(unaligned, count * sizeof(T));
6091template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6095  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6096  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6099  __msan_unpoison(unaligned, count * sizeof(T));
6104template <size_t N, HWY_IF_LE128(float, N)>
6108  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6109  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6112  __msan_unpoison(unaligned, count * sizeof(float));
6117template <size_t N, HWY_IF_LE128(double, N)>
6121  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6122  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6125  __msan_unpoison(unaligned, count * sizeof(double));
6131template <typename T, size_t N>
6139  if (N != 16 / sizeof(T)) {
6145  const Vec128<T, N> compressed = Compress(v, m);
6146#if HWY_MEM_OPS_MIGHT_FAULT
6149  alignas(16) T buf[N];
6150  Store(compressed, d, buf);
6151  memcpy(unaligned, buf, count * sizeof(T));
6158  __msan_unpoison(unaligned, count * sizeof(T));
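// [Editorial example, not in the original header] Stream compaction: keep
// lanes whose mask is true, packed to the front, writing CountTrue lanes.
//
//   const hn::Full128<float> d;
//   const auto v = hn::Iota(d, 0.0f);              // 0,1,2,3
//   const auto m = hn::Lt(v, hn::Set(d, 2.0f));    // T,T,F,F
//   float out[4];
//   const size_t n = hn::CompressStore(v, m, d, out);  // out = {0,1}, n = 2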
6166template <typename T, size_t N>
6179template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
6184  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
6187  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
6188                                             1, 1, 1, 1, 1, 1, 1, 1};
6191  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
6192                                            1, 2, 4, 8, 16, 32, 64, 128};
6196template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6199  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
6200  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
6204template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6207  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
6208  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
6212template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6215  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
6222template <typename T, size_t N, HWY_IF_LE128(T, N)>
6225  uint64_t mask_bits = 0;
6226  constexpr size_t kNumBytes = (N + 7) / 8;
6227  CopyBytes<kNumBytes>(bits, &mask_bits);
6229    mask_bits &= (1ull << N) - 1;
6239constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
6240  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
6243template <typename T, size_t N>
6245                                 const Mask128<T, N> mask) {
6246  const Simd<T, N, 0> d;
6248  return U64FromInt(_mm_movemask_epi8(sign_bits));
6251template <typename T, size_t N>
6253                                 const Mask128<T, N> mask) {
6255  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
6256  return U64FromInt(_mm_movemask_epi8(sign_bits));
6259template <typename T, size_t N>
6261                                 const Mask128<T, N> mask) {
6262  const Simd<T, N, 0> d;
6263  const Simd<float, N, 0> df;
6265  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
6268template <typename T, size_t N>
6270                                 const Mask128<T, N> mask) {
6271  const Simd<T, N, 0> d;
6272  const Simd<double, N, 0> df;
6274  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
6278template <typename T, size_t N>
6279constexpr uint64_t OnlyActive(uint64_t mask_bits) {
6280  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
6283template <typename T, size_t N>
6291template <typename T, size_t N>
6293                            const Mask128<T, N> mask, uint8_t* bits) {
6294  constexpr size_t kNumBytes = (N + 7) / 8;
6296  CopyBytes<kNumBytes>(&mask_bits, bits);
6302template <typename T, size_t N>
6303HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6308template <typename T, size_t N>
6309HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6310  constexpr uint64_t kAllBits =
6311      detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
6315template <typename T, size_t N>
6317                          const Mask128<T, N> mask) {
6321template <typename T, size_t N>
6323                                    const Mask128<T, N> mask) {
6333template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6334HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6336  const Rebind<uint8_t, decltype(d)> d8;
6337  const Simd<uint16_t, N, 0> du;
6347  alignas(16) constexpr uint8_t table[2048] = {
6349 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6350 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6351 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
6352 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6353 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
6354 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
6355 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
6356 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6357 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
6358 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
6359 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
6360 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
6361 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
6362 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
6363 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
6364 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6365 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
6366 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
6367 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
6368 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
6369 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
6370 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
6371 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
6372 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
6373 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
6374 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
6375 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
6376 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
6377 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
6378 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
6379 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
6380 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6381 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
6382 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
6383 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
6384 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
6385 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
6386 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
6387 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
6388 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
6389 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
6390 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
6391 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
6392 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
6393 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
6394 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
6395 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
6396 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
6397 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
6398 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
6399 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
6400 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
6401 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
6402 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
6403 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
6404 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
6405 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
6406 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
6407 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
6408 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
6409 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
6410 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
6411 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
6412 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
6413 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
6414 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
6415 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
6416 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
6417 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
6418 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
6419 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
6420 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
6421 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
6422 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
6423 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
6424 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
6425 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
6426 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
6427 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
6428 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
6429 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
6430 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
6431 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
6432 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
6433 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
6434 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
6435 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
6436 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
6437 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
6438 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
6439 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
6440 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
6441 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
6442 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
6443 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
6444 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
6445 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
6446 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
6447 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
6448 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
6449 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
6450 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
6451 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
6452 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
6453 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
6454 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
6455 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
6456 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
6457 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
6458 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
6459 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
6460 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
6461 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
6462 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
6463 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
6464 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
6465 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
6466 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
6467 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
6468 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
6469 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
6470 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
6471 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
6472 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
6473 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
6474 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
6475 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
6476 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
6478  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6479  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6483template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6484HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6485                                           uint64_t mask_bits) {
6487  const Rebind<uint8_t, decltype(d)> d8;
6488  const Simd<uint16_t, N, 0> du;
6498  alignas(16) constexpr uint8_t table[2048] = {
6500 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
6501 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
6502 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
6503 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
6504 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
6505 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
6506 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
6507 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
6508 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
6509 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
6510 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
6511 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
6512 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
6513 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
6514 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
6515 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
6516 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
6517 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
6518 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
6519 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
6520 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
6521 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
6522 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
6523 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
6524 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
6525 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
6526 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
6527 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
6528 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
6529 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
6530 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
6531 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
6532 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
6533 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
6534 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
6535 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
6536 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
6537 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
6538 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
6539 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
6540 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
6541 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
6542 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
6543 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
6544 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
6545 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
6546 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
6547 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
6548 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
6549 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
6550 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
6551 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
6552 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
6553 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
6554 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
6555 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
6556 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
6557 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
6558 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
6559 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
6560 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
6561 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
6562 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
6563 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
6564 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
6565 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
6566 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
6567 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
6568 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
6569 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
6570 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
6571 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
6572 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
6573 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
6574 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
6575 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
6576 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
6577 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
6578 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
6579 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
6580 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
6581 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
6582 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
6583 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
6584 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
6585 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
6586 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
6587 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
6588 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
6589 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
6590 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
6591 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
6592 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
6593 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
6594 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
6595 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
6596 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
6597 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
6598 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
6599 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
6600 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
6601 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
6602 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
6603 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
6604 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
6605 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
6606 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
6607 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
6608 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
6609 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
6610 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
6611 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
6612 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
6613 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
6614 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
6615 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
6616 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
6617 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
6618 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
6619 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
6620 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
6621 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
6622 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
6623 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
6624 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
6625 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
6626 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
6627 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6629  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6630  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6634template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6635HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6639  alignas(16) constexpr uint8_t u8_indices[256] = {
6641 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6642 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6643 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
6644 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6645 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
6646 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
6647 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
6648 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6649 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6650 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
6651 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
6652 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6653 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6654 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
6655 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
6656 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6659  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6662template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6663HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6664                                           uint64_t mask_bits) {
6668  alignas(16) constexpr uint8_t u8_indices[256] = {
6670 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6671 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6672 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6673 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6674 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6675 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6676 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6677 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6678 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6679 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6680 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6681 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6682 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6683 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6687  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6690template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6691HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6695  alignas(16) constexpr uint8_t u8_indices[64] = {
6697      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6698      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6699      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6700      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6703  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6706template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6707HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6708                                           uint64_t mask_bits) {
6712  alignas(16) constexpr uint8_t u8_indices[64] = {
6714      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6715      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6716      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6717      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6720  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6723template <typename T, size_t N>
6725  const Simd<T, N, 0> d;
6729  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6733template <typename T, size_t N>
6734HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
6735  const Simd<T, N, 0> d;
6739  const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
6746template <typename T>
6752template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6757  const Vec128<T> maskL = DupEven(m);
6758  const Vec128<T> maskH = DupOdd(m);
6759  const Vec128<T> swap = AndNot(maskL, maskH);
6764template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
6770template <typename T>
6776template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6781  const Vec128<T> maskL = DupEven(m);
6782  const Vec128<T> maskH = DupOdd(m);
6783  const Vec128<T> swap = AndNot(maskH, maskL);
6788template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
6792  if (N < 16 / sizeof(T)) {
6800                                       Mask128<uint64_t> /* mask */) {
6804template <typename T, size_t N>
6807  uint64_t mask_bits = 0;
6808  constexpr size_t kNumBytes = (N + 7) / 8;
6809  CopyBytes<kNumBytes>(bits, &mask_bits);
6811    mask_bits &= (1ull << N) - 1;
6819template <typename T, size_t N>
6826  const size_t count = PopCount(mask_bits);
6829  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6831  StoreU(compressed, d, unaligned);
6834  __msan_unpoison(unaligned, count * sizeof(T));
6840template <typename T, size_t N>
6848  const size_t count = PopCount(mask_bits);
6851  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6856  __msan_unpoison(unaligned, count * sizeof(T));
6861template <typename T, size_t N>
6867  uint64_t mask_bits = 0;
6868  constexpr size_t kNumBytes = (N + 7) / 8;
6869  CopyBytes<kNumBytes>(bits, &mask_bits);
6871    mask_bits &= (1ull << N) - 1;
6873  const size_t count = PopCount(mask_bits);
6876  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6878  StoreU(compressed, d, unaligned);
6882  __msan_unpoison(unaligned, count * sizeof(T));
6899template <
typename T>
6901 const Vec128<T, 1>
v) {
6904template <
typename T>
6906 const Vec128<T, 1>
v) {
6909template <
typename T>
6911 const Vec128<T, 1>
v) {
// u32/i32/f32: N=2.
template <typename T>
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
}
template <typename T>
HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
template <typename T>
HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                   const Vec128<T, 2> v10) {
  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
// u32/i32/f32: N=4 (full vector).
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = v3210 + v1032;
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T> v3210) {
  const Vec128<T> v1032 = Shuffle1032(v3210);
  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}
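// Editor's note (illustration): with lanes (v3, v2, v1, v0), Shuffle1032
// swaps the 64-bit halves, so the first add yields (v1+v3, v0+v2, v3+v1,
// v2+v0). Rotating by one lane via Shuffle0321 and adding again places
// v0+v1+v2+v3 in every lane, which is the all-lanes broadcast contract of
// these reductions.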
// u64/i64/f64: N=2 (full vector).
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T> v10) {
  const Vec128<T> v01 = Shuffle01(v10);
  return Max(v10, v01);
}
// 16-bit lanes: split into even/odd 16-bit halves of i32 lanes and reuse the
// 32-bit reductions.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
  const Repartition<int32_t, Simd<T, N, 0>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
  // Also broadcast into odd lanes.
  return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
  const Repartition<int32_t, Simd<T, N, 0>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
  // Also broadcast into odd lanes.
  return BitCast(Simd<T, N, 0>(), Or(max, ShiftLeft<16>(max)));
}
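// Editor's note (illustration): And(..., 0xFFFF) keeps each even 16-bit lane
// in the low half of its i32; ShiftRight<16> brings each odd 16-bit lane
// down. Reducing the two i32 vectors therefore covers all 16-bit lanes, and
// Or(x, ShiftLeft<16>(x)) replicates the result into both halves of every
// i32, i.e. into every 16-bit lane.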
}  // namespace detail

// Returns the reduction result broadcast to all lanes.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */,
                                const Vec128<T, N> v) {
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */,
                                const Vec128<T, N> v) {
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */,
                                const Vec128<T, N> v) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
// ------------------------------ Lt128

namespace detail {

// Returns the vector-mask for Lt128; also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "Use u64");
  // A 128-bit pair (hi, lo) satisfies a < b iff aH < bH, or aH == bH and
  // aL < bL; select the lo-lane comparison only where the hi lanes are
  // equal.
  const auto eqHL = Eq(a, b);
  const V ltHL = VecFromMask(d, Lt(a, b));
  const V ltLX = ShiftLeftLanes<1>(ltHL);
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
  return InterleaveUpper(d, vecHx, vecHx);
}
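// Editor's note (illustration): lane 1 holds the upper u64. For
// a = (hi 1, lo 5) and b = (hi 1, lo 9): eqHL = (true, false) and
// ltHL = (false, true). ShiftLeftLanes<1> moves the lo-lane result into
// lane 1, IfThenElse picks it there because the hi lanes are equal, and
// InterleaveUpper broadcasts lane 1 to both lanes, yielding an all-true
// mask: the 128-bit value a is indeed less than b.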
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
  // Only the upper u64 lane of each 128-bit pair participates.
  const V ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

}  // namespace detail

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

// Using the vector form directly avoids an extra MaskFromVec.
template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}
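// Editor's usage sketch (not part of this header): the named wrappers let
// generic code pass a comparison where operator syntax is awkward, e.g. as a
// template parameter. Assumes the public API in hwy/highway.h; `p` is an
// illustrative int32_t pointer with at least one full vector of data.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<int32_t> d;
//   const auto v = hn::LoadU(d, p);
//   const size_t num_equal = hn::CountTrue(d, hn::Eq(v, hn::Set(d, 42)));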