#include <immintrin.h>

#if defined(_MSC_VER) && defined(__clang__)
// When compiling as clang-cl, immintrin.h does not pull in these headers.
#include <avxintrin.h>
#include <avx2intrin.h>
#include <bmi2intrin.h>
#include <f16cintrin.h>
#include <fmaintrin.h>
#include <smmintrin.h>
#endif
// Compound assignment operators of Vec256<T>, defined in terms of the
// corresponding binary operators:
HWY_INLINE Vec256& operator*=(const Vec256 other) { return *this = (*this * other); }
HWY_INLINE Vec256& operator/=(const Vec256 other) { return *this = (*this / other); }
HWY_INLINE Vec256& operator+=(const Vec256 other) { return *this = (*this + other); }
HWY_INLINE Vec256& operator-=(const Vec256 other) { return *this = (*this - other); }
HWY_INLINE Vec256& operator&=(const Vec256 other) { return *this = (*this & other); }
HWY_INLINE Vec256& operator|=(const Vec256 other) { return *this = (*this | other); }
HWY_INLINE Vec256& operator^=(const Vec256 other) { return *this = (*this ^ other); }
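// Example (illustrative sketch, not part of the original header): the
// compound operators let generic code mutate vectors in place. Assumes this
// is compiled inside HWY_NAMESPACE on an AVX2-capable target.
HWY_INLINE Vec256<float> MulAddInPlace(Vec256<float> acc, Vec256<float> a,
                                       Vec256<float> x) {
  acc += a * x;  // expands to acc = acc + (a * x)
  return acc;
}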
#if HWY_TARGET <= HWY_AVX3
template <size_t size>  // detail::RawMask256: selects the __mmaskN type.

// detail::BitCastToInteger overloads, e.g. for double lanes:
template <typename T>
return _mm256_castpd_si256(v);

// detail::BitCastFromByte and the public BitCast glue:
template <typename T>
template <typename T, typename FromT>

// Zero: all-zero vector.
template <typename T>
return Vec256<T>{_mm256_setzero_si256()};

// Set: broadcast a 64-bit lane (unsigned and signed overloads):
_mm256_set1_epi64x(static_cast<long long>(t))};
_mm256_set1_epi64x(static_cast<long long>(t))};

// Undefined: contents unspecified; callers must assign before use.
template <typename T>
return Vec256<T>{_mm256_undefined_si256()};
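// Example (illustrative sketch, not part of the original header): Zero, Set
// and Undefined are the basic constructors. Assumes HWY_NAMESPACE scope.
HWY_INLINE void InitExample() {
  const Full256<int32_t> d;
  const auto zero = Zero(d);      // eight 0 lanes
  const auto sevens = Set(d, 7);  // broadcast 7 to all lanes
  auto scratch = Undefined(d);    // unspecified; assign before reading
  scratch = sevens + zero;
  alignas(32) int32_t out[8];
  Store(scratch, d, out);  // out[i] == 7 for all i
}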
// Logical operations (And, AndNot, Or, Xor), one overload per lane type:
template <typename T>

// Not: on AVX3, a single ternary-logic instruction computes ~v.
#if HWY_TARGET <= HWY_AVX3
Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
#endif

// PopulationCount: AVX3_DL provides native byte/word popcount.
#if HWY_TARGET == HWY_AVX3_DL
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#endif
#define HWY_NATIVE_POPCNT
// detail::PopulationCount overloads per lane size (1/2/4/8 bytes):
template <typename T>

// CopySign: combine the magnitude of one input with the sign of another.
template <typename T>
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
#if HWY_TARGET <= HWY_AVX3
// AVX3: one ternary-logic op combines magnitude and sign:
const __m256i out = _mm256_ternarylogic_epi32(/* ... */
// CopySignToAbs:
template <typename T>
#if HWY_TARGET <= HWY_AVX3

// FirstN and the mask-based IfThenElse / IfThenElseZero / IfThenZeroElse
// overloads, one per lane size (AVX3 blends directly via mask registers):
#if HWY_TARGET <= HWY_AVX3
template <typename T>

// ZeroIfNegative (floating-point only):
template <typename T, HWY_IF_FLOAT(T)>
template <typename T>
// detail::And / AndNot / Or / Xor on Mask256: when the compiler provides
// AVX-512 mask intrinsics (HWY_COMPILER_HAS_MASK_INTRINSICS), these use
// _kand/_kandn/_kor/_kxor for the 32-, 16- and 8-bit mask widths; otherwise
// they operate on mask.raw as an integer. One overload per lane size:
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
// AVX2 mask path (masks are full vectors whose true lanes are all-ones):
template <typename T>
constexpr size_t N = 32 / sizeof(T);

// MaskFromVec / VecFromMask are free: mask and vector share a register.
template <typename T>
return Mask256<T>{v.raw};
template <typename T>
return Vec256<T>{v.raw};
template <typename T>
return Vec256<T>{v.raw};
// IfThenElse: per-lane select via blendv.
template <typename T>
HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
                             const Vec256<T> no) {
  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
}
HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
                                 const Vec256<float> yes,
                                 const Vec256<float> no) {
  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
}
HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
                                  const Vec256<double> yes,
                                  const Vec256<double> no) {
  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
}
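// Example (illustrative sketch, not part of the original header): IfThenElse
// selects per lane, e.g. clamping negative lanes to zero (compare
// ZeroIfNegative below). Assumes HWY_NAMESPACE scope.
HWY_INLINE Vec256<float> ClampNegativeToZero(Vec256<float> v) {
  const Full256<float> d;
  return IfThenElse(v < Zero(d), Zero(d), v);
}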
// IfThenElseZero / IfThenZeroElse:
template <typename T>
template <typename T>

// ZeroIfNegative:
template <typename T, HWY_IF_FLOAT(T)>
const auto zero = Zero(Full256<T>());
// Mask logic, forwarding to the detail:: implementations above:
template <typename T>
template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
#if HWY_TARGET <= HWY_AVX3

// RebindMask: masks are lane-size bitfields on AVX3, so only sizes matter.
template <typename TFrom, typename TTo>
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

// Mask/vector conversion helpers:
template <typename T>

// TestBit (integer only):
template <typename T>
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");

// operator== selects the compare instruction by lane size:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};

// operator!= is analogous, via _mm256_cmpneq_epi*_mask:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
// MaskFromVec (AVX3), one overload per lane size:
template <typename T>

// VecFromMask (AVX3): expand mask bits to all-ones lanes via _mm256_movm_epi*:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Vec256<T>{_mm256_movm_epi16(v.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec256<T>{_mm256_movm_epi32(v.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec256<T>{_mm256_movm_epi64(v.raw)};
// AVX2 path:
template <typename T>

// RebindMask: masks are vectors here, so only sizes need to match.
template <typename TFrom, typename TTo>
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

template <typename T>
HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}
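// Example (illustrative sketch, not part of the original header): TestBit
// yields a mask of lanes whose given bit is set, e.g. selecting odd values:
HWY_INLINE Mask256<uint32_t> OddValueMask(Vec256<uint32_t> v) {
  const Full256<uint32_t> d;
  return TestBit(v, Set(d, 1u));  // true where the low bit is set
}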
// operator== (AVX2), by lane size:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};

HWY_API Mask256<float> operator==(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
}
HWY_API Mask256<double> operator==(const Vec256<double> a,
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
}

// operator!=: integers via Not(==); floats compare directly:
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Mask256<float> operator!=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
}
HWY_API Mask256<double> operator!=(const Vec256<double> a,
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
}
// GCC before 9.3 miscompiles _mm256_cmpgt_epi8; use vector extensions there.
#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
#else
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
#endif

#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
using i8x32 = signed char __attribute__((__vector_size__(32)));
return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
                                            reinterpret_cast<i8x32>(b.raw))};
#else
return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
#endif

HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
                                   const Vec256<int16_t> b) {
  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
                                   const Vec256<int32_t> b) {
  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
}
HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
                                   const Vec256<int64_t> b) {
  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
}
HWY_API Mask256<float> operator>(const Vec256<float> a,
                                 const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
}
return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};

// operator>= (floating-point only):
HWY_API Mask256<float> operator>=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
}
HWY_API Mask256<double> operator>=(const Vec256<double> a,
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
}
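// Example (illustrative sketch, not part of the original header): comparisons
// return Mask256, which combines with CountTrue/AllTrue. Assumes
// HWY_NAMESPACE scope.
HWY_INLINE size_t CountGreater(Vec256<float> a, Vec256<float> b) {
  const Full256<float> d;
  return CountTrue(d, a > b);  // number of lanes where a[i] > b[i]
}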
// MaskFromVec / VecFromMask (AVX2):
template <typename T>

// Min / Max for 64-bit lanes: AVX3 has native instructions; the AVX2
// fallback flips the sign bit (msb) so that unsigned values can be
// compared with the signed 64-bit compare:
#if HWY_TARGET <= HWY_AVX3
const auto msb = Set(du, 1ull << 63);
#if HWY_TARGET <= HWY_AVX3
const auto msb = Set(du, 1ull << 63);
#if HWY_TARGET <= HWY_AVX3

// FirstN:
template <typename T>
#if HWY_TARGET <= HWY_AVX3

#if HWY_COMPILER_MSVC
// ------------------------------ ShiftLeft (constant count)
template <int kBits>  // one overload each for u16/u32/u64/i16/i32/i64

// 8-bit lanes have no native shift: shift as 16-bit, then mask off the
// bits that crossed in from the neighboring byte:
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
return kBits == 1
           ? (v + v)
           : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

// ------------------------------ ShiftRight (constant count)
template <int kBits>  // u16/u32/u64 via _mm256_srli_epi*

// u8: shift as 16-bit, then clear the bits shifted in from the neighbor:
return shifted & Set(d8, 0xFF >> kBits);

// i16/i32 arithmetic shifts via _mm256_srai_epi*:
template <int kBits>

// i8 arithmetic shift: logical shift, then sign-extend by XOR/subtract
// with the shifted sign-bit mask:
template <int kBits>
const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
return (shifted ^ shifted_sign) - shifted_sign;

// BroadcastSignBit: arithmetic shift by all value bits.
return ShiftRight<15>(v);  // int16
return ShiftRight<31>(v);  // int32
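// Scalar demonstration (illustrative, not part of the original header) of the
// sign-extension identity used above: after a logical right shift by kBits,
// XOR with s = 0x80 >> kBits flips the shifted sign bit, and subtracting s
// propagates it through the upper bits, matching an arithmetic shift.
// Assumes <cstdint> is available.
inline int8_t ArithmeticShiftRight8(int8_t x, int kBits) {
  const uint8_t logical =
      static_cast<uint8_t>(static_cast<uint8_t>(x) >> kBits);
  const uint8_t s = static_cast<uint8_t>(0x80u >> kBits);
  return static_cast<int8_t>(static_cast<int>(logical ^ s) -
                             static_cast<int>(s));
}
// ArithmeticShiftRight8(-128, 2) == -32, as with a native arithmetic shift.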
// int64 shifts: AVX2 lacks a 64-bit arithmetic shift.
#if HWY_TARGET == HWY_AVX2
template <int kBits>
#if HWY_TARGET <= HWY_AVX3
// AVX2 fallback: logical shift, then OR in the broadcast sign bits:
const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
return right | sign;
#if HWY_TARGET <= HWY_AVX3

// ------------------------------ ShiftLeftSame / ShiftRightSame (runtime
// count). 8-bit lanes again go via 16-bit shifts plus masking:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
#if HWY_TARGET <= HWY_AVX3
return right | sign;
// i8 arithmetic shift by runtime count, same XOR/subtract trick:
const auto shifted_sign =
    BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ Neg
template <typename T, HWY_IF_FLOAT(T)>  // floats: flip the sign bit.
template <typename T, HWY_IF_NOT_FLOAT(T)>
return Zero(Full256<T>()) - v;
// ------------------------------ Fused multiply-add family. When FMA is
// disabled (HWY_DISABLE_BMI2_FMA), fall back to separate multiply and add:
#ifdef HWY_DISABLE_BMI2_FMA
return mul * x + add;       // MulAdd (float and double)
return mul * x + add;
return add - mul * x;       // NegMulAdd
return add - mul * x;
return mul * x - sub;       // MulSub
return mul * x - sub;
return Neg(mul * x) - sub;  // NegMulSub
return Neg(mul * x) - sub;
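// Example (illustrative sketch, not part of the original header): MulAdd
// computes mul * x + add, fused on FMA targets, e.g. one Horner step of a
// polynomial evaluation. Assumes HWY_NAMESPACE scope.
HWY_INLINE Vec256<float> HornerStep(Vec256<float> acc, Vec256<float> x,
                                    float coeff) {
  const Full256<float> d;
  return MulAdd(acc, x, Set(d, coeff));  // acc * x + coeff
}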
// ------------------------------ Rounding, via _mm256_round_* with the
// rounding mode encoded in the immediate:
_mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};  // Round
_mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
_mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};         // Trunc
_mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
_mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};      // Ceil
_mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
_mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};      // Floor
_mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
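// Example (illustrative sketch, not part of the original header): the four
// modes on 2.5: Round(2.5) == 2 (round-to-nearest-even), Trunc(2.5) == 2,
// Ceil(2.5) == 3, Floor(2.5) == 2; note Round(3.5) == 4 (nearest even).
HWY_INLINE Vec256<float> RoundExample() {
  const Full256<float> d;
  return Round(Set(d, 2.5f));  // all lanes become 2.0f
}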
// ------------------------------ Load (32-byte aligned) / LoadU
template <typename T>
return Vec256<T>{_mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};

template <typename T>
return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};

// MaskedLoad (AVX3): lanes whose mask bit is clear become zero.
#if HWY_TARGET <= HWY_AVX3
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec256<T>{_mm256_maskz_load_epi32(m.raw, aligned)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec256<T>{_mm256_maskz_load_epi64(m.raw, aligned)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, aligned)};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, aligned)};
// ------------------------------ LoadDup128: broadcast a 128-bit block into
// both halves. Inline asm avoids a separate load + insert on some compilers.
template <typename T>
#if HWY_LOADDUP_ASM
asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
return Vec256<T>{_mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
#endif

#if HWY_LOADDUP_ASM
asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
return Vec256<float>{_mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
#else
return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
#endif

#if HWY_LOADDUP_ASM
asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
return Vec256<double>{_mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
#else
return Vec256<double>{_mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
#endif
// ------------------------------ Store / StoreU / Stream
template <typename T>
_mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
_mm256_store_ps(aligned, v.raw);
_mm256_store_pd(aligned, v.raw);

template <typename T>
_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
_mm256_storeu_ps(p, v.raw);
_mm256_storeu_pd(p, v.raw);

// Stream: non-temporal store, bypassing the cache.
template <typename T>
_mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
_mm256_stream_ps(aligned, v.raw);
_mm256_stream_pd(aligned, v.raw);
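// Example (illustrative sketch, not part of the original header): aligned
// Load/Store require 32-byte alignment; Stream suits large write-only
// buffers. Assumes HWY_NAMESPACE scope.
HWY_INLINE void CopyBlock(const float* HWY_RESTRICT in,  // 32-byte aligned
                          float* HWY_RESTRICT out) {     // 32-byte aligned
  const Full256<float> d;
  const auto v = Load(d, in);
  Store(v, d, out);  // or Stream(v, d, out) for non-temporal writes
}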
// ================================================== Scatter (AVX3 native)
#if HWY_TARGET <= HWY_AVX3
// 32-bit lanes: scale 1 for byte offsets, 4 for lane indices.
template <typename T>
_mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
template <typename T>
_mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
// 64-bit lanes: scale 1 for byte offsets, 8 for lane indices.
template <typename T>
_mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
template <typename T>
_mm256_i64scatter_epi64(base, index.raw, v.raw, 8);

// Public ScatterOffset/ScatterIndex require matching lane/index sizes:
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

// float/double overloads:
_mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
_mm256_i32scatter_ps(base, index.raw, v.raw, 4);
_mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
_mm256_i64scatter_pd(base, index.raw, v.raw, 8);
// Generic scatter for targets without the instructions: store the lanes and
// offsets/indices to arrays, then copy lane by lane.
template <typename T, typename Offset>
HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);
  alignas(32) T lanes[N];
  Store(v, d, lanes);
  alignas(32) Offset offset_lanes[N];
  Store(offset, Simd<Offset, N>(), offset_lanes);
  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, typename Index>
HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);
  alignas(32) T lanes[N];
  Store(v, d, lanes);
  alignas(32) Index index_lanes[N];
  Store(index, Simd<Index, N>(), index_lanes);
  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}
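// Example (illustrative sketch, not part of the original header):
// ScatterIndex writes lane i to base[index[i]]; here, reversing eight
// floats. Assumes HWY_NAMESPACE scope.
HWY_INLINE void ScatterReverse(const float* HWY_RESTRICT in,
                               float* HWY_RESTRICT out) {
  const Full256<float> d;
  const Full256<int32_t> di;
  const auto idx = Iota(di, 0);           // 0, 1, ..., 7
  const auto rev = Set(di, 7) - idx;      // 7, 6, ..., 0
  ScatterIndex(Load(d, in), d, out, rev); // out[7 - i] = in[i]
}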
// ================================================== Gather (AVX2 native)
// detail::GatherOffset / GatherIndex for 32-bit lanes (scale 1 and 4):
template <typename T>
return Vec256<T>{_mm256_i32gather_epi32(
    reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
template <typename T>
return Vec256<T>{_mm256_i32gather_epi32(
    reinterpret_cast<const int32_t*>(base), index.raw, 4)};
// 64-bit lanes (scale 1 and 8):
template <typename T>
return Vec256<T>{_mm256_i64gather_epi64(/* ... */
template <typename T>
return Vec256<T>{_mm256_i64gather_epi64(/* ... */
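// Example (illustrative sketch, not part of the original header) of the
// public GatherIndex wrapper declared just below: it loads base[index[i]]
// into lane i, e.g. a vectorized table lookup. Assumes HWY_NAMESPACE scope.
HWY_INLINE Vec256<float> LookupExample(const float* HWY_RESTRICT table,
                                       Vec256<int32_t> idx) {
  const Full256<float> d;
  return GatherIndex(d, table, idx);
}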
// Public wrappers check that lane and offset/index sizes match:
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
// ================================================== BLOCKS (128-bit halves)
// LowerHalf / UpperHalf / GetLane:
template <typename T>

// ZeroExtendVector: the upper half becomes zero. GCC before 10 lacks the
// zero-extending cast intrinsic, so insert into a zero vector instead:
template <typename T>
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)

// Combine: hi goes into the upper half, lo into the lower.
template <typename T>
return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
// ------------------------------ ShiftLeftBytes / ShiftLeftLanes and the
// ShiftRight counterparts; all operate within each 128-bit block.
template <int kBytes, typename T>
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");

template <int kBytes, typename T>
return ShiftLeftBytes<kBytes>(Full256<T>(), v);

template <int kLanes, typename T>
template <int kLanes, typename T>
return ShiftLeftLanes<kLanes>(Full256<T>(), v);

template <int kBytes, typename T>
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");

template <int kLanes, typename T>

// CombineShiftRightBytes:
template <int kBytes, typename T, class V = Vec256<T>>
// ------------------------------ Broadcast<kLane>: replicate one lane.
// 16-bit lanes use shufflelo/shufflehi with the 2-bit index replicated four
// times (0x55 * kLane); lanes 0-3 come from the low qword, 4-7 from the high:
template <int kLane>
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

template <int kLane>  // uint32_t
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
template <int kLane>  // uint64_t
static_assert(0 <= kLane && kLane < 2, "Invalid lane");

template <int kLane>  // int16_t, as above
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

template <int kLane>  // int32_t
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
template <int kLane>  // int64_t
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
template <int kLane>  // float
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
template <int kLane>  // double
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
// ------------------------------ TableLookupLanes / SetTableIndices
template <typename T>
template <typename T>
#if HWY_IS_DEBUG_BUILD
const size_t N = 32 / sizeof(T);
for (size_t i = 0; i < N; ++i) {
  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
}
#endif

// Reverse: permute via a descending index table.
template <typename T>
alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
// ------------------------------ InterleaveLower / InterleaveUpper
template <typename T, class V = Vec256<T>>
template <typename T, class V = Vec256<T>>

// ZipLower / ZipUpper: interleave, reinterpreted as the wider lane type.
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>

// ------------------------------ Concat*: combine 128-bit halves of two
// vectors; each overload works through a Half<> descriptor:
template <typename T>
const Half<decltype(d)> d2;
const Half<decltype(d)> d2;
const Half<decltype(d)> d2;
template <typename T>
template <typename T>
template <typename T>
const Half<decltype(d)> d2;
// ------------------------------ ConcatOdd / ConcatEven
// 4-byte lanes: AVX3 uses a single two-source permute with an odd-index
// table; AVX2 shuffles (3,1,3,1) within blocks, then permute4x64 restores
// the block order.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
    BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, /* ... */
#else
    BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
    _MM_SHUFFLE(3, 1, 2, 0))};
#endif

// float overload:
#if HWY_TARGET <= HWY_AVX3
alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
    __mmask8{0xFF}, hi.raw)};
#else
const Vec256<float> v3131{
    _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
    BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif

// 8-byte lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3
alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
                      BitCast(du, lo).raw, Load(du, kIdx).raw,
                      __mmask8{0xFF}, /* ... */
#else
const Vec256<double> v31{
    _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
    _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

// double overload:
#if HWY_TARGET <= HWY_AVX3
alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
    __mmask8{0xFF}, hi.raw)};
#else
    _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

// ConcatEven: as ConcatOdd, with even indices and control (2,0,2,0):
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
    BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF}, /* ... */
#else
    BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
    _MM_SHUFFLE(3, 1, 2, 0))};
#endif

#if HWY_TARGET <= HWY_AVX3
alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
    __mmask8{0xFF}, hi.raw)};
#else
const Vec256<float> v2020{
    _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
    BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
#if HWY_TARGET <= HWY_AVX3
alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
                      BitCast(du, lo).raw, Load(du, kIdx).raw,
                      __mmask8{0xFF}, /* ... */
#else
const Vec256<double> v20{/* ... */};
    _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

#if HWY_TARGET <= HWY_AVX3
alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
    __mmask8{0xFF}, hi.raw)};
#else
    _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
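// Example (illustrative sketch, not part of the original header): ConcatEven
// and ConcatOdd deinterleave paired data, e.g. splitting complex numbers
// stored as (re, im) pairs. Assumes HWY_NAMESPACE scope.
HWY_INLINE void SplitComplex(Vec256<float> lo, Vec256<float> hi,
                             Vec256<float>* re, Vec256<float>* im) {
  const Full256<float> d;
  *re = ConcatEven(d, hi, lo);  // lanes 0, 2, 4, ... of lo, then of hi
  *im = ConcatOdd(d, hi, lo);   // lanes 1, 3, 5, ...
}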
// ------------------------------ OddEven: odd lanes from a, even from b.
// 1-byte lanes use a constant blend mask (0xFF in even byte positions):
template <typename T>
alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                          0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
template <typename T>  // 2/4/8-byte lanes use blend instructions.
template <typename T>
template <typename T>
template <typename T>

// ------------------------------ TableLookupBytes (full and partial vectors):
template <typename T, typename TI>
template <typename T, typename TI, size_t NI>
template <typename T, size_t N, typename TI>
2965 template <
typename T,
size_t N,
typename TI>
2977 #if HWY_TARGET > HWY_AVX3
2981 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2982 HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(
const Vec256<T> v) {
2985 const Rebind<float, decltype(dw)> df;
2986 const auto zero =
Zero(d);
2989 const auto upper = exp +
Set(d, 0x3F80);
2991 const auto f0 =
ZipLower(dw, zero, upper);
2992 const auto f1 =
ZipUpper(dw, zero, upper);
2995 const Vec256<int32_t> bits0{_mm256_cvttps_epi32(
BitCast(df, f0).raw)};
2996 const Vec256<int32_t> bits1{_mm256_cvttps_epi32(
BitCast(df, f1).raw)};
2997 return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3005 #if HWY_TARGET <= HWY_AVX3
3008 return v * detail::Pow2(bits);
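// Scalar demonstration (illustrative, not part of the original header) of
// the Pow2 trick above: shifting the count left by 7 (23 - 16) and adding
// 0x3F80 forms the upper 16 bits of a float whose value is exactly 2^v.
// Assumes <cstdint> and <cstring> are available; valid for v <= 15.
inline uint16_t ScalarPow2(uint16_t v) {
  const uint32_t bits = static_cast<uint32_t>(0x3F80u + (v << 7)) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // bit-cast to float
  return static_cast<uint16_t>(f);    // == 1u << v
}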
// Signed 16-bit variable ShiftRight: native on AVX3; on AVX2, multiply by
// 2^(16 - bits) and keep the high half, which equals an arithmetic shift:
template <typename T, HWY_IF_SIGNED(T)>
#if HWY_TARGET <= HWY_AVX3
#else
const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
#endif
// ------------------------------ MulEven / MulOdd (64 x 64 -> 128 bits).
// Without a full 64-bit multiply, split each operand into 32-bit halves and
// combine the four partial products, carrying via w1/w2/k (schoolbook
// multiplication with 32-bit limbs):
#if HWY_TARGET <= HWY_AVX3
const DFromV<decltype(a)> du64;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);
// aLo * bLo:
const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;
// aHi * bLo plus the carry from aLbL:
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);
// aLo * bHi plus w2:
const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);
// aHi * bHi plus both carries forms the upper 64 bits:
const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;

// MulOdd: identical limb arithmetic on the odd lanes:
const DFromV<decltype(a)> du64;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);
const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);
const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);
const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;
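// Scalar sketch (illustrative, not part of the original header) of the limb
// arithmetic above: multiply two u64 via four 32x32->64 partial products,
// propagating carries exactly as the vector code's w1/w2/k. Assumes
// <cstdint> is available.
inline void Mul64To128(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t w3 = aLbL & 0xFFFFFFFFu;
  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;
  *hi = aH * bH + w1 + k;
  *lo = (t << 32) + w3;
}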
// ================================================== DemoteTo (narrowing).
// AVX2 pack instructions operate within 128-bit blocks, so after packing,
// _mm256_permute4x64_epi64(x, 0x88) gathers the two low halves.
const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);  // i32 -> u16
return Vec128<uint16_t>{
    _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};

const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);   // i32 -> i16
return Vec128<int16_t>{
    _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};

// u32 -> u8 goes via u16; ANDing with 0x7FFF clamps to the int16_t range
// expected by the subsequent pack:
const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
const __m128i u16 = _mm256_castsi256_si128(u16_concat);
const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));

const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);   // i16 -> u8
return Vec128<uint8_t>{
    _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};

// i32 -> i8 via i16:
const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
const __m128i i16 = _mm256_castsi256_si128(i16_concat);

const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);    // i16 -> i8
return Vec128<int8_t>{
    _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
// ------------------------------ DemoteTo float16 (bit-exact fallback when
// F16C is disabled): decompose binary32 into sign/exponent/mantissa and
// reassemble binary16, handling tiny and subnormal results.
#ifdef HWY_DISABLE_F16C
const Rebind<uint32_t, decltype(df16)> du;
const auto bits32 = BitCast(du, v);
const auto sign = ShiftRight<31>(bits32);
const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

const auto k15 = Set(di, 15);
const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
const auto is_tiny = exp < Set(di, -24);       // underflows to zero
const auto is_subnormal = exp < Set(di, -14);  // f16 subnormal range
const auto biased_exp16 = /* ... */
// Subnormals: the implicit leading 1 becomes explicit and the mantissa is
// shifted right accordingly:
const auto sub_exp = BitCast(du, Set(di, -14) - exp);
const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                   (mantissa32 >> (Set(du, 13) + sub_exp));
/* normal mantissa: */ ShiftRight<13>(mantissa32));
const auto sign16 = ShiftLeft<15>(sign);
const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
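// Worked example (illustrative, not part of the original header) of the
// binary16 encoding above, for a normal input: 1.5f has bits 0x3FC00000
// (sign 0, biased exponent 127, mantissa 0x400000). Then exp = 0,
// biased_exp16 = 15, mantissa16 = 0x400000 >> 13 = 0x200, so the result is
// (15 << 10) | 0x200 = 0x3E00, which is 1.5 in binary16.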
// ------------------------------ DemoteTo bfloat16: keep the upper 16 bits.
const Rebind<int32_t, decltype(dbf16)> di32;
const Rebind<uint32_t, decltype(dbf16)> du32;
const Rebind<uint16_t, decltype(dbf16)> du16;
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));

// ReorderDemote2To (bfloat16):
const Repartition<uint32_t, decltype(dbf16)> du32;

// U8FromU32: shuffle table extracts byte 0 of each u32 within both blocks:
alignas(32) static constexpr uint32_t k8From32[8] = {
    0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
// ------------------------------ ConvertTo double from int64: native on
// AVX3. The fallback assembles the value from magic-number doubles: the
// upper 32 bits (offset by 2^84 + 2^63) and lower 32 bits (offset by 2^52),
// with the combined offsets subtracted as one fused constant.
#if HWY_TARGET <= HWY_AVX3
#else
const auto k84_63 = Set(d64, 0x4530000080000000ULL);  // 2^84 + 2^63
const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
const auto k52 = Set(d32, 0x43300000);                // upper bits of 2^52
const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
return (v_upper - k84_63_52) + v_lower;  // order matters for round-off
#endif
// ------------------------------ ConvertTo int64 from double (truncating):
// native on AVX3; the fallback decodes IEEE-754 by hand:
#if HWY_TARGET <= HWY_AVX3
#else
using VI = decltype(Zero(di));
const VI k0 = Zero(di);
const VI k1 = Set(di, 1);
const VI k51 = Set(di, 51);
// The exponent determines the magnitude and whether the value is in range:
const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
const VI exp = biased_exp - Set(di, 0x3FF);
const auto in_range = exp < Set(di, 63);
// Either shift the mantissa right (dropping fractional bits) or left:
const VI shift_mnt = Max(k51 - exp, k0);
const VI shift_int = Max(exp - k51, k0);
const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
// Include the implicit 1-bit; shift by one extra to account for it:
const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
const VI shifted = int52 << shift_int;
// Restore the lowest mantissa bit lost above when shift_int > 0:
const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
// Out-of-range inputs saturate to the limit of the matching sign:
const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
const VI magnitude = IfThenElse(in_range, restored, limit);
// Two's-complement negation if the input was negative:
return (magnitude ^ sign_mask) - sign_mask;
#endif
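// Worked example (illustrative, not part of the original header): truncating
// 3.75 (bits 0x400E000000000000). biased_exp = 0x400, so exp = 1,
// shift_mnt = 50 and shift_int = 0. The mantissa with its implicit bit is
// 0x1E000000000000 (1.875 * 2^52); shifting right by shift_mnt + 1 = 51
// yields 3, the truncated integer. Inputs with exp >= 63 saturate to
// LimitsMax<int64_t>() instead.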
// ------------------------------ PromoteTo float from float16 (fallback when
// F16C is disabled): expand sign/exponent/mantissa into binary32.
#ifdef HWY_DISABLE_F16C
const auto sign = ShiftRight<15>(bits16);
const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
const auto mantissa = bits16 & Set(du32, 0x3FF);
// Subnormal f16 inputs scale the mantissa by 2^-24:
const auto subnormal = /* ... */ Set(df32, 1.0f / 16384 / 1024));
const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
return BitCast(df32, ShiftLeft<31>(sign) | bits32);

// PromoteTo float from bfloat16:
const Rebind<uint16_t, decltype(df32)> du16;
// ================================================== CRYPTO
#if !defined(HWY_DISABLE_PCLMUL_AES)
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#endif
#define HWY_NATIVE_AES

// AESRound / CLMulLower / CLMulUpper: AVX3_DL provides 256-bit forms;
// otherwise process each 128-bit half separately:
#if HWY_TARGET == HWY_AVX3_DL
#else
const Half<decltype(d)> d2;  // AESRound per half
#endif
#if HWY_TARGET == HWY_AVX3_DL
#else
const Half<decltype(d)> d2;  // CLMulLower per half
#endif
#if HWY_TARGET == HWY_AVX3_DL
#else
const Half<decltype(d)> d2;  // CLMulUpper per half
#endif
// ================================================== Iota: first, first + 1,
// ..., one per lane.
template <typename T, typename T2>
HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
  alignas(32) T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
// ================================================== MASK (AVX3)
// LoadMaskBits: masks are stored externally as a packed bit array.
#if HWY_TARGET <= HWY_AVX3
template <typename T>
constexpr size_t N = 32 / sizeof(T);
constexpr size_t kNumBytes = (N + 7) / 8;
uint64_t mask_bits = 0;
CopyBytes<kNumBytes>(bits, &mask_bits);
// For N < 8, clear the unused upper bits:
mask_bits &= (1ull << N) - 1;

// StoreMaskBits: copy the register bits out, masking off unused ones.
template <typename T>
constexpr size_t N = 32 / sizeof(T);
constexpr size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(&mask.raw, bits);
const int mask = static_cast<int>((1ull << N) - 1);
bits[0] = static_cast<uint8_t>(bits[0] & mask);
// AllFalse: kortest of the mask with itself sets ZF when all bits are zero.
template <typename T>
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask32_u8(mask.raw, mask.raw);
#else
return mask.raw == 0;
#endif
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask16_u8(mask.raw, mask.raw);
#else
return mask.raw == 0;
#endif
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask8_u8(mask.raw, mask.raw);
#else
return mask.raw == 0;
#endif
template <typename T>
return (uint64_t{mask.raw} & 0xF) == 0;  // 4-lane masks: compare directly.
// AllTrue: kortestc sets CF when all mask bits are one.
template <typename T>
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask32_u8(mask.raw, mask.raw);
#else
return mask.raw == 0xFFFFFFFFu;
#endif
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask16_u8(mask.raw, mask.raw);
#else
return mask.raw == 0xFFFFu;
#endif
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask8_u8(mask.raw, mask.raw);
#else
return mask.raw == 0xFFu;
#endif
template <typename T>
return mask.raw == 0xFu;  // 4-lane masks
// ------------------------------ Compress (AVX3): native compaction.
template <typename T>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec256<T>{_mm256_maskz_compress_epi64(mask.raw, v.raw)};
// ------------------------------ CompressStore (AVX3): contiguously store
// only the lanes whose mask bit is set; returns the count stored.
template <typename T>

// 16-bit lanes: AVX3_DL has vpcompressw; otherwise compress each half via
// 32-bit indices:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
const Rebind<uint16_t, decltype(d)> du;
const auto vu = BitCast(du, v);
const uint64_t mask_bits{mask.raw};
#if HWY_TARGET == HWY_AVX3_DL
_mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
#else
const Half<decltype(du)> duh;
const uint64_t mask_bitsL = mask_bits & 0xFF;
const uint64_t mask_bitsH = mask_bits >> 8;
const auto idxL = detail::IndicesForCompress16(mask_bitsL);
const auto idxH = detail::IndicesForCompress16(mask_bitsH);
const Half<decltype(d)> dh;
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
_mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
return PopCount(uint64_t{mask.raw});

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
_mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
return PopCount(uint64_t{mask.raw} & 0xFull);

// float/double overloads:
_mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
_mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
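// Example (illustrative sketch, not part of the original header):
// CompressStore implements stream filtering, e.g. keeping only positive
// values. Assumes HWY_NAMESPACE scope.
HWY_INLINE size_t KeepPositive(Vec256<float> v, float* HWY_RESTRICT out) {
  const Full256<float> d;
  return CompressStore(v, v > Zero(d), d, out);  // returns #lanes written
}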
// AVX2 mask path:
template <typename T>

// detail::LoadMaskBits256: expand a bit array into a Mask256.
// 1-byte lanes: broadcast the 32 mask bits, replicate byte i of the mask to
// all bytes of block i (kRep8), then TestBit against the per-lane bit:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
  alignas(32) constexpr uint64_t kRep8[4] = {
      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
      0x0303030303030303ull};
  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};

// 2-byte lanes: broadcast the mask and TestBit against 1 << lane:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  alignas(32) constexpr uint16_t kBit[16] = {
      1, 2, 4, 8, 16, 32, 64, 128,
      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  constexpr uint64_t kBit[8] = {1, 2, 4, 8};

// Public LoadMaskBits: read kNumBytes, clear unused bits, then expand:
template <typename T>
constexpr size_t N = 32 / sizeof(T);
constexpr size_t kNumBytes = (N + 7) / 8;
uint64_t mask_bits = 0;
CopyBytes<kNumBytes>(bits, &mask_bits);
mask_bits &= (1ull << N) - 1;
return detail::LoadMaskBits256(d, mask_bits);
// detail::BitsFromMask: extract one bit per lane.
// 1-byte lanes: movemask directly yields all 32 bits:
template <typename T>
const Full256<uint8_t> d8;
return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));

// 2-byte lanes: with BMI2, movemask the bytes and PEXT the odd bits;
// otherwise pack to bytes first (packs swaps blocks, fixed by permute4x64):
template <typename T>
return _pext_u64(sign_bits8, 0xAAAAAAAAull);
const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
const auto compressed =
    _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
return static_cast<unsigned>(_mm256_movemask_epi8(compressed));

// 4- and 8-byte lanes: movemask_ps / movemask_pd on the sign bits:
template <typename T>
const Full256<float> df;
return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
template <typename T>
const Full256<double> df;
return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
// StoreMaskBits (AVX2): write BitsFromMask into the byte array:
template <typename T>
template <typename T>
constexpr size_t N = 32 / sizeof(T);
constexpr size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(&mask_bits, bits);

// AllFalse / AllTrue / CountTrue / FindFirstTrue via BitsFromMask:
template <typename T>
template <typename T>
constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
template <typename T>
template <typename T>
// ------------------------------ Compress (AVX2 fallback)
// 4-byte lanes: for each of the 256 possible masks, packed_array stores the
// source lane indices of the kept lanes, one nibble per output lane:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d,
                                                uint64_t mask_bits) {
  alignas(16) constexpr uint32_t packed_array[256] = {
      0x00000000, 0x00000000, 0x00000001, 0x00000010, 0x00000002, 0x00000020,
      0x00000021, 0x00000210, 0x00000003, 0x00000030, 0x00000031, 0x00000310,
      0x00000032, 0x00000320, 0x00000321, 0x00003210, 0x00000004, 0x00000040,
      0x00000041, 0x00000410, 0x00000042, 0x00000420, 0x00000421, 0x00004210,
      0x00000043, 0x00000430, 0x00000431, 0x00004310, 0x00000432, 0x00004320,
      0x00004321, 0x00043210, 0x00000005, 0x00000050, 0x00000051, 0x00000510,
      0x00000052, 0x00000520, 0x00000521, 0x00005210, 0x00000053, 0x00000530,
      0x00000531, 0x00005310, 0x00000532, 0x00005320, 0x00005321, 0x00053210,
      0x00000054, 0x00000540, 0x00000541, 0x00005410, 0x00000542, 0x00005420,
      0x00005421, 0x00054210, 0x00000543, 0x00005430, 0x00005431, 0x00054310,
      0x00005432, 0x00054320, 0x00054321, 0x00543210, 0x00000006, 0x00000060,
      0x00000061, 0x00000610, 0x00000062, 0x00000620, 0x00000621, 0x00006210,
      0x00000063, 0x00000630, 0x00000631, 0x00006310, 0x00000632, 0x00006320,
      0x00006321, 0x00063210, 0x00000064, 0x00000640, 0x00000641, 0x00006410,
      0x00000642, 0x00006420, 0x00006421, 0x00064210, 0x00000643, 0x00006430,
      0x00006431, 0x00064310, 0x00006432, 0x00064320, 0x00064321, 0x00643210,
      0x00000065, 0x00000650, 0x00000651, 0x00006510, 0x00000652, 0x00006520,
      0x00006521, 0x00065210, 0x00000653, 0x00006530, 0x00006531, 0x00065310,
      0x00006532, 0x00065320, 0x00065321, 0x00653210, 0x00000654, 0x00006540,
      0x00006541, 0x00065410, 0x00006542, 0x00065420, 0x00065421, 0x00654210,
      0x00006543, 0x00065430, 0x00065431, 0x00654310, 0x00065432, 0x00654320,
      0x00654321, 0x06543210, 0x00000007, 0x00000070, 0x00000071, 0x00000710,
      0x00000072, 0x00000720, 0x00000721, 0x00007210, 0x00000073, 0x00000730,
      0x00000731, 0x00007310, 0x00000732, 0x00007320, 0x00007321, 0x00073210,
      0x00000074, 0x00000740, 0x00000741, 0x00007410, 0x00000742, 0x00007420,
      0x00007421, 0x00074210, 0x00000743, 0x00007430, 0x00007431, 0x00074310,
      0x00007432, 0x00074320, 0x00074321, 0x00743210, 0x00000075, 0x00000750,
      0x00000751, 0x00007510, 0x00000752, 0x00007520, 0x00007521, 0x00075210,
      0x00000753, 0x00007530, 0x00007531, 0x00075310, 0x00007532, 0x00075320,
      0x00075321, 0x00753210, 0x00000754, 0x00007540, 0x00007541, 0x00075410,
      0x00007542, 0x00075420, 0x00075421, 0x00754210, 0x00007543, 0x00075430,
      0x00075431, 0x00754310, 0x00075432, 0x00754320, 0x00754321, 0x07543210,
      0x00000076, 0x00000760, 0x00000761, 0x00007610, 0x00000762, 0x00007620,
      0x00007621, 0x00076210, 0x00000763, 0x00007630, 0x00007631, 0x00076310,
      0x00007632, 0x00076320, 0x00076321, 0x00763210, 0x00000764, 0x00007640,
      0x00007641, 0x00076410, 0x00007642, 0x00076420, 0x00076421, 0x00764210,
      0x00007643, 0x00076430, 0x00076431, 0x00764310, 0x00076432, 0x00764320,
      0x00764321, 0x07643210, 0x00000765, 0x00007650, 0x00007651, 0x00076510,
      0x00007652, 0x00076520, 0x00076521, 0x00765210, 0x00007653, 0x00076530,
      0x00076531, 0x00765310, 0x00076532, 0x00765320, 0x00765321, 0x07653210,
      0x00007654, 0x00076540, 0x00076541, 0x00765410, 0x00076542, 0x00765420,
      0x00765421, 0x07654210, 0x00076543, 0x00765430, 0x00765431, 0x07654310,
      0x00765432, 0x07654320, 0x07654321, 0x76543210};

  // Unpack the nibbles into per-lane indices: lane i is shifted right by 4*i.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
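// Worked example (illustrative, not part of the original header): for
// mask_bits = 0b10100101 (lanes 0, 2, 5, 7 kept), the table entry is
// 0x00007520: nibble i holds the source lane of output lane i, so the
// shifts {0, 4, 8, ...} expand it to indices {0, 2, 5, 7, 0, 0, 0, 0}.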
// 8-byte lanes: only 16 masks; store the index pairs (as u32) directly:
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 4> d,
                                                uint64_t mask_bits) {
  alignas(32) constexpr uint32_t packed_array[128] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
      2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 0, 1,
      4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 0, 1, 0, 1,
      2, 3, 4, 5, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 0, 1,
      6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 0, 1,
      2, 3, 6, 7, 0, 1, 0, 1, 0, 1, 2, 3, 6, 7, 0, 1,
      4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 4, 5, 6, 7, 0, 1,
      2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
  return Indices256<uint32_t>{Load(d32, packed_array + 8 * mask_bits).raw};

// Compress for lanes other than 16-bit: permute by the indices above:
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
const auto indices = IndicesFromBits(d, mask_bits);
// 2-byte lanes: promote each half to i32, compress via the 4-byte path,
// then demote and concatenate:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
using D = Full256<T>;
const Rebind<uint16_t, D> du;
const Repartition<int32_t, D> dw;
const auto vu16 = BitCast(du, v);
const uint64_t mask_bits0 = mask_bits & 0xFF;
const uint64_t mask_bits1 = mask_bits >> 8;
const auto compressed0 = Compress(promoted0, mask_bits0);
const auto compressed1 = Compress(promoted1, mask_bits1);
const Half<decltype(du)> dh;
// The upper part must be shifted left by count0 = PopCount(mask_bits0)
// lanes. A 32-bit permute handles even counts; when count0 is odd, an
// additional 14-byte alignr shifts one more 16-bit lane:
const size_t count0 = PopCount(mask_bits0);
alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 1, 2, 3, 4, 5, 6, 7};
const auto shift1_multiple4 = /* ... */
const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw,
                                                shift1_multiple4.raw, 0x08);
const auto shift1_multiple2 =
    Vec256<uint16_t>{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)};
const auto m_odd =
    TestBit(Set(du, static_cast<uint16_t>(count0)), Set(du, 1));
const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4);
// Blend: the lower count0 lanes come from compressed0:
constexpr uint16_t on = 0xFFFF;
alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on),
                                                  /* ... */};
// Public Compress / CompressBits wrappers (AVX2):
template <typename T>
template <typename T>
constexpr size_t N = 32 / sizeof(T);
constexpr size_t kNumBytes = (N + 7) / 8;
uint64_t mask_bits = 0;
CopyBytes<kNumBytes>(bits, &mask_bits);
mask_bits &= (1ull << N) - 1;

// CompressStore / CompressBitsStore:
template <typename T>
template <typename T>
constexpr size_t N = 32 / sizeof(T);
constexpr size_t kNumBytes = (N + 7) / 8;
uint64_t mask_bits = 0;
CopyBytes<kNumBytes>(bits, &mask_bits);
mask_bits &= (1ull << N) - 1;
// ================================================== StoreInterleaved3 (RGB).
// Shuffle tables spread the r/g/b bytes three positions apart; each
// subsequent table continues where the previous one stopped (+5 or +6):
const auto k5 = Set(d, 5);
const auto k6 = Set(d, 6);
alignas(16) static constexpr uint8_t tbl_r0[16] = {
    0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
    3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_g0[16] = {
    0x80, 0, 0x80, 0x80, 1, 0x80,
    0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
const auto interleaved_10_00 = r0 | g0 | b0;

const auto shuf_r1 = shuf_b0 + k6;  // continue past the last b0 byte
const auto shuf_g1 = shuf_r0 + k5;
const auto shuf_b1 = shuf_g0 + k5;
const auto interleaved_15_05 = r1 | g1 | b1;

// Assemble the three 32-byte outputs from block halves:
const auto out0 = ConcatLowerLower(d, interleaved_15_05, interleaved_10_00);
StoreU(out0, d, unaligned + 0 * 32);

const auto shuf_r2 = shuf_b1 + k6;
const auto shuf_g2 = shuf_r1 + k5;
const auto shuf_b2 = shuf_g1 + k5;
const auto interleaved_1A_0A = r2 | g2 | b2;

const auto out1 = ConcatUpperLower(d, interleaved_10_00, interleaved_1A_0A);
StoreU(out1, d, unaligned + 1 * 32);
const auto out2 = ConcatUpperUpper(d, interleaved_1A_0A, interleaved_15_05);
StoreU(out2, d, unaligned + 2 * 32);
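// Example (illustrative sketch, not part of the original header):
// StoreInterleaved3 converts planar r/g/b vectors into packed RGB bytes
// (r0 g0 b0 r1 g1 b1 ...). Assumes HWY_NAMESPACE scope.
HWY_INLINE void PlanarToPackedRGB(Vec256<uint8_t> r, Vec256<uint8_t> g,
                                  Vec256<uint8_t> b,
                                  uint8_t* HWY_RESTRICT packed) {  // 96 bytes
  StoreInterleaved3(r, g, b, Full256<uint8_t>(), packed);
}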
// ================================================== StoreInterleaved4
// (e.g. RGBA): two rounds of Zip produce dcba byte order in 32-bit groups:
const auto ba0 = ZipLower(d16, v0, v1);
const auto dc0 = ZipLower(d16, v2, v3);
const auto ba8 = ZipUpper(d16, v0, v1);
const auto dc8 = ZipUpper(d16, v2, v3);
const auto dcba_0 = ZipLower(d32, ba0, dc0);
const auto dcba_4 = ZipUpper(d32, ba0, dc0);
const auto dcba_8 = ZipLower(d32, ba8, dc8);
const auto dcba_C = ZipUpper(d32, ba8, dc8);
StoreU(out0, d8, unaligned + 0 * 32);
StoreU(out1, d8, unaligned + 1 * 32);
StoreU(out2, d8, unaligned + 2 * 32);
StoreU(out3, d8, unaligned + 3 * 32);
// ================================================== Reductions. Each step
// combines the vector with a rotated copy, so every lane ends up holding
// the result. 32-bit lanes (within each 128-bit block):
template <typename T>
const auto v31_20_31_20 = v3210 + v1032;
const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
return v20_31_20_31 + v31_20_31_20;  // SumOfLanes

template <typename T>
const auto v31_20_31_20 = Min(v3210, v1032);
const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);  // MinOfLanes

template <typename T>
const auto v31_20_31_20 = Max(v3210, v1032);
const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Max(v20_31_20_31, v31_20_31_20);  // MaxOfLanes

// 64-bit lanes: a single swap suffices:
template <typename T>
template <typename T>
return Min(v10, v01);
template <typename T>
return Max(v10, v01);
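// Example (illustrative sketch, not part of the original header): because
// the reductions broadcast their result to every lane, a single GetLane
// extracts the scalar total. Assumes HWY_NAMESPACE scope.
HWY_INLINE float HorizontalSum(Vec256<float> v) {
  const Full256<float> d;
  return GetLane(SumOfLanes(d, v));
}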
// ================================================== Tag-less convenience
// overloads; they forward to the Full256<T>-tagged versions above:
template <typename T>

template <int kBytes, typename T>
return ShiftRightBytes<kBytes>(Full256<T>(), v);

template <int kLanes, typename T>
return ShiftRightLanes<kLanes>(Full256<T>(), v);

template <size_t kBytes, typename T>
return CombineShiftRightBytes<kBytes>(Full256<T>(), hi, lo);

// Further wrappers (StoreMaskBits, AllTrue/AllFalse, CountTrue, Compress,
// reductions, ...), all forwarding in the same way:
template <typename T>
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:66
#define HWY_API
Definition: base.h:117
#define HWY_REP4(literal)
Definition: base.h:136
#define HWY_INLINE
Definition: base.h:59
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:67
#define HWY_DASSERT(condition)
Definition: base.h:163
Definition: arm_neon-inl.h:468
Raw raw
Definition: arm_neon-inl.h:501
Definition: x86_256-inl.h:67
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition: x86_256-inl.h:76
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition: x86_256-inl.h:73
Raw raw
Definition: x86_256-inl.h:95
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition: x86_256-inl.h:88
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition: x86_256-inl.h:91
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition: x86_256-inl.h:79
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition: x86_256-inl.h:85
typename detail::Raw256< T >::type Raw
Definition: x86_256-inl.h:68
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition: x86_256-inl.h:82
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:1463
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4288
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:842
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:2739
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:2798
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1278
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1136
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2332
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:127
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:3589
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4095
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:879
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:4769
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:672
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:768
HWY_INLINE auto FixConversionOverflow(Simd< TI, N > di, decltype(Zero(DF())) original, decltype(Zero(di).raw) converted_raw) -> decltype(Zero(di))
Definition: x86_128-inl.h:4176
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4100
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2184
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition: x86_128-inl.h:4165
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:551
HWY_INLINE Vu16m1 DemoteTo(Du16m1 d, const Vu32m2 v)
Definition: rvv-inl.h:1176
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3490
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:2728
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition: x86_128-inl.h:714
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2176
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:805
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4091
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:624
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:803
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1573
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_API Vec128< uint32_t, N > TableLookupLanes(const Vec128< uint32_t, N > v, const Indices128< uint32_t, N > idx)
Definition: arm_neon-inl.h:3342
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3064
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1167
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1520
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3435
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2829
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1073
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3629
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2949
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3052
HWY_API size_t StoreMaskBits(Simd< T, N >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4528
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4207
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_API Vec128< T, N > LoadDup128(Simd< T, N > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2164
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4054
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: shared-inl.h:158
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2352
HWY_API Vec128< T, N > VecFromMask(const Mask128< T, N > v)
Definition: arm_neon-inl.h:1607
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1206
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3760
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3495
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N > d, const int32_t *idx)
Definition: arm_neon-inl.h:3323
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1600
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: shared-inl.h:149
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:904
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3947
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3726
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3389
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1795
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:422
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3070
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3802
Vec128< T, N > Iota(const Simd< T, N > d, const T2 first)
Definition: arm_neon-inl.h:734
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3483
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1438
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1443
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:3907
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: shared-inl.h:151
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2903
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: shared-inl.h:147
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:778
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1619
HWY_API Vec128< T, N > Undefined(Simd< T, N >)
Definition: arm_neon-inl.h:724
HWY_API intptr_t FindFirstTrue(const Simd< T, N >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4520
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1448
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4509
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
HWY_API Vec1< T > operator+(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:392
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3395
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4538
HWY_API bool AllTrue(const Simd< T, N > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4557
HWY_API void StoreInterleaved3(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4829
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_API VFromD< DW > ZipUpper(DW dw, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3538
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1581
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1655
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2170
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1735
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2846
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: shared-inl.h:160
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:794
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:475
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:325
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3506
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1827
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3041
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
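LoadU/StoreU accept any byte address, unlike Load/Store, which require HWY_ALIGN'ed pointers; a round-trip sketch:
  uint8_t buf[17] = {0};
  const Full128<uint8_t> d;
  const Vec128<uint8_t> v = LoadU(d, buf + 1);   // deliberately misaligned: OK
  StoreU(v, d, buf + 1);                         // writes the same 16 bytes back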
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1126
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:449
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec128< uint8_t, 8 > hi, Vec128< uint8_t, 8 > lo)
Definition: arm_neon-inl.h:3566
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1194
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
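PromoteTo widens a half-width vector to full-width lanes (zero-extending for unsigned inputs); sketch:
  const Simd<uint8_t, 8> d8;                     // 8 bytes: the half to widen
  const Full128<uint16_t> d16;
  const Vec128<uint16_t> wide = PromoteTo(d16, Iota(d8, 0));  // {0..7} as uint16_t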
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4071
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2157
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
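BitCast reinterprets the same bytes under another lane type without converting values, e.g. for inspecting float bit patterns:
  const Full128<float> df;
  const Full128<uint32_t> du;
  const Vec128<uint32_t> bits = BitCast(du, Set(df, 1.0f));
  // Each lane now holds 0x3F800000, the IEEE-754 encoding of 1.0f.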
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4036
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3637
typename D::template Rebind< T > Rebind
Definition: shared-inl.h:144
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1799
HWY_API V InterleaveUpper(Simd< T, N >, V a, V b)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1084
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< uint32_t, 2 > Shuffle2301(const Vec128< uint32_t, 2 > v)
Definition: arm_neon-inl.h:1698
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:319
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1077
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:532
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2890
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1366
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4276
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4812
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:3844
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4211
HWY_API Vec128< int64_t, 1 > Neg(const Vec128< int64_t, 1 > v)
Definition: arm_neon-inl.h:866
HWY_API Vec128< uint8_t, 4 > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2699
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3235
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3752
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4787
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2818
HWY_API VFromD< DW > ZipLower(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3527
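ZipLower interleaves the lower halves of two vectors and views the result as the widened lane type; a sketch using the two-argument overload listed above:
  const Full128<uint8_t> d8;
  const Vec128<uint8_t> a = Iota(d8, 0);         // {0, 1, ..., 15}
  const Vec128<uint8_t> b = Iota(d8, 100);       // {100, 101, ...}
  // Byte order {0, 100, 1, 101, ...}, exposed as uint16_t lanes.
  const Vec128<uint16_t> zipped = ZipLower(a, b);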
typename D::Half Half
Definition: shared-inl.h:164
typename D::template Repartition< T > Repartition
Definition: shared-inl.h:155
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:2665
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4802
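CompressStore packs the lanes selected by a mask to the front of the output and returns how many were written; a filtering sketch:
  const Full128<int32_t> d;
  const Vec128<int32_t> v = Iota(d, -1);         // {-1, 0, 1, 2}
  int32_t out[4] = {0};
  const size_t kept = CompressStore(v, Zero(d) < v, d, out);
  // kept == 2 and out[0..1] == {1, 2}.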
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3401
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:3957
HWY_API void StoreInterleaved4(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, const Vec128< uint8_t > v3, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4864
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< uint8_t, 8 > UpperHalf(Simd< uint8_t, 8 >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3096
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3407
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:984
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1590
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3385
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1320
long long int GatherIndex64
Definition: x86_128-inl.h:2721
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4780
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:405
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
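ReorderWidenMulAccumulate multiplies bf16 pairs into f32 partial sums whose lane order is unspecified; adding both accumulators before reducing makes that order irrelevant. A hypothetical Dot4 helper (the wrapper and its name are ours, not part of the library):
  Vec128<float> Dot4(Vec128<bfloat16_t, 8> a, Vec128<bfloat16_t, 8> b) {
    const Simd<float, 4> df32;
    Vec128<float> sum1 = Zero(df32);
    const Vec128<float> sum0 =
        ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
    return SumOfLanes(df32, sum0 + sum1);  // every lane holds the dot product
  }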
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1296
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
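IfThenElse blends per lane (the Vec1 overload above has Vec128/Vec256 counterparts); e.g. clamping negatives to zero, equivalent to ZeroIfNegative:
  const Full128<float> d;
  const Vec128<float> v = Iota(d, -2);           // {-2, -1, 0, 1}
  const Vec128<float> clamped = IfThenElse(v < Zero(d), Zero(d), v);  // {0, 0, 0, 1}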
namespace hwy
Definition: aligned_allocator.h:23
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:565
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:589
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:555
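These base.h helpers are scalar building blocks, used e.g. when iterating over mask bits; a quick sketch:
  const uint64_t x = 0xA0;                              // 0b1010'0000
  const size_t tz = Num0BitsBelowLS1Bit_Nonzero64(x);   // 5 trailing zero bits
  const size_t bits = PopCount(x);                      // 2 bits set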
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:521
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:529
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:523
#define HWY_ALIGN
Definition: set_macros-inl.h:78
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
Indices256
Definition: x86_256-inl.h:2535
__m256i raw
Definition: x86_256-inl.h:2536
Mask256
Definition: x86_256-inl.h:125
typename detail::RawMask256< sizeof(T)>::type Raw
Definition: x86_256-inl.h:126
Raw raw
Definition: x86_256-inl.h:132
static Mask256< T > FromBits(uint64_t mask_bits)
Definition: x86_256-inl.h:128
Simd
Definition: shared-inl.h:35
HWY_INLINE __m256d operator()(__m256i v)
Definition: x86_256-inl.h:171
HWY_INLINE __m256 operator()(__m256i v)
Definition: x86_256-inl.h:167
BitCastFromInteger256
Definition: x86_256-inl.h:162
HWY_INLINE __m256i operator()(__m256i v)
Definition: x86_256-inl.h:163
__m256d type
Definition: x86_256-inl.h:61
__m256 type
Definition: x86_256-inl.h:57
Raw256
Definition: x86_256-inl.h:52
__m256i type
Definition: x86_256-inl.h:53
__mmask32 type
Definition: x86_256-inl.h:107
__mmask16 type
Definition: x86_256-inl.h:111
__mmask8 type
Definition: x86_256-inl.h:115
__mmask8 type
Definition: x86_256-inl.h:119
RawMask256
Definition: x86_256-inl.h:104