21#include <wasm_simd128.h>
// Name-compatibility aliases: when HWY_WASM_OLD_NAMES is defined, each current
// intrinsic name on the left is redefined to the previous spelling on the
// right (v*_shuffle, *_widen_*, *_trunc_saturate_*, *_saturate), so code
// written against the current names resolves to the older intrinsics.
// NOTE(review): presumably meant for toolchains whose wasm_simd128.h predates
// the rename - confirm which compiler versions set this macro.
26#ifdef HWY_WASM_OLD_NAMES
// Shuffles: i{8x16,16x8,32x4,64x2} -> v{8x16,16x8,32x4,64x2}.
27#define wasm_i8x16_shuffle wasm_v8x16_shuffle
28#define wasm_i16x8_shuffle wasm_v16x8_shuffle
29#define wasm_i32x4_shuffle wasm_v32x4_shuffle
30#define wasm_i64x2_shuffle wasm_v64x2_shuffle
// Widening conversions: extend_{low,high} -> widen_{low,high}.
31#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
32#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
33#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
34#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
35#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
36#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
// Saturating ops: *_sat -> *_saturate.
37#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
38#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
39#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
40#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
41#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
42#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
43#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
44#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
45#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
65template <
typename T,
size_t N = 16 /
sizeof(T)>
73 return *
this = (*
this * other);
76 return *
this = (*
this / other);
79 return *
this = (*
this + other);
82 return *
this = (*
this - other);
85 return *
this = (*
this & other);
88 return *
this = (*
this | other);
91 return *
this = (*
this ^ other);
// Partial vector occupying 8 bytes: a Vec128 with 8 / sizeof(T) lanes of T.
98using Vec64 = Vec128<T, 8 /
sizeof(T)>;
// Partial vector occupying 4 bytes: a Vec128 with 4 / sizeof(T) lanes of T.
101using Vec32 = Vec128<T, 4 /
sizeof(T)>;
104template <
typename T,
size_t N = 16 /
sizeof(T)>
113 template <
typename T,
size_t N>
// Type obtained by calling a detail::DeduceD functor on a value-initialized V.
// NOTE(review): presumably the descriptor (Simd<...>) matching vector type V;
// DeduceD's definition is not visible here - confirm.
122using DFromV =
decltype(detail::DeduceD()(V()));
// TFromD applied to DFromV<V>; presumably the lane type of vector type V.
125using TFromV = TFromD<DFromV<V>>;
133 return static_cast<__v128_u
>(
v);
136 return static_cast<__v128_u
>(
v);
139template <
typename T,
size_t N>
154template <
typename T,
size_t N>
162template <
typename T,
size_t N,
typename FromT>
164 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
171template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
175template <
size_t N, HWY_IF_LE128(
float, N)>
186template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
190template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
195template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
200template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
206template <
size_t N, HWY_IF_LE128(
int8_t, N)>
210template <
size_t N, HWY_IF_LE128(
int16_t, N)>
214template <
size_t N, HWY_IF_LE128(
int32_t, N)>
218template <
size_t N, HWY_IF_LE128(
int64_t, N)>
223template <
size_t N, HWY_IF_LE128(
float, N)>
232template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
// Returns a vector whose lane i holds `first + i`, converted to T.
//
// d:     descriptor selecting lane type T and lane count N.
// first: value for lane 0; may be of a different arithmetic type T2.
//
// The lanes are built in a scratch array `lanes` and then loaded with
// Load(d, lanes).
// NOTE(review): the declaration of `lanes` is on a line not visible in this
// view; its size/alignment should be confirmed against the loop bound below.
240template <
typename T,
size_t N,
typename T2>
241Vec128<T, N>
Iota(
const Simd<T, N, 0>
d,
const T2 first) {
// Fills every lane of a full 16-byte vector (16 / sizeof(T) entries), not
// just the first N.
243 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
// The index is converted to T2 before the addition; the sum is then
// converted to T.
244 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
246 return Load(
d, lanes);
457template <
int kBits,
size_t N>
461template <
int kBits,
size_t N>
465template <
int kBits,
size_t N>
469template <
int kBits,
size_t N>
473template <
int kBits,
size_t N>
477template <
int kBits,
size_t N>
483template <
int kBits,
size_t N>
487template <
int kBits,
size_t N>
491template <
int kBits,
size_t N>
495template <
int kBits,
size_t N>
499template <
int kBits,
size_t N>
503template <
int kBits,
size_t N>
509template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
516 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
519template <
int kBits,
size_t N>
525 return shifted &
Set(d8, 0xFF >> kBits);
528template <
int kBits,
size_t N>
533 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
534 return (shifted ^ shifted_sign) - shifted_sign;
538template <
int kBits,
typename T,
size_t N>
540 constexpr size_t kSizeInBits =
sizeof(T) * 8;
541 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
542 if (kBits == 0)
return v;
543 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
617template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
623 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
633 return shifted &
Set(d8, 0xFF >> bits);
641 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> bits));
642 return (shifted ^ shifted_sign) - shifted_sign;
664HWY_API Vec128<uint64_t, N>
Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
666 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0));
667 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
668 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1));
669 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
671 return Vec128<uint64_t, N>{wasm_v128_load(min)};
// Lane-wise minimum of signed 64-bit lanes.
//
// Each of the two 64-bit lanes is extracted from a and b, compared with the
// scalar HWY_MIN macro, written to a 16-byte-aligned scratch array, and the
// array is reloaded as a vector.
// NOTE(review): presumably done in scalar code because no i64x2 min intrinsic
// was available - confirm.
688HWY_API Vec128<int64_t, N>
Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
// NOTE(review): the array has 4 elements but only [0] and [1] are written
// and wasm_v128_load consumes one 128-bit vector; the sibling signed Max uses
// a 2-element array. Consider `min[2]` for consistency.
689 alignas(16) int64_t min[4];
690 min[0] =
HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
691 wasm_i64x2_extract_lane(b.raw, 0));
692 min[1] =
HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
693 wasm_i64x2_extract_lane(b.raw, 1));
694 return Vec128<int64_t, N>{wasm_v128_load(min)};
719HWY_API Vec128<uint64_t, N>
Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
721 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0));
722 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
723 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1));
724 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
726 return Vec128<uint64_t, N>{wasm_v128_load(max)};
// Lane-wise maximum of signed 64-bit lanes.
//
// Each of the two 64-bit lanes is extracted from a and b, compared with the
// scalar HWY_MAX macro, written to a 16-byte-aligned two-element scratch
// array, and the array is reloaded as a vector.
743HWY_API Vec128<int64_t, N>
Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
744 alignas(16) int64_t max[2];
745 max[0] =
HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
746 wasm_i64x2_extract_lane(b.raw, 0));
747 max[1] =
HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
748 wasm_i64x2_extract_lane(b.raw, 1));
749 return Vec128<int64_t, N>{wasm_v128_load(max)};
787 const Vec128<uint16_t, N> b) {
789 const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
790 const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
791 const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
792 const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
793 const auto l = wasm_i32x4_mul(al, bl);
794 const auto h = wasm_i32x4_mul(ah, bh);
796 return Vec128<uint16_t, N>{
797 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
801 const Vec128<int16_t, N> b) {
803 const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
804 const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
805 const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
806 const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
807 const auto l = wasm_i32x4_mul(al, bl);
808 const auto h = wasm_i32x4_mul(ah, bh);
810 return Vec128<int16_t, N>{
811 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
816 Vec128<int16_t, N> b) {
820 const Vec128<uint16_t, N> lo =
BitCast(du,
Mul(a, b));
821 const Vec128<int16_t, N> hi =
MulHigh(a, b);
825 const Vec128<uint16_t, N> lo_top2 = ShiftRight<14>(lo);
827 const Vec128<uint16_t, N> rounding = ShiftRight<1>(
Add(lo_top2,
Set(du, 1)));
// Multiplies the even-indexed 32-bit lanes (0 and 2) of a and b and returns
// the results as 64-bit lanes; the result has (N + 1) / 2 lanes.
//
// NOTE(review): suspected defect for negative inputs. The AND with kEvenMask
// leaves the upper half of each 64-bit lane zero, i.e. the int32 value is
// zero-extended rather than sign-extended before the 64-bit multiply. For
// example a = b = -1 would multiply 0xFFFFFFFF by 0xFFFFFFFF instead of
// yielding 1. A sign-extending form such as
//   wasm_i64x2_shr(wasm_i64x2_shl(x, 32), 32)
// for both operands would avoid this - confirm against the unit tests.
833HWY_API Vec128<int64_t, (
N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
834 const Vec128<int32_t, N> b) {
// All-ones in 32-bit lanes 0 and 2, zero in lanes 1 and 3.
836 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
837 const auto ae = wasm_v128_and(a.raw, kEvenMask);
838 const auto be = wasm_v128_and(b.raw, kEvenMask);
839 return Vec128<int64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
// Multiplies the even-indexed unsigned 32-bit lanes (0 and 2) of a and b and
// returns the results as unsigned 64-bit lanes; the result has (N + 1) / 2
// lanes.
//
// Clearing the odd 32-bit lanes zero-extends each even lane to 64 bits, after
// which a 64-bit lane multiply is applied to the masked operands.
842HWY_API Vec128<uint64_t, (
N + 1) / 2>
MulEven(
const Vec128<uint32_t, N> a,
843 const Vec128<uint32_t, N> b) {
// All-ones in 32-bit lanes 0 and 2, zero in lanes 1 and 3.
845 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
846 const auto ae = wasm_v128_and(a.raw, kEvenMask);
847 const auto be = wasm_v128_and(b.raw, kEvenMask);
848 return Vec128<uint64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
853template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
884 const Vec128<float, N> b) {
885 return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
891 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
898 const Vec128<float, N> b) {
907 const Vec128<float, N> x,
908 const Vec128<float, N> add) {
911 return mul * x + add;
917 const Vec128<float, N> x,
918 const Vec128<float, N> add) {
920 return add - mul * x;
926 const Vec128<float, N> x,
927 const Vec128<float, N> sub) {
930 return mul * x - sub;
936 const Vec128<float, N> x,
937 const Vec128<float, N> sub) {
939 return Neg(mul) * x - sub;
// Lane-wise square root of float lanes; wraps wasm_f32x4_sqrt.
946HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N>
v) {
947 return Vec128<float, N>{wasm_f32x4_sqrt(
v.raw)};
954 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
955 return one /
Sqrt(
v);
963 return Vec128<float, N>{wasm_f32x4_nearest(
v.raw)};
969 return Vec128<float, N>{wasm_f32x4_trunc(
v.raw)};
// Lane-wise ceiling of float lanes; wraps wasm_f32x4_ceil.
974HWY_API Vec128<float, N>
Ceil(
const Vec128<float, N>
v) {
975 return Vec128<float, N>{wasm_f32x4_ceil(
v.raw)};
981 return Vec128<float, N>{wasm_f32x4_floor(
v.raw)};
985template <
typename T,
size_t N>
990template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
992 const Simd<T, N, 0>
d;
1000template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1002 const Simd<T, N, 0>
d;
1009 const VFromD<
decltype(di)> exp =
1018template <
typename TFrom,
typename TTo,
size_t N>
1020 Mask128<TFrom, N> m) {
1021 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1022 return Mask128<TTo, N>{m.raw};
1025template <
typename T,
size_t N>
1027 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1028 return (
v & bit) == bit;
1180 const auto a32 =
BitCast(d32, a);
1181 const auto b32 =
BitCast(d32, b);
1183 const auto m_gt = a32 > b32;
1186 const auto m_eq = a32 == b32;
1187 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
1190 const auto gt =
Or(lo_gt, m_gt);
1201template <
typename T,
size_t N>
1202HWY_API Mask128<T, N>
operator<(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1222template <
typename T,
size_t N>
1232template <
typename T,
size_t N>
1234 return Vec128<T, N>{wasm_v128_not(
v.raw)};
// Bitwise AND of the raw 128-bit contents of a and b, for any lane type T.
1239template <
typename T,
size_t N>
1240HWY_API Vec128<T, N>
And(Vec128<T, N> a, Vec128<T, N> b) {
1241 return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
// Returns ~not_mask & mask (bitwise), for any lane type T.
//
// The arguments are passed to the intrinsic in swapped order because
// wasm_v128_andnot(x, y) computes x & ~y, whereas this function negates its
// FIRST argument.
1247template <
typename T,
size_t N>
1248HWY_API Vec128<T, N>
AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
1249 return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
// Bitwise OR of the raw 128-bit contents of a and b, for any lane type T.
1254template <
typename T,
size_t N>
1255HWY_API Vec128<T, N>
Or(Vec128<T, N> a, Vec128<T, N> b) {
1256 return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
// Bitwise XOR of the raw 128-bit contents of a and b, for any lane type T.
1261template <
typename T,
size_t N>
1262HWY_API Vec128<T, N>
Xor(Vec128<T, N> a, Vec128<T, N> b) {
1263 return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
// Three-input bitwise OR: returns o1 | o2 | o3, implemented as two nested
// calls to Or.
1268template <
typename T,
size_t N>
1269HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
1270 return Or(o1,
Or(o2, o3));
// Returns o | (a1 & a2) (bitwise), implemented as Or(o, And(a1, a2)).
1275template <
typename T,
size_t N>
1276HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
1277 return Or(o,
And(a1, a2));
1282template <
typename T,
size_t N>
1290template <
typename T,
size_t N>
1291HWY_API Vec128<T, N>
operator&(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1295template <
typename T,
size_t N>
1296HWY_API Vec128<T, N>
operator|(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1300template <
typename T,
size_t N>
1301HWY_API Vec128<T, N>
operator^(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1307template <
typename T,
size_t N>
1309 const Vec128<T, N> sign) {
1310 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1315template <
typename T,
size_t N>
1317 const Vec128<T, N> sign) {
1318 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1324template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1337template <
typename T,
size_t N>
1339 return Mask128<T, N>{
v.raw};
1342template <
typename T,
size_t N>
1344 return Vec128<T, N>{
v.raw};
1348template <
typename T,
size_t N>
1351 return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
1355template <
typename T,
size_t N>
1361template <
typename T,
size_t N>
1366template <
typename T,
size_t N>
1369 static_assert(IsSigned<T>(),
"Only works for signed/float");
1377template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1380 const auto zero =
Zero(
d);
1386template <
typename T,
size_t N>
1387HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1391template <
typename T,
size_t N>
1392HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
1393 const Simd<T, N, 0>
d;
1397template <
typename T,
size_t N>
1398HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
1399 const Simd<T, N, 0>
d;
1403template <
typename T,
size_t N>
1404HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
1405 const Simd<T, N, 0>
d;
1409template <
typename T,
size_t N>
1410HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
1411 const Simd<T, N, 0>
d;
1425template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1432 test = ShiftLeft<12>(test);
1435 test = ShiftLeft<1>(test);
1439 test = ShiftLeft<1>(test);
1443 test = ShiftLeft<1>(test);
1450template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1457 test = ShiftLeft<27>(test);
1460 test = ShiftLeft<1>(test);
1464 test = ShiftLeft<1>(test);
1468 test = ShiftLeft<1>(test);
1472 test = ShiftLeft<1>(test);
1479template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1482 alignas(16) T lanes[2];
1483 alignas(16) T bits_lanes[2];
1485 Store(bits,
d, bits_lanes);
1486 lanes[0] <<= bits_lanes[0];
1487 lanes[1] <<= bits_lanes[1];
1488 return Load(
d, lanes);
1493template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1500 test = ShiftLeft<12>(test);
1503 test = ShiftLeft<1>(test);
1507 test = ShiftLeft<1>(test);
1511 test = ShiftLeft<1>(test);
1518template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1525 test = ShiftLeft<27>(test);
1528 test = ShiftLeft<1>(test);
1532 test = ShiftLeft<1>(test);
1536 test = ShiftLeft<1>(test);
1540 test = ShiftLeft<1>(test);
1551template <
typename T>
1553 return Vec128<T>{wasm_v128_load(aligned)};
1556template <
typename T,
size_t N>
1563template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1566 CopyBytes<sizeof(T) * N>(p, &
v);
1571template <
typename T,
size_t N>
1577template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1584template <
typename T>
1586 wasm_v128_store(aligned,
v.raw);
1590template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1592 CopyBytes<sizeof(T) * N>(&
v, p);
1597 *p = wasm_f32x4_extract_lane(
v.raw, 0);
1601template <
typename T,
size_t N>
1606template <
typename T,
size_t N>
1616template <
typename T,
size_t N>
1619 wasm_v128_store(aligned,
v.raw);
1624template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
1627 const Vec128<Offset, N> offset) {
1628 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1630 alignas(16) T lanes[
N];
1633 alignas(16) Offset offset_lanes[
N];
1634 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
1636 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
1637 for (
size_t i = 0; i <
N; ++i) {
1638 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1642template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
1644 const Vec128<Index, N> index) {
1645 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1647 alignas(16) T lanes[
N];
1650 alignas(16) Index index_lanes[
N];
1651 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
1653 for (
size_t i = 0; i <
N; ++i) {
1654 base[index_lanes[i]] = lanes[i];
1660template <
typename T,
size_t N,
typename Offset>
1663 const Vec128<Offset, N> offset) {
1664 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1666 alignas(16) Offset offset_lanes[
N];
1667 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
1669 alignas(16) T lanes[
N];
1670 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
1671 for (
size_t i = 0; i <
N; ++i) {
1672 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1674 return Load(
d, lanes);
1677template <
typename T,
size_t N,
typename Index>
1680 const Vec128<Index, N> index) {
1681 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1683 alignas(16) Index index_lanes[
N];
1684 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
1686 alignas(16) T lanes[
N];
1687 for (
size_t i = 0; i <
N; ++i) {
1688 lanes[i] = base[index_lanes[i]];
1690 return Load(
d, lanes);
1699template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1701 return static_cast<T
>(wasm_i8x16_extract_lane(
v.raw, kLane));
1703template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1705 return static_cast<T
>(wasm_i16x8_extract_lane(
v.raw, kLane));
1707template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1709 return static_cast<T
>(wasm_i32x4_extract_lane(
v.raw, kLane));
1711template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1713 return static_cast<T
>(wasm_i64x2_extract_lane(
v.raw, kLane));
1716template <
size_t kLane,
size_t N>
1718 return wasm_f32x4_extract_lane(
v.raw, kLane);
1726template <
typename T>
1733template <
typename T>
1735#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1736 if (__builtin_constant_p(i)) {
1739 return detail::ExtractLane<0>(
v);
1741 return detail::ExtractLane<1>(
v);
1745 alignas(16) T lanes[2];
1750template <
typename T>
1752#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1753 if (__builtin_constant_p(i)) {
1756 return detail::ExtractLane<0>(
v);
1758 return detail::ExtractLane<1>(
v);
1760 return detail::ExtractLane<2>(
v);
1762 return detail::ExtractLane<3>(
v);
1766 alignas(16) T lanes[4];
1771template <
typename T>
1773#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1774 if (__builtin_constant_p(i)) {
1777 return detail::ExtractLane<0>(
v);
1779 return detail::ExtractLane<1>(
v);
1781 return detail::ExtractLane<2>(
v);
1783 return detail::ExtractLane<3>(
v);
1785 return detail::ExtractLane<4>(
v);
1787 return detail::ExtractLane<5>(
v);
1789 return detail::ExtractLane<6>(
v);
1791 return detail::ExtractLane<7>(
v);
1795 alignas(16) T lanes[8];
1800template <
typename T>
1802#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1803 if (__builtin_constant_p(i)) {
1806 return detail::ExtractLane<0>(
v);
1808 return detail::ExtractLane<1>(
v);
1810 return detail::ExtractLane<2>(
v);
1812 return detail::ExtractLane<3>(
v);
1814 return detail::ExtractLane<4>(
v);
1816 return detail::ExtractLane<5>(
v);
1818 return detail::ExtractLane<6>(
v);
1820 return detail::ExtractLane<7>(
v);
1822 return detail::ExtractLane<8>(
v);
1824 return detail::ExtractLane<9>(
v);
1826 return detail::ExtractLane<10>(
v);
1828 return detail::ExtractLane<11>(
v);
1830 return detail::ExtractLane<12>(
v);
1832 return detail::ExtractLane<13>(
v);
1834 return detail::ExtractLane<14>(
v);
1836 return detail::ExtractLane<15>(
v);
1840 alignas(16) T lanes[16];
1846template <
typename T,
size_t N>
1848 return detail::ExtractLane<0>(
v);
1855template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1857 static_assert(kLane <
N,
"Lane index out of bounds");
1859 wasm_i8x16_replace_lane(
v.raw, kLane,
static_cast<int8_t
>(t))};
1862template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1864 static_assert(kLane <
N,
"Lane index out of bounds");
1866 wasm_i16x8_replace_lane(
v.raw, kLane,
static_cast<int16_t
>(t))};
1869template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1871 static_assert(kLane <
N,
"Lane index out of bounds");
1872 return Vec128<T, N>{
1873 wasm_i32x4_replace_lane(
v.raw, kLane,
static_cast<int32_t
>(t))};
1876template <
size_t kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1878 static_assert(kLane <
N,
"Lane index out of bounds");
1879 return Vec128<T, N>{
1880 wasm_i64x2_replace_lane(
v.raw, kLane,
static_cast<int64_t
>(t))};
1883template <
size_t kLane,
size_t N>
1885 static_assert(kLane <
N,
"Lane index out of bounds");
1889template <
size_t kLane,
size_t N>
1891 static_assert(kLane < 2,
"Lane index out of bounds");
1900template <
typename T>
1907template <
typename T>
1909#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1910 if (__builtin_constant_p(i)) {
1913 return detail::InsertLane<0>(
v, t);
1915 return detail::InsertLane<1>(
v, t);
1920 alignas(16) T lanes[2];
1923 return Load(
d, lanes);
1926template <
typename T>
1928#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1929 if (__builtin_constant_p(i)) {
1932 return detail::InsertLane<0>(
v, t);
1934 return detail::InsertLane<1>(
v, t);
1936 return detail::InsertLane<2>(
v, t);
1938 return detail::InsertLane<3>(
v, t);
1943 alignas(16) T lanes[4];
1946 return Load(
d, lanes);
1949template <
typename T>
1951#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1952 if (__builtin_constant_p(i)) {
1955 return detail::InsertLane<0>(
v, t);
1957 return detail::InsertLane<1>(
v, t);
1959 return detail::InsertLane<2>(
v, t);
1961 return detail::InsertLane<3>(
v, t);
1963 return detail::InsertLane<4>(
v, t);
1965 return detail::InsertLane<5>(
v, t);
1967 return detail::InsertLane<6>(
v, t);
1969 return detail::InsertLane<7>(
v, t);
1974 alignas(16) T lanes[8];
1977 return Load(
d, lanes);
1980template <
typename T>
1982#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1983 if (__builtin_constant_p(i)) {
1986 return detail::InsertLane<0>(
v, t);
1988 return detail::InsertLane<1>(
v, t);
1990 return detail::InsertLane<2>(
v, t);
1992 return detail::InsertLane<3>(
v, t);
1994 return detail::InsertLane<4>(
v, t);
1996 return detail::InsertLane<5>(
v, t);
1998 return detail::InsertLane<6>(
v, t);
2000 return detail::InsertLane<7>(
v, t);
2002 return detail::InsertLane<8>(
v, t);
2004 return detail::InsertLane<9>(
v, t);
2006 return detail::InsertLane<10>(
v, t);
2008 return detail::InsertLane<11>(
v, t);
2010 return detail::InsertLane<12>(
v, t);
2012 return detail::InsertLane<13>(
v, t);
2014 return detail::InsertLane<14>(
v, t);
2016 return detail::InsertLane<15>(
v, t);
2021 alignas(16) T lanes[16];
2024 return Load(
d, lanes);
2029template <
typename T,
size_t N>
2032 return Vec128<T,
N / 2>{
v.raw};
2035template <
typename T,
size_t N>
2043template <
int kBytes,
typename T,
size_t N>
2045 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2046 const __i8x16 zero = wasm_i8x16_splat(0);
2052 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
2053 6, 7, 8, 9, 10, 11, 12, 13, 14)};
2056 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
2057 5, 6, 7, 8, 9, 10, 11, 12, 13)};
2060 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 0, 1, 2,
2061 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
2064 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 0, 1,
2065 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
2068 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 0,
2069 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
2072 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2073 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
2076 return Vec128<T, N>{wasm_i8x16_shuffle(
2077 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
2080 return Vec128<T, N>{wasm_i8x16_shuffle(
2081 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
2084 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2085 16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
2089 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2090 16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
2094 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2095 16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
2099 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2100 16, 16, 16, 16, 16, 16, 16, 0, 1,
2104 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2105 16, 16, 16, 16, 16, 16, 16, 16, 0,
2109 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2110 16, 16, 16, 16, 16, 16, 16, 16, 16,
2114 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
2115 16, 16, 16, 16, 16, 16, 16, 16, 16,
2118 return Vec128<T, N>{zero};
2121template <
int kBytes,
typename T,
size_t N>
2123 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(),
v);
2128template <
int kLanes,
typename T,
size_t N>
2134template <
int kLanes,
typename T,
size_t N>
2136 return ShiftLeftLanes<kLanes>(
DFromV<
decltype(
v)>(),
v);
2143template <
int kBytes,
typename T,
size_t N>
2145 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2146 const __i8x16 zero = wasm_i8x16_splat(0);
2153 return wasm_i8x16_shuffle(
v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2154 12, 13, 14, 15, 16);
2157 return wasm_i8x16_shuffle(
v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2158 13, 14, 15, 16, 16);
2161 return wasm_i8x16_shuffle(
v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2162 13, 14, 15, 16, 16, 16);
2165 return wasm_i8x16_shuffle(
v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2166 14, 15, 16, 16, 16, 16);
2169 return wasm_i8x16_shuffle(
v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
2170 15, 16, 16, 16, 16, 16);
2173 return wasm_i8x16_shuffle(
v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2174 16, 16, 16, 16, 16, 16);
2177 return wasm_i8x16_shuffle(
v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2178 16, 16, 16, 16, 16, 16, 16);
2181 return wasm_i8x16_shuffle(
v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2182 16, 16, 16, 16, 16, 16, 16);
2185 return wasm_i8x16_shuffle(
v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
2186 16, 16, 16, 16, 16, 16, 16);
2189 return wasm_i8x16_shuffle(
v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
2190 16, 16, 16, 16, 16, 16, 16);
2193 return wasm_i8x16_shuffle(
v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
2194 16, 16, 16, 16, 16, 16, 16);
2197 return wasm_i8x16_shuffle(
v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
2198 16, 16, 16, 16, 16, 16, 16);
2201 return wasm_i8x16_shuffle(
v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
2202 16, 16, 16, 16, 16, 16, 16);
2205 return wasm_i8x16_shuffle(
v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
2206 16, 16, 16, 16, 16, 16, 16);
2209 return wasm_i8x16_shuffle(
v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
2210 16, 16, 16, 16, 16, 16, 16);
2219template <
int kBytes,
typename T,
size_t N>
2222 if (
N != 16 /
sizeof(T)) {
2223 const Vec128<T> vfull{
v.raw};
2226 return Vec128<T, N>{detail::ShrBytes<kBytes>(
v)};
2230template <
int kLanes,
typename T,
size_t N>
2239template <
typename T>
2241 return Vec64<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
2244 return Vec64<float>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
2248template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2255 return Vec128<T, (
N + 1) / 2>{upper.raw};
2260template <
int kBytes,
typename T,
class V = Vec128<T>>
2262 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2268 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2269 11, 12, 13, 14, 15, 16)};
2272 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2273 11, 12, 13, 14, 15, 16, 17)};
2276 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2277 12, 13, 14, 15, 16, 17, 18)};
2280 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
2281 13, 14, 15, 16, 17, 18, 19)};
2284 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
2285 14, 15, 16, 17, 18, 19, 20)};
2288 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
2289 14, 15, 16, 17, 18, 19, 20, 21)};
2292 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
2293 15, 16, 17, 18, 19, 20, 21, 22)};
2296 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
2297 16, 17, 18, 19, 20, 21, 22, 23)};
2300 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
2301 17, 18, 19, 20, 21, 22, 23, 24)};
2304 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
2305 17, 18, 19, 20, 21, 22, 23, 24, 25)};
2308 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
2309 18, 19, 20, 21, 22, 23, 24, 25, 26)};
2312 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
2313 19, 20, 21, 22, 23, 24, 25, 26, 27)};
2316 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
2317 20, 21, 22, 23, 24, 25, 26, 27, 28)};
2320 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
2321 21, 22, 23, 24, 25, 26, 27, 28, 29)};
2324 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2325 22, 23, 24, 25, 26, 27, 28, 29, 30)};
2330template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T,
N),
2331 class V = Vec128<T, N>>
2333 constexpr size_t kSize =
N *
sizeof(T);
2334 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
2337 using V8 =
VFromD<
decltype(d_full8)>;
2338 const V8 hi8{
BitCast(d8, hi).raw};
2347template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2349 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2350 return Vec128<T, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, kLane, kLane, kLane,
2351 kLane, kLane, kLane, kLane, kLane)};
2354template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2356 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2357 return Vec128<T, N>{
2358 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
// Replicates one 64-bit lane across the vector (overload for 8-byte lanes,
// selected via HWY_IF_LANE_SIZE(T, 8)).
// NOTE(review): the signature line (source line 2362) is missing from this
// listing; presumably this is Broadcast<kLane>(v) - confirm.
2361template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
// kLane is checked at compile time against the lane count N.
2363 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
// Both shuffle inputs are v, and both output slots pick index kLane.
2364 return Vec128<T, N>{wasm_i64x2_shuffle(
v.raw,
v.raw, kLane, kLane)};
// Byte-granular table lookup: every byte of `from` is an index that selects
// a byte of `bytes`; the result has the type of the index vector.
// NOTE(review): the signature line (source line 2372) and the lines between
// the two return paths (2374-2378, 2380, 2388) are missing from this listing.
// Presumably this is TableLookupBytes and the two paths are alternatives
// chosen by a preprocessor guard - confirm against the full file.
2371template <
typename T,
size_t N,
typename TI,
size_t NI>
2373 const Vec128<TI, NI> from) {
// Path 1: a single swizzle instruction with `from` as the control vector.
2379 return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
// Path 2: emulation through memory. Both vectors are spilled to 16-byte
// aligned scratch buffers and the result is gathered one byte at a time.
2381 alignas(16) uint8_t control[16];
2382 alignas(16) uint8_t input[16];
2383 alignas(16) uint8_t output[16];
2384 wasm_v128_store(control, from.raw);
2385 wasm_v128_store(input, bytes.raw);
2386 for (
size_t i = 0; i < 16; ++i) {
// A control byte >= 16 yields zero instead of reading past `input`.
2387 output[i] = control[i] < 16 ? input[control[i]] : 0;
2389 return Vec128<TI, NI>{wasm_v128_load(output)};
2393template <
typename T,
size_t N,
typename TI,
size_t NI>
2395 const Vec128<TI, NI> from) {
2396 const Simd<TI, NI, 0>
d;
2399 Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
2414template <
typename T,
size_t N>
2416 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2417 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2418 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
2424template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2426 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2428 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2429 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2431template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2433 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2435 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2437template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2439 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2440 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
2443template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2445 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2447 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2448 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2450template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2452 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2454 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2456template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2458 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2459 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
2462template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2464 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2466 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
2467 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
2469template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2471 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2473 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
2475template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2477 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2478 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
2484template <
typename T>
2486 static_assert(
sizeof(T) == 8,
"Only for 64-bit lanes");
2487 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2489template <
typename T>
2491 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2492 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2496template <
typename T>
2498 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2499 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 2, 3, 0)};
2503template <
typename T>
2505 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2506 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 0, 1, 2)};
2510template <
typename T>
2512 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2513 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 2, 1, 0)};
2519template <
typename T,
size_t N>
2524template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2526 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2527#if HWY_IS_DEBUG_BUILD
2528 const Rebind<TI,
decltype(
d)> di;
2534 using V8 =
VFromD<
decltype(d8)>;
2538 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
2539 if (
sizeof(T) == 4) {
2540 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
2541 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2542 const V8 lane_indices =
2544 const V8 byte_indices =
2546 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2547 0, 1, 2, 3, 0, 1, 2, 3};
2550 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
2551 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2552 const V8 lane_indices =
2554 const V8 byte_indices =
2556 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2557 0, 1, 2, 3, 4, 5, 6, 7};
2558 return Indices128<T, N>{
Add(byte_indices,
Load(d8, kByteOffsets)).raw};
2562template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2564 const Rebind<TI,
decltype(
d)> di;
2568template <
typename T,
size_t N>
2570 using TI = MakeSigned<T>;
2572 const Rebind<TI,
decltype(
d)> di;
2579template <
typename T>
2585template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2587 return Vec128<T, 2>{
Shuffle2301(Vec128<T>{
v.raw}).raw};
2590template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2596template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2602template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2610template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2616template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2621template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2628template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2630 return BitCast(
d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, 3, 2,
2631 1, 0, 7, 6, 5, 4)});
2634template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2639template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2646template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2651template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2662 a.
raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2668 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2685 a.
raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2691 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2731 26, 11, 27, 12, 28, 13, 29, 14,
2738 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2755 26, 11, 27, 12, 28, 13, 29, 14,
2762 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2790template <
typename T,
class V = Vec128<T>>
2796template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
2798 const Half<
decltype(
d)> d2;
2806template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2810template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2815template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
// Combine: takes two N/2-lane vectors (hi_half, lo_half) and returns one
// N-lane vector of the same lane type.
// NOTE(review): source lines 2829-2831 (which declare du2 and VU) and every
// statement after line 2833, including the return, are missing from this
// listing, so how `lo` and `hi` are merged cannot be confirmed from here.
2825template <
typename T,
size_t N>
2826HWY_API Vec128<T, N>
Combine(Simd<T, N, 0>
d, Vec128<T, N / 2> hi_half,
2827 Vec128<T, N / 2> lo_half) {
// Descriptor derived from d via Half<>.
2828 const Half<
decltype(
d)> d2;
// Each half is bitcast through du2 and its raw register re-wrapped as a VU.
2832 const VU lo{
BitCast(du2, lo_half).raw};
2833 const VU hi{
BitCast(du2, hi_half).raw};
2839template <
typename T,
size_t N>
2847template <
typename T>
2852template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2854 const Vec128<T, N> lo) {
2855 const Half<
decltype(
d)> d2;
2861template <
typename T>
2866template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2868 const Vec128<T, N> lo) {
2869 const Half<
decltype(
d)> d2;
2875template <
typename T>
2878 return CombineShiftRightBytes<8>(
d, hi, lo);
2880template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2882 const Vec128<T, N> lo) {
2883 const Half<
decltype(
d)> d2;
2888template <
typename T,
size_t N>
2890 const Vec128<T, N> lo) {
2897template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2899 return Vec128<T>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 1, 3, 5, 7, 9, 11, 13, 15,
2900 17, 19, 21, 23, 25, 27, 29, 31)};
2904template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2908 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 1, 3, 5, 7, 17, 19, 21,
2909 23, 1, 3, 5, 7, 17, 19, 21, 23)};
2913template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2917 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
2918 19, 1, 3, 17, 19, 1, 3, 17, 19)};
2922template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2925 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
2929template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2933 return Vec128<T, 4>{
2934 wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
2938template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2940 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2944template <
typename T>
2953template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2955 return Vec128<T>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 0, 2, 4, 6, 8, 10, 12, 14,
2956 16, 18, 20, 22, 24, 26, 28, 30)};
2960template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2964 return Vec128<T, 8>{wasm_i8x16_shuffle(lo.
raw, hi.
raw, 0, 2, 4, 6, 16, 18, 20,
2965 22, 0, 2, 4, 6, 16, 18, 20, 22)};
2969template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2973 return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
2974 18, 0, 2, 16, 18, 0, 2, 16, 18)};
2978template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2981 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
2985template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2989 return Vec128<T, 4>{
2990 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
2994template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2996 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
3000template <
typename T>
3008template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3010 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 0, 0, 2, 2)};
3013template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3020template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3022 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 1, 3, 3)};
3025template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3034template <
typename T,
size_t N>
3039 alignas(16)
constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3040 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3043template <
typename T,
size_t N>
3047 wasm_i16x8_shuffle(a.
raw, b.
raw, 8, 1, 10, 3, 12, 5, 14, 7)};
3049template <
typename T,
size_t N>
3054template <
typename T,
size_t N>
3062template <
typename T,
size_t N>
3063HWY_API Vec128<T, N>
OddEven(
const Vec128<T, N> a,
const Vec128<T, N> b) {
3073template <
typename T,
size_t N>
3080template <
typename T,
size_t N>
3088template <
typename T>
3107 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
3118 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
3122 const Vec128<uint16_t, N>
v) {
3123 return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(
v.raw)};
3140 const Vec128<int8_t, N>
v) {
3141 return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(
v.raw)};
3145 const Vec128<int8_t, N>
v) {
3146 return Vec128<int32_t, N>{
3147 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(
v.raw))};
3151 const Vec128<int16_t, N>
v) {
3152 return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(
v.raw)};
3156 const Vec128<int32_t, N>
v) {
3157 return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(
v.raw)};
// Widens float16 lanes to float by taking the 16-bit encoding apart with
// integer operations and reassembling a 32-bit float encoding.
// NOTE(review): the template header/signature line and the declarations of
// du32 and df32 (source lines before 3168 and 3169-3171) are missing from
// this listing; presumably this is the float16_t overload of PromoteTo.
3168 const Vec128<float16_t, N>
v) {
// Reinterpret the payload as uint16_t and widen it to 32-bit lanes so each
// field can be shifted and masked in place.
3172 const auto bits16 =
PromoteTo(du32, Vec128<uint16_t, N>{
v.raw});
// Field extraction: bit 15 and above, then 5 bits (0x1F) starting at
// bit 10, then the low 10 bits (0x3FF).
3173 const auto sign = ShiftRight<15>(bits16);
3174 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3175 const auto mantissa = bits16 &
Set(du32, 0x3FF);
// Value used when the exponent field is zero. Part of this expression
// (source line 3177) is missing; the visible constant 1.0f / 16384 / 1024
// equals 2^-24.
3176 const auto subnormal =
3178 Set(df32, 1.0f / 16384 / 1024));
// Change the exponent bias from 15 to 127.
3180 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
// Move the 10 mantissa bits to the top of the 23-bit mantissa field.
3181 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3182 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
// A zero exponent field selects the subnormal value, otherwise `normal`.
3183 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
// Place the sign in bit 31 and reinterpret the bits as float.
3184 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3189 const Vec128<bfloat16_t, N>
v) {
3190 const Rebind<uint16_t,
decltype(df32)> du16;
3212 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3214 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
3226 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3238 const Vec128<double, N>
v) {
3239 return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(
v.raw)};
// Narrows float lanes to float16 by splitting the 32-bit encoding into
// fields with integer operations and rebuilding a 16-bit encoding.
// NOTE(review): the template header/signature line, the declarations of du16
// and di, and source lines 3256, 3259, 3263, 3265 and 3268 (which define
// mantissa16 and bits16, among others) are missing from this listing;
// presumably this is the float16_t overload of DemoteTo.
3244 const Vec128<float, N>
v) {
3246 const Rebind<uint32_t,
decltype(du16)> du;
// View the float lanes as 32-bit unsigned integers.
3248 const auto bits32 =
BitCast(du,
v);
// Field extraction: bit 31, then 8 bits (0xFF) starting at bit 23, then the
// low 23 bits (0x7FFFFF).
3249 const auto sign = ShiftRight<31>(bits32);
3250 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3251 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
// Remove the bias of 127 and clamp the exponent to at most 15.
3253 const auto k15 =
Set(di, 15);
3254 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
// Exponent thresholds: below -24 is flagged as tiny, below -14 as
// subnormal. How the flags are consumed is on lines not shown here.
3255 const auto is_tiny = exp <
Set(di, -24);
3257 const auto is_subnormal = exp <
Set(di, -14);
// The initializer of biased_exp16 (source line 3259) is missing.
3258 const auto biased_exp16 =
// Distance of the exponent below -14, as an unsigned shift count.
3260 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
// Subnormal mantissa: a one shifted to bit (10 - sub_exp) plus the float
// mantissa shifted right by (13 + sub_exp).
3261 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3262 (mantissa32 >> (
Set(du, 13) + sub_exp));
// Tail of an expression whose start (source line 3263) is missing; the
// visible operand drops the low 13 mantissa bits (23 -> 10 bits).
3264 ShiftRight<13>(mantissa32));
// Assemble the 16-bit layout: sign at bit 15, exponent at bit 10, mantissa
// in the low bits.
3266 const auto sign16 = ShiftLeft<15>(sign);
3267 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
// Narrow the 32-bit lanes to 16 bits and re-wrap the register as float16_t.
3269 return Vec128<float16_t, N>{
DemoteTo(du16, bits16).raw};
3274 const Vec128<float, N>
v) {
3275 const Rebind<int32_t,
decltype(dbf16)> di32;
3276 const Rebind<uint32_t,
decltype(dbf16)> du32;
3277 const Rebind<uint16_t,
decltype(dbf16)> du16;
3278 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
3284 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
3286 const Repartition<uint32_t,
decltype(dbf16)> du32;
3287 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(
BitCast(du32, b));
3294 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
3295 return Vec128<uint8_t, N>{
3296 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
// Sums bytes within each 64-bit lane using a shift-and-add reduction on
// 16-bit lanes. The variable names spell out which input bytes (hex digits
// 0..F) contribute to each 16-bit lane; "zz" and "xx" mark lanes that the
// final mask does not keep.
// NOTE(review): the signature line and source lines 3324-3326, 3328, 3330
// and 3332 (declaring du16/du32/du64 and vECA86420) are missing from this
// listing; presumably this is SumsOf8 and vECA86420 holds the even-indexed
// bytes - confirm.
3323 const DFromV<
decltype(
v)> du8;
3327 using VU16 =
VFromD<
decltype(du16)>;
// Upper byte of every 16-bit lane, moved down to the low byte.
3329 const VU16 vFDB97531 = ShiftRight<8>(
BitCast(du16,
v));
// Step 1: per 16-bit lane, add the two byte values.
3331 const VU16 sFE_DC_BA_98_76_54_32_10 =
Add(vFDB97531, vECA86420);
// Step 2: within each 32-bit lane, bring the upper 16-bit sum down and add.
3333 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
3334 BitCast(du16, ShiftRight<16>(
BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
3335 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
3336 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
// Step 3: within each 64-bit lane, bring the upper 32 bits down and add.
3337 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
3338 BitCast(du16, ShiftRight<32>(
BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
3339 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
3340 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
// Keep only the low 16 bits of each 64-bit lane, which hold the full sum.
3341 return And(
BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70),
Set(du64, 0xFFFF));
3348template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
3353 const Vec128<T, N> vbits{wasm_i32x4_splat(
static_cast<int32_t
>(bits))};
3356 alignas(16)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
3357 1, 1, 1, 1, 1, 1, 1, 1};
3360 alignas(16)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
3361 1, 2, 4, 8, 16, 32, 64, 128};
3365template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3368 alignas(16)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3373template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3376 alignas(16)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
3381template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3384 alignas(16)
constexpr uint64_t kBit[8] = {1, 2};
3391template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3394 uint64_t mask_bits = 0;
// Reduces a full 128-bit mask to an integer bit pattern, handling the
// register as two 64-bit words.
// NOTE(review): the signature line (source line 3405) and the return
// statement (after line 3412) are missing from this listing; presumably this
// is the byte-lane BitsFromMask and the result combines `lo` and `hi` -
// confirm.
3404template <
typename T>
3406 const Mask128<T> mask) {
// Spill the mask to memory as two 64-bit words.
3407 alignas(16) uint64_t lanes[2];
3408 wasm_v128_store(lanes, mask.raw);
// Multiplying by kMagic and keeping the top byte condenses each 64-bit word
// to 8 bits (presumably one bit per mask byte - TODO confirm).
3410 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
// Top byte of the first product lands in bits 0..7.
3411 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
// Top byte of the second product lands in bits 8..15; the 0xFF00 mask
// clears the bits below it.
3412 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
3417template <
typename T>
3420 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3421 return (
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0)) *
// Mask-to-bits reduction for vectors of at most 32 bits (HWY_IF_LE32).
// NOTE(review): the signature lines (source lines 3428-3429) and line 3431
// are missing from this listing; presumably this is a byte-lane BitsFromMask
// overload - confirm.
3427template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
// Only the low 64 bits of the mask register are read.
3430 uint64_t bytes =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0));
// Clear everything above the low N * 8 bits so unused bytes cannot
// contribute to the result.
3432 bytes &= (1ULL << (
N * 8)) - 1;
// Multiply by kMagic and keep the top byte of the product as the result.
3433 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
3434 return (bytes * kMagic) >> 56;
3437template <
typename T,
size_t N>
3441 const __i16x8 zero = wasm_i16x8_splat(0);
// Mask-to-bits reduction operating on four 32-bit lanes: lane i contributes
// bit i of the result.
// NOTE(review): the signature lines (source lines 3447-3448) are missing
// from this listing; presumably this is the 4-byte-lane BitsFromMask -
// confirm.
3446template <
typename T,
size_t N>
3449 const __i32x4 mask_i =
static_cast<__i32x4
>(mask.
raw);
// Lane i holds the constant 1 << i.
3450 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
// The AND keeps that bit only where the mask lane has it set.
3451 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
// Spill to memory and OR the four lanes together into one scalar.
3452 alignas(16) uint32_t lanes[4];
3453 wasm_v128_store(lanes, sliced_mask);
3454 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
// Mask-to-bits reduction operating on two 64-bit lanes: lane i contributes
// bit i of the result.
// NOTE(review): the signature lines (source lines 3458-3459) are missing
// from this listing; presumably this is the 8-byte-lane BitsFromMask -
// confirm.
3457template <
typename T,
size_t N>
3460 const __i64x2 mask_i =
static_cast<__i64x2
>(mask.
raw);
// Lane 0 holds 1 and lane 1 holds 2.
3461 const __i64x2 slice = wasm_i64x2_make(1, 2);
// The AND keeps that bit only where the mask lane has it set.
3462 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
// Spill to memory and OR the two lanes together into one scalar.
3463 alignas(16) uint64_t lanes[2];
3464 wasm_v128_store(lanes, sliced_mask);
3465 return lanes[0] | lanes[1];
// OnlyActive: restricts a bit pattern to the lanes of an N-lane vector of T.
// When N * sizeof(T) is 16 bytes, `bits` is returned unchanged; otherwise
// only the low N bits are kept.
// NOTE(review): the closing brace (source line 3472) is missing from this
// listing.
3469template <
typename T,
size_t N>
3470constexpr uint64_t
OnlyActive(uint64_t bits) {
// (1ull << N) - 1 is a mask with exactly the low N bits set.
3471 return ((
N *
sizeof(T)) == 16) ? bits : bits & ((1ull <<
N) - 1);
3478 (
N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3479 : (
N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3480 : (
N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3481 : (
N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3482 : (
N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3483 : (
N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3484 : (
N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3485 : (
N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3486 : (
N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3487 : (
N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3489 : (
N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3491 : (
N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3493 : (
N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3495 : (
N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3498 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3500 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3501 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3504template <
typename T,
size_t N>
3509template <
typename T>
3514template <
typename T>
// Counts set lanes of a mask with four 32-bit lanes.
// NOTE(review): the signature line (source line 3520) is missing from this
// listing; presumably this is the 4-byte-lane CountTrue - confirm.
3519template <
typename T>
// Each lane keeps a different single bit (1, 2, 4, 8) where m has it set.
3521 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3522 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
// Read back as two 64-bit words. In little-endian layout the four retained
// bits sit at positions 0 and 33 (first word) and 2 and 35 (second word),
// so the OR cannot merge two of them and PopCount equals the lane count.
3523 alignas(16) uint64_t lanes[2];
3524 wasm_v128_store(lanes, shifted_bits);
3525 return PopCount(lanes[0] | lanes[1]);
// Counts set lanes of a mask with two 64-bit lanes.
// NOTE(review): the signature line (source line 3529) is missing from this
// listing; presumably this is the 8-byte-lane CountTrue - confirm.
3528template <
typename T>
3530 alignas(16) int64_t lanes[2];
3531 wasm_v128_store(lanes, m.raw);
// Assumes every mask lane is either 0 or -1 (all bits set) - TODO confirm.
// Under that assumption the negated sum is the number of set lanes.
3532 return static_cast<size_t>(-(lanes[0] + lanes[1]));
3538template <
typename T,
size_t N>
3540 const Mask128<T, N> mask, uint8_t* bits) {
3542 const size_t kNumBytes = (
N + 7) / 8;
3543 CopyBytes<kNumBytes>(&mask_bits, bits);
3547template <
typename T,
size_t N>
3553template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3556 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3561template <
typename T>
3567 return !wasm_i8x16_any_true(v8.raw);
3570 return (wasm_i64x2_extract_lane(m.
raw, 0) |
3571 wasm_i64x2_extract_lane(m.
raw, 1)) == 0;
3577template <
typename T>
3579 return wasm_i8x16_all_true(m.
raw);
3581template <
typename T>
3583 return wasm_i16x8_all_true(m.
raw);
3585template <
typename T>
3587 return wasm_i32x4_all_true(m.
raw);
3589template <
typename T>
3591 return wasm_i64x2_all_true(m.
raw);
3596template <
typename T,
size_t N>
3603template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3606 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
// AllTrue for partial vectors (at most 64 bits, per HWY_IF_LE64): reports
// whether every lane of m is set, by delegating to the full-vector overload.
// NOTE(review): source line 3612 and the closing brace are missing from this
// listing.
3610template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3611HWY_API bool AllTrue(
const Simd<T, N, 0> ,
const Mask128<T, N> m) {
// Mask built from detail::BytesAbove for the N * sizeof(T) bytes in use;
// presumably it is all-ones in the bytes beyond them - TODO confirm.
3613 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
// OR-ing it into m before the full-width check means the outcome depends
// only on the lanes that belong to this partial vector (given the
// assumption above).
3614 return AllTrue(Full128<T>(), Mask128<T>{
Or(mask, m).raw});
3617template <
typename T,
size_t N>
3619 const Mask128<T, N> mask) {
3628template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3632 const Rebind<uint8_t,
decltype(
d)> d8;
3640 alignas(16)
constexpr uint8_t table[256 * 8] = {
3642 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3643 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3644 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
3645 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3646 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
3647 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
3648 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
3649 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3650 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
3651 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
3652 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
3653 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
3654 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
3655 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
3656 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
3657 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3658 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
3659 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
3660 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
3661 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
3662 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
3663 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
3664 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
3665 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
3666 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
3667 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
3668 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
3669 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
3670 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
3671 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
3672 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
3673 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3674 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
3675 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
3676 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
3677 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
3678 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
3679 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
3680 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
3681 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
3682 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
3683 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
3684 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
3685 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
3686 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
3687 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
3688 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
3689 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
3690 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
3691 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
3692 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
3693 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
3694 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
3695 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
3696 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
3697 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
3698 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
3699 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
3700 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
3701 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
3702 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
3703 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
3704 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
3705 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3706 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
3707 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
3708 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
3709 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
3710 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
3711 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
3712 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
3713 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
3714 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
3715 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
3716 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
3717 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
3718 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
3719 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
3720 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
3721 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
3722 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
3723 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
3724 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
3725 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
3726 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
3727 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
3728 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
3729 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
3730 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
3731 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
3732 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
3733 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
3734 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
3735 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
3736 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
3737 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
3738 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
3739 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
3740 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
3741 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
3742 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
3743 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
3744 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
3745 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
3746 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
3747 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
3748 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
3749 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
3750 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
3751 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
3752 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
3753 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
3754 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
3755 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
3756 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
3757 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
3758 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
3759 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
3760 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
3761 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
3762 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
3763 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
3764 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
3765 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
3766 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
3767 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
3768 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
3769 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3776template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3780 const Rebind<uint8_t,
decltype(
d)> d8;
3788 alignas(16)
constexpr uint8_t table[256 * 8] = {
3790 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
3791 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
3792 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
3793 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
3794 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
3795 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
3796 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
3797 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
3798 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
3799 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
3800 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
3801 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
3802 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
3803 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
3804 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
3805 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
3806 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
3807 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
3808 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
3809 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
3810 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
3811 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
3812 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
3813 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
3814 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
3815 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
3816 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
3817 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
3818 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
3819 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
3820 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
3821 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
3822 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
3823 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
3824 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
3825 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
3826 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
3827 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
3828 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
3829 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
3830 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
3831 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
3832 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
3833 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
3834 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
3835 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
3836 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
3837 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
3838 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
3839 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
3840 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
3841 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
3842 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
3843 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
3844 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
3845 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
3846 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
3847 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
3848 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
3849 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
3850 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
3851 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
3852 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
3853 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
3854 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
3855 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
3856 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
3857 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
3858 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
3859 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
3860 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
3861 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
3862 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
3863 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
3864 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
3865 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
3866 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
3867 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
3868 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
3869 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
3870 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
3871 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
3872 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
3873 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
3874 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
3875 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
3876 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
3877 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
3878 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
3879 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
3880 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
3881 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
3882 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
3883 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
3884 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
3885 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
3886 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
3887 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
3888 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
3889 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
3890 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
3891 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
3892 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
3893 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
3894 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
3895 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
3896 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
3897 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
3898 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
3899 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
3900 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
3901 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
3902 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
3903 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
3904 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
3905 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
3906 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
3907 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
3908 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
3909 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
3910 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
3911 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
3912 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
3913 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
3914 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
3915 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
3916 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
3917 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
3924template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3929 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
3931 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3932 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3933 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
3934 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3935 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
3936 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
3937 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
3938 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3939 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
3940 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
3941 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
3942 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
3943 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3944 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
3945 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
3946 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3947 const Simd<T, N, 0>
d;
3949 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
3952template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3957 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
3959 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
3960 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
3961 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
3962 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
3963 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
3964 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
3965 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
3966 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3967 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
3968 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
3969 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
3970 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
3971 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
3972 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
3974 const Simd<T, N, 0>
d;
3976 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
3979template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3984 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
3986 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3987 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3988 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3989 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3991 const Simd<T, N, 0>
d;
3993 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
3996template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4001 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
4003 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4004 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4005 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4006 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4008 const Simd<T, N, 0>
d;
4010 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
4016template <
typename T,
size_t N>
4018 const auto idx = detail::IdxFromBits<T, N>(mask_bits);
4024template <
typename T,
size_t N>
4026 const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
4034template <
typename T>
4035struct CompressIsPartition {
4040template <
typename T>
4046template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4058template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4064template <
typename T>
4070template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4075 const Vec128<T> maskL =
DupEven(m);
4076 const Vec128<T> maskH =
DupOdd(m);
4077 const Vec128<T> swap =
AndNot(maskH, maskL);
4082template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
4086 if (
N < 16 /
sizeof(T)) {
4093 Mask128<uint64_t> ) {
4099template <
typename T,
size_t N>
4102 uint64_t mask_bits = 0;
4103 constexpr size_t kNumBytes = (
N + 7) / 8;
4104 CopyBytes<kNumBytes>(bits, &mask_bits);
4106 mask_bits &= (1ull <<
N) - 1;
4113template <
typename T,
size_t N>
4123template <
typename T,
size_t N>
4128 using TU =
TFromD<
decltype(du)>;
4130 const size_t count =
PopCount(mask_bits);
4139template <
typename T,
size_t N>
4143 uint64_t mask_bits = 0;
4144 constexpr size_t kNumBytes = (
N + 7) / 8;
4145 CopyBytes<kNumBytes>(bits, &mask_bits);
4147 mask_bits &= (1ull <<
N) - 1;
4163 const Vec128<uint64_t> b) {
4164 alignas(16) uint64_t mul[2];
4166 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0)),
4167 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
4168 return Load(Full128<uint64_t>(), mul);
4172 const Vec128<uint64_t> b) {
4173 alignas(16) uint64_t mul[2];
4175 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1)),
4176 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
4177 return Load(Full128<uint64_t>(), mul);
4184 Vec128<bfloat16_t, 2 * N> a,
4185 Vec128<bfloat16_t, 2 * N> b,
4186 const Vec128<float, N> sum0,
4187 Vec128<float, N>& sum1) {
4190 const Vec128<uint16_t, 2 * N> zero =
Zero(du16);
4191 const Vec128<uint32_t, N> a0 =
ZipLower(du32, zero,
BitCast(du16, a));
4192 const Vec128<uint32_t, N> a1 =
ZipUpper(du32, zero,
BitCast(du16, a));
4193 const Vec128<uint32_t, N> b0 =
ZipLower(du32, zero,
BitCast(du16, b));
4194 const Vec128<uint32_t, N> b1 =
ZipUpper(du32, zero,
BitCast(du16, b));
4204template <
typename T>
4209template <
typename T>
4214template <
typename T>
4216 const Vec128<T, 1>
v) {
4223template <
typename T>
4228template <
typename T>
4233template <
typename T>
4235 const Vec128<T, 2> v10) {
4236 return Max(v10, Vec128<T, 2>{
Shuffle2301(Vec128<T>{v10.raw}).raw});
4240template <
typename T>
4244 const Vec128<T> v31_20_31_20 = v3210 + v1032;
4246 return v20_31_20_31 + v31_20_31_20;
4248template <
typename T>
4254 return Min(v20_31_20_31, v31_20_31_20);
4256template <
typename T>
4258 const Vec128<T> v3210) {
4260 const Vec128<T> v31_20_31_20 =
Max(v3210, v1032);
4261 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
4262 return Max(v20_31_20_31, v31_20_31_20);
4268template <
typename T>
4274template <
typename T>
4278 return Min(v10, v01);
4280template <
typename T>
4282 const Vec128<T> v10) {
4284 return Max(v10, v01);
4288template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4293 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4296 return BitCast(
d,
Or(min, ShiftLeft<16>(min)));
4298template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4303 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4306 return BitCast(
d,
Or(min, ShiftLeft<16>(min)));
4312template <
typename T,
size_t N>
4316template <
typename T,
size_t N>
4320template <
typename T,
size_t N>
4327template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4330 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"Use u64");
4344 const Mask128<T, N> eqHL =
Eq(a, b);
4350 const Vec128<T, N> ltLx =
DupEven(ltHL);
4351 const Vec128<T, N> outHx =
IfThenElse(eqHL, ltLx, ltHL);
4355template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
4415HWY_API auto Eq(V a, V b) ->
decltype(a == b) {
4419HWY_API auto Ne(V a, V b) ->
decltype(a == b) {
4423HWY_API auto Lt(V a, V b) ->
decltype(a == b) {
4428HWY_API auto Gt(V a, V b) ->
decltype(a == b) {
4432HWY_API auto Ge(V a, V b) ->
decltype(a == b) {
4437HWY_API auto Le(V a, V b) ->
decltype(a == b) {
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
detail::Raw128< T >::type raw
Definition: wasm_128-inl.h:106
Raw raw
Definition: arm_neon-inl.h:814
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: wasm_128-inl.h:75
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: wasm_128-inl.h:81
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: wasm_128-inl.h:90
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: wasm_128-inl.h:87
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: wasm_128-inl.h:72
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: wasm_128-inl.h:84
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: wasm_128-inl.h:78
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition: wasm_128-inl.h:2144
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2463
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
constexpr __i8x16 BytesAbove()
Definition: wasm_128-inl.h:3476
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition: wasm_128-inl.h:1700
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:131
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2444
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
Vec128< T, 4/sizeof(T)> Vec32
Definition: arm_neon-inl.h:800
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:797
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: arm_neon-inl.h:3883
__v128_u raw
Definition: wasm_128-inl.h:2521
Definition: ops/shared-inl.h:40
HWY_INLINE __f32x4 operator()(__v128_u v)
Definition: wasm_128-inl.h:151
Definition: wasm_128-inl.h:146
HWY_INLINE __v128_u operator()(__v128_u v)
Definition: wasm_128-inl.h:147
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: wasm_128-inl.h:114
__f32x4 type
Definition: wasm_128-inl.h:60
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56