21#include <wasm_simd128.h>
44 return *
this = (*
this * other);
47 return *
this = (*
this / other);
50 return *
this = (*
this + other);
53 return *
this = (*
this - other);
56 return *
this = (*
this & other);
59 return *
this = (*
this | other);
62 return *
this = (*
this ^ other);
77template <
typename T,
typename FromT>
79 const Half<
decltype(
d)> dh;
147template <
typename T,
typename T2>
150 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
151 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
153 return Load(
d, lanes);
353template <
int kBits,
typename T, HWY_IF_LANE_SIZE(T, 1)>
360 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
368 return shifted &
Set(d8, 0xFF >> kBits);
376 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
377 return (shifted ^ shifted_sign) - shifted_sign;
381template <
int kBits,
typename T>
383 constexpr size_t kSizeInBits =
sizeof(T) * 8;
384 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
385 if (kBits == 0)
return v;
386 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
426template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
431 return shifted &
Set(d8, (0xFF << bits) & 0xFF);
439 return shifted &
Set(d8, 0xFF >> bits);
446 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> bits));
447 return (shifted ^ shifted_sign) - shifted_sign;
466 alignas(32)
float min[4];
468 HWY_MIN(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
470 HWY_MIN(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));
485 alignas(32)
float min[4];
487 HWY_MIN(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
489 HWY_MIN(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
514 alignas(32)
float max[4];
516 HWY_MAX(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
518 HWY_MAX(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));
533 alignas(32)
float max[4];
535 HWY_MAX(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
537 HWY_MAX(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
572 const auto al = wasm_u32x4_extend_low_u16x8(a.
raw);
573 const auto ah = wasm_u32x4_extend_high_u16x8(a.
raw);
574 const auto bl = wasm_u32x4_extend_low_u16x8(b.
raw);
575 const auto bh = wasm_u32x4_extend_high_u16x8(b.
raw);
576 const auto l = wasm_i32x4_mul(al, bl);
577 const auto h = wasm_i32x4_mul(ah, bh);
579 return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
584 const auto al = wasm_i32x4_extend_low_i16x8(a.
raw);
585 const auto ah = wasm_i32x4_extend_high_i16x8(a.
raw);
586 const auto bl = wasm_i32x4_extend_low_i16x8(b.
raw);
587 const auto bh = wasm_i32x4_extend_high_i16x8(b.
raw);
588 const auto l = wasm_i32x4_mul(al, bl);
589 const auto h = wasm_i32x4_mul(ah, bh);
591 return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
602 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
603 const auto ae = wasm_v128_and(a.
raw, kEvenMask);
604 const auto be = wasm_v128_and(b.
raw, kEvenMask);
610 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
611 const auto ae = wasm_v128_and(a.
raw, kEvenMask);
612 const auto be = wasm_v128_and(b.
raw, kEvenMask);
618template <
typename T, HWY_IF_FLOAT(T)>
664 return mul * x + add;
671 return add - mul * x;
679 return mul * x - sub;
686 return Neg(mul) * x - sub;
700 return one /
Sqrt(
v);
732template <
typename T, HWY_IF_FLOAT(T)>
742template <
typename T, HWY_IF_FLOAT(T)>
751 const VFromD<
decltype(di)> exp =
760template <
typename TFrom,
typename TTo>
762 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
768 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
769 return (
v & bit) == bit;
859 const auto a32 =
BitCast(d32, a);
860 const auto b32 =
BitCast(d32, b);
862 const auto m_gt = a32 < b32;
865 const auto m_eq = a32 == b32;
866 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
867 const auto lo_gt =
And(m_eq, lo_in_hi);
869 const auto gt =
Or(lo_gt, m_gt);
874template <
typename T, HWY_IF_UNSIGNED(T)>
878 const Vec256<T> msb =
Set(du, (LimitsMax<T>() >> 1) + 1);
953 return Or(o1,
Or(o2, o3));
960 return Or(o,
And(a1, a2));
991 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
998 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1004template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
1015template <
typename T>
1020template <
typename T>
1026template <
typename T>
1032template <
typename T>
1038template <
typename T>
1043template <
typename T>
1049template <
typename T, HWY_IF_FLOAT(T)>
1052 const auto zero =
Zero(
d);
1053 return IfThenElse(Mask256<T>{(
v > zero).raw},
v, zero);
1058template <
typename T>
1063template <
typename T>
1069template <
typename T>
1075template <
typename T>
1081template <
typename T>
1097template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1104 test = ShiftLeft<12>(test);
1107 test = ShiftLeft<1>(test);
1111 test = ShiftLeft<1>(test);
1115 test = ShiftLeft<1>(test);
1122template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1129 test = ShiftLeft<27>(test);
1132 test = ShiftLeft<1>(test);
1136 test = ShiftLeft<1>(test);
1140 test = ShiftLeft<1>(test);
1144 test = ShiftLeft<1>(test);
1153template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1160 test = ShiftLeft<12>(test);
1163 test = ShiftLeft<1>(test);
1167 test = ShiftLeft<1>(test);
1171 test = ShiftLeft<1>(test);
1178template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1185 test = ShiftLeft<27>(test);
1188 test = ShiftLeft<1>(test);
1192 test = ShiftLeft<1>(test);
1196 test = ShiftLeft<1>(test);
1200 test = ShiftLeft<1>(test);
1211template <
typename T>
1213 return Vec256<T>{wasm_v128_load(aligned)};
1216template <
typename T>
1223template <
typename T>
1229template <
typename T>
1236template <
typename T>
1238 wasm_v128_store(aligned,
v.raw);
1242template <
typename T>
1247template <
typename T>
1257template <
typename T>
1260 wasm_v128_store(aligned,
v.raw);
1265template <
typename T,
typename Offset>
1268 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1270 alignas(32) T lanes[32 /
sizeof(T)];
1273 alignas(32) Offset offset_lanes[32 /
sizeof(T)];
1276 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
1277 for (
size_t i = 0; i <
N; ++i) {
1278 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1282template <
typename T,
typename Index>
1285 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1287 alignas(32) T lanes[32 /
sizeof(T)];
1290 alignas(32) Index index_lanes[32 /
sizeof(T)];
1293 for (
size_t i = 0; i <
N; ++i) {
1294 base[index_lanes[i]] = lanes[i];
1300template <
typename T,
typename Offset>
1303 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1305 alignas(32) Offset offset_lanes[32 /
sizeof(T)];
1308 alignas(32) T lanes[32 /
sizeof(T)];
1309 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
1310 for (
size_t i = 0; i <
N; ++i) {
1311 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1313 return Load(
d, lanes);
1316template <
typename T,
typename Index>
1319 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1321 alignas(32) Index index_lanes[32 /
sizeof(T)];
1324 alignas(32) T lanes[32 /
sizeof(T)];
1325 for (
size_t i = 0; i <
N; ++i) {
1326 lanes[i] = base[index_lanes[i]];
1328 return Load(
d, lanes);
1334template <
typename T,
size_t N>
1340template <
typename T,
size_t N>
1348 return wasm_i8x16_extract_lane(
v.raw, 0);
1351 return wasm_i8x16_extract_lane(
v.raw, 0);
1354 return wasm_i16x8_extract_lane(
v.raw, 0);
1357 return wasm_i16x8_extract_lane(
v.raw, 0);
1360 return wasm_i32x4_extract_lane(
v.raw, 0);
1363 return wasm_i32x4_extract_lane(
v.raw, 0);
1366 return wasm_i64x2_extract_lane(
v.raw, 0);
1369 return wasm_i64x2_extract_lane(
v.raw, 0);
1373 return wasm_f32x4_extract_lane(
v.raw, 0);
1378template <
typename T>
1383template <
typename T>
1391template <
int kBytes,
typename T>
1393 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1394 const __i8x16 zero = wasm_i8x16_splat(0);
1400 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
1401 7, 8, 9, 10, 11, 12, 13, 14)};
1404 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
1405 6, 7, 8, 9, 10, 11, 12, 13)};
1408 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
1409 4, 5, 6, 7, 8, 9, 10, 11, 12)};
1412 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
1413 3, 4, 5, 6, 7, 8, 9, 10, 11)};
1416 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
1417 2, 3, 4, 5, 6, 7, 8, 9, 10)};
1420 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1421 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
1424 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1425 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
1428 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1429 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
1432 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1433 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
1436 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1437 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
1440 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1441 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
1444 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1445 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
1448 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1449 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
1452 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1453 16, 16, 16, 16, 16, 16, 16, 16, 0,
1457 return Vec256<T>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16,
1458 16, 16, 16, 16, 16, 16, 16, 16, 16,
1464template <
int kBytes,
typename T>
1471template <
int kLanes,
typename T>
1477template <
int kLanes,
typename T>
1486template <
int kBytes,
typename T>
1488 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1489 const __i8x16 zero = wasm_i8x16_splat(0);
1496 return wasm_i8x16_shuffle(
v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1497 12, 13, 14, 15, 16);
1500 return wasm_i8x16_shuffle(
v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1501 13, 14, 15, 16, 16);
1504 return wasm_i8x16_shuffle(
v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1505 13, 14, 15, 16, 16, 16);
1508 return wasm_i8x16_shuffle(
v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1509 14, 15, 16, 16, 16, 16);
1512 return wasm_i8x16_shuffle(
v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1513 15, 16, 16, 16, 16, 16);
1516 return wasm_i8x16_shuffle(
v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1517 16, 16, 16, 16, 16, 16);
1520 return wasm_i8x16_shuffle(
v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1521 16, 16, 16, 16, 16, 16, 16);
1524 return wasm_i8x16_shuffle(
v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1525 16, 16, 16, 16, 16, 16, 16);
1528 return wasm_i8x16_shuffle(
v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
1529 16, 16, 16, 16, 16, 16, 16);
1532 return wasm_i8x16_shuffle(
v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
1533 16, 16, 16, 16, 16, 16, 16);
1536 return wasm_i8x16_shuffle(
v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
1537 16, 16, 16, 16, 16, 16, 16);
1540 return wasm_i8x16_shuffle(
v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
1541 16, 16, 16, 16, 16, 16, 16);
1544 return wasm_i8x16_shuffle(
v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
1545 16, 16, 16, 16, 16, 16, 16);
1548 return wasm_i8x16_shuffle(
v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
1549 16, 16, 16, 16, 16, 16, 16);
1552 return wasm_i8x16_shuffle(
v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
1553 16, 16, 16, 16, 16, 16, 16);
1562template <
int kBytes,
typename T>
1564 return Vec256<T>{detail::ShrBytes<kBytes>(
v)};
1568template <
int kLanes,
typename T>
1577template <
typename T>
1580 return Vec128<T, 8 /
sizeof(T)>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
1589template <
int kBytes,
typename T,
class V = Vec256<T>>
1591 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1597 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1598 11, 12, 13, 14, 15, 16)};
1601 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1602 11, 12, 13, 14, 15, 16, 17)};
1605 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1606 12, 13, 14, 15, 16, 17, 18)};
1609 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1610 13, 14, 15, 16, 17, 18, 19)};
1613 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1614 14, 15, 16, 17, 18, 19, 20)};
1617 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
1618 14, 15, 16, 17, 18, 19, 20, 21)};
1621 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
1622 15, 16, 17, 18, 19, 20, 21, 22)};
1625 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
1626 16, 17, 18, 19, 20, 21, 22, 23)};
1629 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
1630 17, 18, 19, 20, 21, 22, 23, 24)};
1633 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
1634 17, 18, 19, 20, 21, 22, 23, 24, 25)};
1637 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
1638 18, 19, 20, 21, 22, 23, 24, 25, 26)};
1641 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
1642 19, 20, 21, 22, 23, 24, 25, 26, 27)};
1645 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
1646 20, 21, 22, 23, 24, 25, 26, 27, 28)};
1649 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
1650 21, 22, 23, 24, 25, 26, 27, 28, 29)};
1653 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
1654 22, 23, 24, 25, 26, 27, 28, 29, 30)};
1664 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
1666 v.raw,
v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
1670 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
1672 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
1678 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
1680 kLane, kLane, kLane, kLane, kLane)};
1684 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
1686 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
1692 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
1694 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
1701template <
typename T,
typename TI>
1711 alignas(32) uint8_t control[16];
1712 alignas(32) uint8_t input[16];
1713 alignas(32) uint8_t output[16];
1714 wasm_v128_store(control, from.
raw);
1715 wasm_v128_store(input, bytes.
raw);
1716 for (
size_t i = 0; i < 16; ++i) {
1717 output[i] = control[i] < 16 ? input[control[i]] : 0;
1723template <
typename T,
typename TI>
1745 return Vec128<uint32_t>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
1748 return Vec128<int32_t>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
1751 return Vec128<float>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
1800template <
typename T>
1805template <
typename T,
typename TI>
1807 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
1811template <
typename T,
typename TI>
1813 const Rebind<TI,
decltype(
d)> di;
1817template <
typename T>
1827template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1833template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1839template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1847template <
typename T>
1854template <
typename T>
1861template <
typename T>
1870 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
1875 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
1888 19, 4, 20, 5, 21, 6, 22, 7, 23)};
1892 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
1906template <
typename T,
class V = Vec256<T>>
1918 11, 27, 12, 28, 13, 29, 14, 30, 15,
1924 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
1937 11, 27, 12, 28, 13, 29, 14, 30, 15,
1942 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
1957template <
typename T,
class V = Vec256<T>>
1966template <
typename T,
class DW = RepartitionToW
ide<Full256<T>>>
1970template <
typename T,
class D = Full256<T>,
class DW = RepartitionToW
ide<D>>
1975template <
typename T,
class D = Full256<T>,
class DW = RepartitionToW
ide<D>>
1985template <
typename T>
1987 const Half<
decltype(
d)> d2;
1991 const VU lo{
BitCast(du2, lo_half).raw};
1992 const VU hi{
BitCast(du2, hi_half).raw};
1998template <
typename T>
2006template <
typename T>
2014template <
typename T>
2022template <
typename T>
2025 return CombineShiftRightBytes<8>(
d, hi, lo);
2029template <
typename T>
2038template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2045template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2053template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2060template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2066template <
typename T>
2072template <
typename T>
2081template <
typename T>
2086 alignas(32)
constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2087 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2090template <
typename T>
2093 return Vec256<T>{wasm_i16x8_shuffle(a.
raw, b.
raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2095template <
typename T>
2100template <
typename T>
2108template <
typename T>
2117template <
typename T>
2124template <
typename T>
2131template <
typename T>
2148 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
2157 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
2176 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(
v.raw))};
2195 const auto sign = ShiftRight<15>(bits16);
2196 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
2197 const auto mantissa = bits16 &
Set(du32, 0x3FF);
2198 const auto subnormal =
2200 Set(df32, 1.0f / 16384 / 1024));
2202 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
2203 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
2204 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2205 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
2206 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2211 const Rebind<uint16_t,
decltype(df32)> du16;
2230 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
2231 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2241 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
2242 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2260 const auto bits32 =
BitCast(du,
v);
2261 const auto sign = ShiftRight<31>(bits32);
2262 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
2263 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
2265 const auto k15 =
Set(di, 15);
2266 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
2267 const auto is_tiny = exp <
Set(di, -24);
2269 const auto is_subnormal = exp <
Set(di, -14);
2270 const auto biased_exp16 =
2272 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
2273 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
2274 (mantissa32 >> (
Set(du, 13) + sub_exp));
2276 ShiftRight<13>(mantissa32));
2278 const auto sign16 = ShiftLeft<15>(sign);
2279 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2286 const Rebind<int32_t,
decltype(dbf16)> di32;
2287 const Rebind<uint32_t,
decltype(dbf16)> du32;
2288 const Rebind<uint16_t,
decltype(dbf16)> du16;
2289 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
2296 const Repartition<uint32_t,
decltype(dbf16)> du32;
2303 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
2304 return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2329template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2334 const Vec256<T> vbits{wasm_i32x4_splat(
static_cast<int32_t
>(bits))};
2337 alignas(32)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2338 1, 1, 1, 1, 1, 1, 1, 1};
2341 alignas(32)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2342 1, 2, 4, 8, 16, 32, 64, 128};
2346template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2349 alignas(32)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2353template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2356 alignas(32)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2360template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2363 alignas(32)
constexpr uint64_t kBit[8] = {1, 2};
2370template <
typename T>
2373 uint64_t mask_bits = 0;
2383template <
typename T>
2385 const Mask128<T> mask) {
2386 alignas(32) uint64_t lanes[2];
2387 wasm_v128_store(lanes, mask.raw);
2389 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2390 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2391 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2395template <
typename T>
2399 const __i16x8 zero = wasm_i16x8_splat(0);
2404template <
typename T>
2407 const __i32x4 mask_i =
static_cast<__i32x4
>(mask.
raw);
2408 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2409 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2410 alignas(32) uint32_t lanes[4];
2411 wasm_v128_store(lanes, sliced_mask);
2412 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2418 (
N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
2419 : (
N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
2420 : (
N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
2421 : (
N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
2422 : (
N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
2423 : (
N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
2424 : (
N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
2425 : (
N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
2426 : (
N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
2427 : (
N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2429 : (
N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2431 : (
N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
2433 : (
N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
2435 : (
N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
2438 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
2440 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
2441 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
2444template <
typename T>
2449template <
typename T>
2454template <
typename T>
2459template <
typename T>
2461 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
2462 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
2463 alignas(32) uint64_t lanes[2];
2464 wasm_v128_store(lanes, shifted_bits);
2465 return PopCount(lanes[0] | lanes[1]);
2471template <
typename T>
2475 const size_t kNumBytes = (
N + 7) / 8;
2476 CopyBytes<kNumBytes>(&mask_bits, bits);
2480template <
typename T>
2485template <
typename T>
2491 return !wasm_i8x16_any_true(v8.raw);
2494 return (wasm_i64x2_extract_lane(m.
raw, 0) |
2495 wasm_i64x2_extract_lane(m.
raw, 1)) == 0;
2501template <
typename T>
2503 return wasm_i8x16_all_true(m.raw);
2505template <
typename T>
2507 return wasm_i16x8_all_true(m.raw);
2509template <
typename T>
2511 return wasm_i32x4_all_true(m.raw);
2516template <
typename T>
2521template <
typename T>
2532template <
typename T>
2536 const Rebind<uint8_t,
decltype(
d)> d8;
2544 alignas(32)
constexpr uint8_t table[256 * 8] = {
2545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
2546 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
2547 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
2548 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
2549 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
2550 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
2551 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
2552 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2553 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
2554 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
2555 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
2556 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
2557 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
2558 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
2559 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
2560 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
2561 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
2562 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
2563 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
2564 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
2565 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
2566 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
2567 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
2568 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2569 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
2570 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
2571 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
2572 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
2573 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
2574 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
2575 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
2576 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
2577 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
2578 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
2579 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
2580 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
2581 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
2582 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
2583 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
2584 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2585 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
2586 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
2587 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
2588 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
2589 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
2590 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
2591 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
2592 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
2593 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
2594 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
2595 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
2596 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
2597 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
2598 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
2599 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
2600 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2601 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
2602 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
2603 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
2604 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
2605 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
2606 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
2607 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
2608 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
2609 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
2610 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
2611 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
2612 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
2613 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
2614 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
2615 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
2616 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2617 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
2618 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
2619 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
2620 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
2621 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
2622 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
2623 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
2624 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
2625 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
2626 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
2627 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
2628 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
2629 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
2630 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
2631 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
2632 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2633 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
2634 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
2635 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
2636 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
2637 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
2638 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
2639 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
2640 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
2641 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
2642 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
2643 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
2644 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
2645 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
2646 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
2647 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
2648 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2649 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
2650 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
2651 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
2652 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
2653 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
2654 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
2655 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
2656 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
2657 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
2658 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
2665template <
typename T>
2670 alignas(32)
constexpr uint8_t packed_array[16 * 16] = {
2671 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2672 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2673 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2674 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3,
2675 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2676 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
2677 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
2678 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
2679 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2680 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
2681 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
2682 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3,
2683 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
2684 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
2685 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
2686 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2690 return BitCast(
d,
Load(d8, packed_array + 16 * mask_bits));
2693#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2695template <
typename T>
2700 alignas(32)
constexpr uint8_t packed_array[4 * 16] = {
2701 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
2702 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
2703 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
2704 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2708 return BitCast(
d,
Load(d8, packed_array + 16 * mask_bits));
2716template <
typename T>
2718 const uint64_t mask_bits) {
2719 const auto idx = detail::Idx16x8FromBits<T>(mask_bits);
2725template <
typename T>
2727 const uint64_t mask_bits) {
2728 const auto idx = detail::Idx32x4FromBits<T>(mask_bits);
2734#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2736template <
typename T>
2739 const uint64_t mask_bits) {
2740 const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);
2750template <
typename T>
2751struct CompressIsPartition {
2755template <
typename T>
2762template <
typename T>
2775template <
typename T>
2777 uint64_t mask_bits = 0;
2778 constexpr size_t kNumBytes = (
N + 7) / 8;
2779 CopyBytes<kNumBytes>(bits, &mask_bits);
2781 mask_bits &= (1ull <<
N) - 1;
2788template <
typename T>
2798template <
typename T>
2802 using TU =
TFromD<
decltype(du)>;
2804 const size_t count =
PopCount(mask_bits);
2815template <
typename T>
2818 uint64_t mask_bits = 0;
2819 constexpr size_t kNumBytes = (
N + 7) / 8;
2820 CopyBytes<kNumBytes>(bits, &mask_bits);
2822 mask_bits &= (1ull <<
N) - 1;
2839 alignas(32) uint64_t mul[2];
2841 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.
raw, 0)),
2842 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.
raw, 0)), &mul[1]);
2848 alignas(32) uint64_t mul[2];
2850 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.
raw, 1)),
2851 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.
raw, 1)), &mul[1]);
2879template <
typename T>
2883 const Vec256<T> v31_20_31_20 = v3210 + v1032;
2885 return v20_31_20_31 + v31_20_31_20;
2887template <
typename T>
2893 return Min(v20_31_20_31, v31_20_31_20);
2895template <
typename T>
2901 return Max(v20_31_20_31, v31_20_31_20);
2906template <
typename T>
2912template <
typename T>
2916 return Min(v10, v01);
2918template <
typename T>
2922 return Max(v10, v01);
2926template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2930 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
2935template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2939 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
2948template <
typename T>
2952template <
typename T>
2956template <
typename T>
2963template <
typename T>
2966template <
typename T>
2969template <
typename T>
2972template <
typename T>
2975template <
typename T>
2978template <
typename T>
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_ABORT(format,...)
Definition: base.h:141
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
Raw raw
Definition: arm_neon-inl.h:814
Definition: arm_neon-inl.h:760
Raw raw
Definition: arm_neon-inl.h:793
Definition: wasm_256-inl.h:39
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition: wasm_256-inl.h:61
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition: wasm_256-inl.h:55
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition: wasm_256-inl.h:52
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition: wasm_256-inl.h:49
Raw raw
Definition: x86_256-inl.h:100
Vec128< T > v1
Definition: wasm_256-inl.h:66
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition: wasm_256-inl.h:58
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition: wasm_256-inl.h:46
Vec128< T > v0
Definition: wasm_256-inl.h:65
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition: wasm_256-inl.h:43
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_API __i8x16 ShrBytes(const Vec128< T, N > v)
Definition: wasm_128-inl.h:2144
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
constexpr __i8x16 BytesAbove()
Definition: wasm_128-inl.h:3476
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec256< T > Idx16x8FromBits(const uint64_t mask_bits)
Definition: wasm_256-inl.h:2533
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE Vec256< T > Idx32x4FromBits(const uint64_t mask_bits)
Definition: wasm_256-inl.h:2666
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:565
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:548
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1089
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
@ value
Definition: arm_neon-inl.h:5319
Definition: wasm_256-inl.h:1801
__v128_u raw
Definition: wasm_256-inl.h:1802
Definition: wasm_256-inl.h:70
Mask128< T > m1
Definition: wasm_256-inl.h:72
Raw raw
Definition: x86_256-inl.h:137
Mask128< T > m0
Definition: wasm_256-inl.h:71
Definition: ops/shared-inl.h:40