21 #include <wasm_simd128.h>
44 return *this = (*this * other);
47 return *this = (*this / other);
50 return *this = (*this + other);
53 return *this = (*this - other);
56 return *this = (*this & other);
59 return *this = (*this | other);
62 return *this = (*this ^ other);
77 template <typename T, typename FromT>
79 const Half<decltype(d)> dh;
139 template <typename T>
147 template <typename T, typename T2>
150 for (size_t i = 0; i < 16 / sizeof(T); ++i) {
151 lanes[i] = static_cast<T>(first + static_cast<T2>(i));
153 return Load(d, lanes);
353 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
360 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
368 return shifted & Set(d8, 0xFF >> kBits);
376 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
377 return (shifted ^ shifted_sign) - shifted_sign;
381 template <int kBits, typename T>
383 constexpr size_t kSizeInBits = sizeof(T) * 8;
384 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
385 if (kBits == 0) return v;
386 return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
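RotateRight is synthesized from the two shifts above. A scalar sketch of the same identity for an unsigned lane type (illustrative only, not part of this header):

// Rotate right by kBits: bits shifted out on the right re-enter on the left.
template <int kBits, typename T>  // T is assumed to be an unsigned integer type
T RotateRightScalar(T x) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  if (kBits == 0) return x;
  return static_cast<T>((x >> kBits) | (x << (kSizeInBits - kBits)));
}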
426 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
431 return shifted & Set(d8, (0xFF << bits) & 0xFF);
439 return shifted & Set(d8, 0xFF >> bits);
446 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
447 return (shifted ^ shifted_sign) - shifted_sign;
466 alignas(32) float min[4];
468 HWY_MIN(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
470 HWY_MIN(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));
485 alignas(32) float min[4];
487 HWY_MIN(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
489 HWY_MIN(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
514 alignas(32) float max[4];
516 HWY_MAX(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
518 HWY_MAX(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));
533 alignas(32) float max[4];
535 HWY_MAX(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
537 HWY_MAX(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
572 const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
573 const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
574 const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
575 const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
576 const auto l = wasm_i32x4_mul(al, bl);
577 const auto h = wasm_i32x4_mul(ah, bh);
579 return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
584 const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
585 const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
586 const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
587 const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
588 const auto l = wasm_i32x4_mul(al, bl);
589 const auto h = wasm_i32x4_mul(ah, bh);
591 return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
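Both MulHigh overloads widen the 16-bit lanes to 32 bits, multiply, and then keep the odd 16-bit halves of the products, i.e. the upper 16 bits. Per lane this corresponds to the following scalar sketch (illustrative only):

// Upper 16 bits of the widened product; the unsigned variant is analogous.
int16_t MulHighScalar(int16_t a, int16_t b) {
  const int32_t wide = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  return static_cast<int16_t>(wide >> 16);
}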
602 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
603 const auto ae = wasm_v128_and(a.raw, kEvenMask);
604 const auto be = wasm_v128_and(b.raw, kEvenMask);
610 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
611 const auto ae = wasm_v128_and(a.raw, kEvenMask);
612 const auto be = wasm_v128_and(b.raw, kEvenMask);
618 template <typename T, HWY_IF_FLOAT(T)>
664 return mul * x + add;
671 return add - mul * x;
679 return mul * x - sub;
686 return Neg(mul) * x - sub;
700 return one / Sqrt(v);
729 template <typename TFrom, typename TTo>
731 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
735 template <typename T>
737 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
738 return (v & bit) == bit;
828 const auto a32 = BitCast(d32, a);
829 const auto b32 = BitCast(d32, b);
831 const auto m_gt = a32 < b32;
834 const auto m_eq = a32 == b32;
835 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
836 const auto lo_gt = And(m_eq, lo_in_hi);
838 const auto gt = Or(lo_gt, m_gt);
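The 64-bit ordering above is assembled from 32-bit comparisons: the high halves decide unless they are equal, in which case the low halves decide, and the shuffle broadcasts the per-half result into both 32-bit lanes of each 64-bit lane. A scalar sketch of the selection logic (hypothetical helper, not part of this header):

// 64-bit "less than" from 32-bit halves: signed high half, unsigned low half.
bool Lt64(int64_t a, int64_t b) {
  const int32_t hi_a = static_cast<int32_t>(a >> 32);
  const int32_t hi_b = static_cast<int32_t>(b >> 32);
  if (hi_a != hi_b) return hi_a < hi_b;
  return static_cast<uint32_t>(a) < static_cast<uint32_t>(b);
}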
843 template <typename T, HWY_IF_UNSIGNED(T)>
847 const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
855 template <typename T>
874 template <typename T>
884 template <typename T>
891 template <typename T>
899 template <typename T>
906 template <typename T>
913 template <typename T>
920 template <typename T>
922 return Or(o, And(a1, a2));
927 template <typename T>
934 template <typename T>
939 template <typename T>
944 template <typename T>
951 template <typename T>
953 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
958 template <typename T>
960 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
966 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
977 template <typename T>
982 template <typename T>
988 template <typename T>
994 template <typename T>
1000 template <typename T>
1005 template <typename T>
1011 template <typename T, HWY_IF_FLOAT(T)>
1014 const auto zero = Zero(d);
1015 return IfThenElse(Mask256<T>{(v > zero).raw}, v, zero);
1020 template <typename T>
1025 template <typename T>
1026 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
1031 template <typename T>
1032 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
1037 template <typename T>
1038 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
1043 template <typename T>
1044 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
1059 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1066 test = ShiftLeft<12>(test);
1069 test = ShiftLeft<1>(test);
1073 test = ShiftLeft<1>(test);
1077 test = ShiftLeft<1>(test);
1084 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1091 test = ShiftLeft<27>(test);
1094 test = ShiftLeft<1>(test);
1098 test = ShiftLeft<1>(test);
1102 test = ShiftLeft<1>(test);
1106 test = ShiftLeft<1>(test);
1115 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1122 test = ShiftLeft<12>(test);
1125 test = ShiftLeft<1>(test);
1129 test = ShiftLeft<1>(test);
1133 test = ShiftLeft<1>(test);
1140 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1147 test = ShiftLeft<27>(test);
1150 test = ShiftLeft<1>(test);
1154 test = ShiftLeft<1>(test);
1158 test = ShiftLeft<1>(test);
1162 test = ShiftLeft<1>(test);
1173 template <typename T>
1175 return Vec256<T>{wasm_v128_load(aligned)};
1178 template <typename T>
1185 template <typename T>
1191 template <typename T>
1198 template <typename T>
1200 wasm_v128_store(aligned, v.raw);
1204 template <typename T>
1209 template <typename T>
1219 template <typename T>
1222 wasm_v128_store(aligned, v.raw);
1227 template <typename T, typename Offset>
1230 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1232 alignas(32) T lanes[32 / sizeof(T)];
1235 alignas(32) Offset offset_lanes[32 / sizeof(T)];
1238 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
1239 for (size_t i = 0; i < N; ++i) {
1240 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1244 template <typename T, typename Index>
1247 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1249 alignas(32) T lanes[32 / sizeof(T)];
1252 alignas(32) Index index_lanes[32 / sizeof(T)];
1255 for (size_t i = 0; i < N; ++i) {
1256 base[index_lanes[i]] = lanes[i];
1262 template <typename T, typename Offset>
1265 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1267 alignas(32) Offset offset_lanes[32 / sizeof(T)];
1270 alignas(32) T lanes[32 / sizeof(T)];
1271 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1272 for (size_t i = 0; i < N; ++i) {
1273 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1275 return Load(d, lanes);
1278 template <typename T, typename Index>
1281 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1283 alignas(32) Index index_lanes[32 / sizeof(T)];
1286 alignas(32) T lanes[32 / sizeof(T)];
1287 for (size_t i = 0; i < N; ++i) {
1288 lanes[i] = base[index_lanes[i]];
1290 return Load(d, lanes);
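Wasm SIMD has no gather/scatter instructions, so GatherOffset/GatherIndex and the Scatter variants above spill lanes and offsets to aligned arrays and copy element by element. A hedged usage sketch (table contents and descriptor are illustrative):

// Gather 32-bit elements from a table at per-lane indices (illustrative only).
const Full256<int32_t> d;
alignas(32) static const int32_t table[256] = {/* ... */};
const auto indices = Iota(d, 0);                 // lane i holds index i
const auto gathered = GatherIndex(d, table, indices);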
1299 return wasm_i8x16_extract_lane(v.raw, 0);
1302 return wasm_i8x16_extract_lane(v.raw, 0);
1305 return wasm_i16x8_extract_lane(v.raw, 0);
1308 return wasm_i16x8_extract_lane(v.raw, 0);
1311 return wasm_i32x4_extract_lane(v.raw, 0);
1314 return wasm_i32x4_extract_lane(v.raw, 0);
1317 return wasm_i64x2_extract_lane(v.raw, 0);
1320 return wasm_i64x2_extract_lane(v.raw, 0);
1324 return wasm_f32x4_extract_lane(v.raw, 0);
1329 template <typename T>
1334 template <typename T>
1342 template <int kBytes, typename T>
1344 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1345 const __i8x16 zero = wasm_i8x16_splat(0);
1351 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)};
1355 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)};
1359 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
1363 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
1367 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
1371 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
1375 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
1379 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
1383 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
1387 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
1391 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
1395 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
1399 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
1403 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1)};
1408 return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0)};
1415 template <int kBytes, typename T>
1422 template <int kLanes, typename T>
1428 template <int kLanes, typename T>
1437 template <int kBytes, typename T>
1439 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1440 const __i8x16 zero = wasm_i8x16_splat(0);
1447 return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1451 return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16);
1455 return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16);
1459 return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16);
1463 return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16);
1467 return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16);
1471 return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16);
1475 return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16);
1479 return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1483 return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1487 return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1491 return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1495 return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1499 return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1503 return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
1513 template <int kBytes, typename T>
1515 return Vec256<T>{detail::ShrBytes<kBytes>(v)};
1519 template <int kLanes, typename T>
1528 template <typename T>
1531 return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
1540 template <int kBytes, typename T, class V = Vec256<T>>
1542 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1548 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)};
1552 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)};
1556 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)};
1560 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)};
1564 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)};
1568 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)};
1572 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)};
1576 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)};
1580 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)};
1584 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)};
1588 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)};
1592 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)};
1596 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)};
1600 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)};
1604 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)};
1613 template <int kLane>
1615 static_assert(0 <= kLane && kLane < N, "Invalid lane");
1617 v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
1619 template <int kLane>
1621 static_assert(0 <= kLane && kLane < N, "Invalid lane");
1623 wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
1627 template <int kLane>
1629 static_assert(0 <= kLane && kLane < N, "Invalid lane");
1631 kLane, kLane, kLane, kLane, kLane)};
1633 template <int kLane>
1635 static_assert(0 <= kLane && kLane < N, "Invalid lane");
1637 wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
1641 template <int kLane>
1643 static_assert(0 <= kLane && kLane < N, "Invalid lane");
1645 wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
1652 template <typename T, typename TI>
1662 alignas(32) uint8_t control[16];
1663 alignas(32) uint8_t input[16];
1664 alignas(32) uint8_t output[16];
1665 wasm_v128_store(control, from.raw);
1666 wasm_v128_store(input, bytes.raw);
1667 for (size_t i = 0; i < 16; ++i) {
1668 output[i] = control[i] < 16 ? input[control[i]] : 0;
1674 template <typename T, typename TI>
1696 return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
1699 return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
1702 return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
1751 template <typename T>
1756 template <typename T, typename TI>
1758 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
1762 template <typename T, typename TI>
1764 const Rebind<TI, decltype(d)> di;
1768 template <typename T>
1778 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1784 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1790 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1798 template <typename T>
1805 template <typename T>
1812 template <typename T>
1821 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
1826 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
1839 19, 4, 20, 5, 21, 6, 22, 7, 23)};
1843 wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
1857 template <typename T, class V = Vec256<T>>
1869 11, 27, 12, 28, 13, 29, 14, 30, 15,
1875 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
1888 11, 27, 12, 28, 13, 29, 14, 30, 15,
1893 wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
1908 template <typename T, class V = Vec256<T>>
1917 template <typename T, class DW = RepartitionToWide<Full256<T>>>
1921 template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
1926 template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
1936 template <typename T>
1938 const Half<decltype(d)> d2;
1942 const VU lo{BitCast(du2, lo_half).raw};
1943 const VU hi{BitCast(du2, hi_half).raw};
1949 template <typename T>
1957 template <typename T>
1965 template <typename T>
1973 template <typename T>
1976 return CombineShiftRightBytes<8>(d, hi, lo);
1980 template <typename T>
1989 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1996 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2004 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2011 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2017 template <typename T>
2023 template <typename T>
2032 template <typename T>
2037 alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2038 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2041 template <typename T>
2044 return Vec256<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2046 template <typename T>
2051 template <typename T>
2059 template <typename T>
2068 template <typename T>
2075 template <typename T>
2082 template <typename T>
2099 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2108 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2127 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
2146 const auto sign = ShiftRight<15>(bits16);
2147 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2148 const auto mantissa = bits16 & Set(du32, 0x3FF);
2149 const auto subnormal =
2151 Set(df32, 1.0f / 16384 / 1024));
2153 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2154 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2155 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2156 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2157 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
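The promotion reassembles the binary32 fields from the binary16 fields: the exponent is rebiased by 127 - 15 and the 10-bit mantissa is shifted up to 23 bits, while subnormal inputs are instead rescaled by 1.0f / 16384 / 1024 (2^-24). A scalar sketch of the normal path only (illustrative; NaN/Inf and subnormals omitted):

#include <cstring>  // std::memcpy, assumed available in this sketch
float F16BitsToF32Normal(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  const uint32_t bits32 =
      (sign << 31) | ((biased_exp + 127 - 15) << 23) | (mantissa << (23 - 10));
  float result;
  std::memcpy(&result, &bits32, sizeof(result));
  return result;
}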
2162 const Rebind<uint16_t, decltype(df32)> du16;
2181 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2182 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2192 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2193 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2211 const auto bits32 = BitCast(du, v);
2212 const auto sign = ShiftRight<31>(bits32);
2213 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2214 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2216 const auto k15 = Set(di, 15);
2217 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2218 const auto is_tiny = exp < Set(di, -24);
2220 const auto is_subnormal = exp < Set(di, -14);
2221 const auto biased_exp16 =
2223 const auto sub_exp = BitCast(du, Set(di, -14) - exp);
2224 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2225 (mantissa32 >> (Set(du, 13) + sub_exp));
2227 ShiftRight<13>(mantissa32));
2229 const auto sign16 = ShiftLeft<15>(sign);
2230 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2237 const Rebind<int32_t, decltype(dbf16)> di32;
2238 const Rebind<uint32_t, decltype(dbf16)> du32;
2239 const Rebind<uint16_t, decltype(dbf16)> du16;
2240 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2247 const Repartition<uint32_t, decltype(dbf16)> du32;
2254 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2255 return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2280 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2285 const Vec256<T> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
2288 alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2289 1, 1, 1, 1, 1, 1, 1, 1};
2292 alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2293 1, 2, 4, 8, 16, 32, 64, 128};
2297 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2300 alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2304 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2307 alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2311 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2314 alignas(32) constexpr uint64_t kBit[8] = {1, 2};
2321 template <typename T>
2324 uint64_t mask_bits = 0;
2334 template <typename T>
2336 const Mask128<T> mask) {
2337 alignas(32) uint64_t lanes[2];
2338 wasm_v128_store(lanes, mask.raw);
2340 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2341 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2342 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
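Each mask byte is 0x00 or 0xFF, and 0xFF * kMagic equals 0x0102040810204080, so byte i of a 64-bit lane contributes exactly bit i to the top byte of the product; the shift by 56 (or by 48 plus the 0xFF00 mask for the upper eight lanes) extracts those bits. A scalar sketch of the trick (illustrative):

// Pack 8 mask bytes (each 0x00 or 0xFF) into 8 bits with one multiply.
uint64_t Bits8FromMaskBytes(uint64_t mask_bytes) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (mask_bytes * kMagic) >> 56;  // bit i is set iff byte i was 0xFF
}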
2346 template <typename T>
2350 const __i16x8 zero = wasm_i16x8_splat(0);
2355 template <typename T>
2358 const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
2359 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2360 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2361 alignas(32) uint32_t lanes[4];
2362 wasm_v128_store(lanes, sliced_mask);
2363 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2369 (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
2370 : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
2371 : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
2372 : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
2373 : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
2374 : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
2375 : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
2376 : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
2377 : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
2378 : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2380 : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2382 : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
2384 : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
2386 : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
2389 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
2391 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
2392 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
2395 template <typename T>
2400 template <typename T>
2405 template <typename T>
2410 template <typename T>
2412 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
2413 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
2414 alignas(32) uint64_t lanes[2];
2415 wasm_v128_store(lanes, shifted_bits);
2416 return PopCount(lanes[0] | lanes[1]);
2422 template <typename T>
2426 const size_t kNumBytes = (N + 7) / 8;
2427 CopyBytes<kNumBytes>(&mask_bits, bits);
2431 template <typename T>
2436 template <typename T>
2442 return !wasm_i8x16_any_true(v8.raw);
2445 return (wasm_i64x2_extract_lane(m.raw, 0) |
2446 wasm_i64x2_extract_lane(m.raw, 1)) == 0;
2452 template <typename T>
2454 return wasm_i8x16_all_true(m.raw);
2456 template <typename T>
2458 return wasm_i16x8_all_true(m.raw);
2460 template <typename T>
2462 return wasm_i32x4_all_true(m.raw);
2467 template <typename T>
2472 template <typename T>
2483 template <typename T>
2487 const Rebind<uint8_t, decltype(d)> d8;
2495 alignas(32) constexpr uint8_t table[256 * 8] = {
2496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
2497 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
2498 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
2499 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
2500 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
2501 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
2502 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
2503 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2504 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
2505 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
2506 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
2507 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
2508 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
2509 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
2510 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
2511 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
2512 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
2513 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
2514 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
2515 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
2516 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
2517 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
2518 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
2519 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2520 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
2521 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
2522 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
2523 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
2524 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
2525 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
2526 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
2527 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
2528 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
2529 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
2530 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
2531 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
2532 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
2533 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
2534 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
2535 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2536 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
2537 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
2538 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
2539 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
2540 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
2541 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
2542 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
2543 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
2544 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
2545 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
2546 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
2547 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
2548 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
2549 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
2550 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
2551 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2552 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
2553 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
2554 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
2555 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
2556 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
2557 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
2558 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
2559 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
2560 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
2561 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
2562 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
2563 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
2564 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
2565 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
2566 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
2567 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2568 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
2569 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
2570 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
2571 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
2572 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
2573 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
2574 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
2575 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
2576 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
2577 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
2578 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
2579 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
2580 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
2581 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
2582 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
2583 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2584 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
2585 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
2586 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
2587 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
2588 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
2589 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
2590 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
2591 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
2592 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
2593 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
2594 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
2595 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
2596 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
2597 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
2598 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
2599 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2600 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
2601 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
2602 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
2603 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
2604 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
2605 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
2606 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
2607 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
2608 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
2609 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
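The 2 KiB table is indexed by the 8-bit mask: row mask_bits (8 bytes) lists, in order, the byte offsets (0, 2, 4, ...) of the 16-bit lanes whose mask bit is set, so a byte shuffle can move the selected lanes to the front. For example, mask 0b0101 selects lanes 0 and 2 and its row begins 0, 4, 0, .... The end result corresponds to this scalar sketch (illustrative only):

// Scalar model of Compress for 16-bit lanes: keep lanes whose mask bit is set.
size_t Compress16Scalar(const uint16_t* lanes, uint64_t mask_bits,
                        uint16_t* out) {
  size_t count = 0;
  for (size_t i = 0; i < 8; ++i) {
    if (mask_bits & (1ull << i)) out[count++] = lanes[i];
  }
  return count;
}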
2616 template <typename T>
2621 alignas(32) constexpr uint8_t packed_array[16 * 16] = {
2622 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2623 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2624 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2625 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3,
2626 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2627 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
2628 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
2629 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
2630 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
2631 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
2632 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
2633 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3,
2634 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
2635 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
2636 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
2637 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2641 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2644 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2646 template <typename T>
2651 alignas(32) constexpr uint8_t packed_array[4 * 16] = {
2652 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
2653 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
2654 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
2655 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2659 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2667 template <typename T>
2669 const uint64_t mask_bits) {
2670 const auto idx = detail::Idx16x8FromBits<T>(mask_bits);
2676 template <typename T>
2678 const uint64_t mask_bits) {
2679 const auto idx = detail::Idx32x4FromBits<T>(mask_bits);
2685 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2687 template <typename T>
2690 const uint64_t mask_bits) {
2691 const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);
2701 template <typename T>
2702 struct CompressIsPartition {
2706 template <typename T>
2714 template <typename T>
2716 uint64_t mask_bits = 0;
2717 constexpr size_t kNumBytes = (N + 7) / 8;
2718 CopyBytes<kNumBytes>(bits, &mask_bits);
2720 mask_bits &= (1ull << N) - 1;
2727 template <typename T>
2737 template <typename T>
2741 using TU = TFromD<decltype(du)>;
2743 const size_t count = PopCount(mask_bits);
2754 template <typename T>
2757 uint64_t mask_bits = 0;
2758 constexpr size_t kNumBytes = (N + 7) / 8;
2759 CopyBytes<kNumBytes>(bits, &mask_bits);
2761 mask_bits &= (1ull << N) - 1;
2775 const auto k5 = Set(d, 5);
2776 const auto k6 = Set(d, 6);
2780 alignas(32) static constexpr uint8_t tbl_r0[16] = {
2781 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
2782 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
2783 alignas(32) static constexpr uint8_t tbl_g0[16] = {
2784 0x80, 0, 0x80, 0x80, 1, 0x80,
2785 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
2786 const auto shuf_r0 = Load(d, tbl_r0);
2787 const auto shuf_g0 = Load(d, tbl_g0);
2788 const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
2792 const auto int0 = r0 | g0 | b0;
2793 StoreU(int0, d, unaligned + 0 * 16);
2796 const auto shuf_r1 = shuf_b0 + k6;
2797 const auto shuf_g1 = shuf_r0 + k5;
2798 const auto shuf_b1 = shuf_g0 + k5;
2802 const auto int1 = r1 | g1 | b1;
2803 StoreU(int1, d, unaligned + 1 * 16);
2806 const auto shuf_r2 = shuf_b1 + k6;
2807 const auto shuf_g2 = shuf_r1 + k5;
2808 const auto shuf_b2 = shuf_g1 + k5;
2812 const auto int2 = r2 | g2 | b2;
2813 StoreU(int2, d, unaligned + 2 * 16);
2826 const auto ba0 = ZipLower(d16, v0, v1);
2827 const auto dc0 = ZipLower(d16, v2, v3);
2828 const auto ba8 = ZipUpper(d16, v0, v1);
2829 const auto dc8 = ZipUpper(d16, v2, v3);
2830 const auto dcba_0 = ZipLower(d32, ba0, dc0);
2831 const auto dcba_4 = ZipUpper(d32, ba0, dc0);
2832 const auto dcba_8 = ZipLower(d32, ba8, dc8);
2833 const auto dcba_C = ZipUpper(d32, ba8, dc8);
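The two Zip levels transpose four byte planes into interleaved order: zipping v0/v1 and v2/v3 as bytes yields 16-bit (b,a) and (d,c) pairs, and zipping those as 32-bit units yields one a,b,c,d quadruple per element, ready to store. In scalar terms the store performs the following (illustrative only):

// Interleave four byte planes a,b,c,d into AoS order (what the zips achieve).
void Interleave4Scalar(const uint8_t* a, const uint8_t* b, const uint8_t* c,
                       const uint8_t* d, size_t n, uint8_t* out) {
  for (size_t i = 0; i < n; ++i) {
    out[4 * i + 0] = a[i];
    out[4 * i + 1] = b[i];
    out[4 * i + 2] = c[i];
    out[4 * i + 3] = d[i];
  }
}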
2844 alignas(32) uint64_t mul[2];
2846 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
2847 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
2853 alignas(32) uint64_t mul[2];
2855 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
2856 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
2884 template <typename T>
2888 const Vec256<T> v31_20_31_20 = v3210 + v1032;
2890 return v20_31_20_31 + v31_20_31_20;
2892 template <typename T>
2898 return Min(v20_31_20_31, v31_20_31_20);
2900 template <typename T>
2906 return Max(v20_31_20_31, v31_20_31_20);
2911 template <typename T>
2917 template <typename T>
2921 return Min(v10, v01);
2923 template <typename T>
2927 return Max(v10, v01);
2931 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2935 const auto odd = ShiftRight<16>(BitCast(d32, v));
2940 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2944 const auto odd = ShiftRight<16>(BitCast(d32, v));
2953 template <typename T>
2957 template <typename T>
2961 template <typename T>
2968 template <typename T>
2971 template <typename T>
2974 template <typename T>