// Template-header pieces for generated wrappers. HWY_NEON_DEF_FUNCTION pastes
// its `args` value (1, 2 or 3) onto HWY_NEON_BUILD_TPL_ to pick one of these.
// All three expand to nothing, so the generated function has no template
// header.
39#define HWY_NEON_BUILD_TPL_1
40#define HWY_NEON_BUILD_TPL_2
41#define HWY_NEON_BUILD_TPL_3
// Return-type pieces for wrappers taking 1, 2 or 3 vector arguments. In every
// case the result is a Vec128 with lane type `type##_t` (e.g. uint8 -> uint8_t)
// and `size` lanes.
45#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
46#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
47#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
// Parameter-list pieces: one, two or three const Vec128<type##_t, size>
// parameters named a, b and c. The names must match those used by the
// corresponding HWY_NEON_BUILD_ARG_* macro.
50#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
51#define HWY_NEON_BUILD_PARAM_2(type, size) \
52 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
53#define HWY_NEON_BUILD_PARAM_3(type, size) \
54 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
55 const Vec128<type##_t, size> c
// Call-argument pieces: forward the `.raw` member of each wrapper parameter
// (a, b, c) to the underlying intrinsic, in declaration order.
59#define HWY_NEON_BUILD_ARG_1 a.raw
60#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
61#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
// Forms the call func(<remaining arguments>). Going through this extra macro
// means the argument macros (e.g. HWY_NEON_BUILD_ARG_2 -> `a.raw, b.raw`) are
// expanded before the call expression is rescanned.
// NOTE(review): presumably needed so `func` may itself be a function-like
// macro taking several arguments - confirm against the call sites.
70#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
// Generates one wrapper function called `name` for lane type `type##_t` and
// lane count `size`:
//  - the template header, return type and parameter list are chosen by
//    concatenating `args` onto HWY_NEON_BUILD_TPL_/RET_/PARAM_ via HWY_CONCAT;
//  - the call arguments come from HWY_NEON_BUILD_ARG_##args;
//  - the body constructs the return type from a call to the intrinsic whose
//    name is the token paste prefix##infix##suffix.
// NOTE(review): the line that closes the generated function body is not
// visible in this excerpt; the last visible line still ends in a continuation
// backslash.
76#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
77 HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
78 HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
79 name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
80 return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
81 HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
// uint8_t family: instantiates HWY_NEON_DEF_FUNCTION for 16, 8, 4, 2 and 1
// lanes. Only the 16-lane variant appends `q` to the intrinsic prefix; all
// smaller lane counts share the un-suffixed prefix. Intrinsic suffix is u8.
91#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
92 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
93 HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
94 HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
95 HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
96 HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
// int8_t family: 16 lanes (prefix##q) plus 8, 4, 2 and 1 lanes (plain prefix).
// Intrinsic suffix is s8.
99#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
100 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
101 HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
102 HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
103 HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
104 HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
// uint16_t family: 8 lanes (prefix##q) plus 4, 2 and 1 lanes (plain prefix).
// Intrinsic suffix is u16.
107#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
108 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
109 HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
110 HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
111 HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
// int16_t family: 8 lanes (prefix##q) plus 4, 2 and 1 lanes (plain prefix).
// Intrinsic suffix is s16.
114#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
115 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
116 HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
117 HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
118 HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
// uint32_t family: 4 lanes (prefix##q) plus 2 and 1 lanes (plain prefix).
// Intrinsic suffix is u32.
121#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
122 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
123 HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
124 HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
// int32_t family: 4 lanes (prefix##q) plus 2 and 1 lanes (plain prefix).
// Intrinsic suffix is s32.
127#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
128 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
129 HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
130 HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
// uint64_t family: 2 lanes (prefix##q) and 1 lane (plain prefix).
// Intrinsic suffix is u64.
133#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
134 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
135 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
// int64_t family: 2 lanes (prefix##q) and 1 lane (plain prefix).
// Intrinsic suffix is s64.
138#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
139 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
140 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
// float32_t family: 4 lanes (prefix##q) plus 2 and 1 lanes (plain prefix).
// Intrinsic suffix is f32.
143#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
144 HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
145 HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
146 HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
// float64_t family: 2 lanes (prefix##q) and 1 lane (plain prefix).
// Intrinsic suffix is f64.
150#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
151 HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
152 HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
// Alternative definition with an empty expansion: no float64 wrappers are
// generated when this one is active.
// NOTE(review): the preprocessor conditional that presumably selects between
// the two definitions is not visible in this excerpt - confirm.
154#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
// All floating-point families: float32 followed by float64 (the latter may
// expand to nothing, depending on which FLOAT_64 definition is active).
159#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
160 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
161 HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
// Unsigned integer families of 8, 16 and 32 bits (64-bit excluded).
165#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
166 HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
167 HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
168 HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
// Signed integer families of 8, 16 and 32 bits (64-bit excluded).
171#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
172 HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
173 HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
174 HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
// Every unsigned integer family: 8/16/32-bit plus 64-bit.
177#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
178 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
179 HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
// Every signed integer family: 8/16/32-bit plus 64-bit.
182#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
183 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
184 HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
// Every integer family: all signed widths followed by all unsigned widths.
187#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
188 HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
189 HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
// Every lane type handled by these helpers: all integer families followed by
// all floating-point families.
192#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
193 HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
194 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
// Unsigned, signed (Int) and Float families restricted to 8/16/32-bit lanes:
// no 64-bit integers and no float64.
196#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
197 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
198 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
199 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
// vuzp1*/vuzp2* expressed through the two-result vuzp*/vuzpq* intrinsics:
// vuzp1 selects element val[0] of the returned pair and vuzp2 selects val[1].
// Defined for s8/u8/s16/u16/s32/u32/f32, in both the plain and the `q` form.
// NOTE(review): presumably an emulation for targets that lack native
// vuzp1/vuzp2 intrinsics; the guarding #if is not visible in this excerpt.
203#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
204#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
205#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
206#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
207#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
208#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
209#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
210#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
211#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
212#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
213#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
214#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
215#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
216#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
// Second half of each pair.
217#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
218#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
219#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
220#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
221#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
222#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
223#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
224#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
225#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
226#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
227#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
228#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
229#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
230#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
// vzip1*/vzip2* expressed through the two-result vzip*/vzipq* intrinsics:
// vzip1 selects element val[0] of the returned pair and vzip2 selects val[1].
// Defined for s8/u8/s16/u16/s32/u32/f32, in both the plain and the `q` form.
// NOTE(review): presumably an emulation for targets that lack native
// vzip1/vzip2 intrinsics; the guarding #if is not visible in this excerpt.
231#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
232#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
233#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
234#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
235#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
236#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
237#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
238#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
239#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
240#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
241#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
242#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
243#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
244#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
// Second half of each pair.
245#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
246#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
247#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
248#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
249#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
250#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
251#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
252#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
253#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
254#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
255#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
256#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
257#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
258#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
264template <
typename T,
size_t N>
266template <
typename T,
size_t N>
268template <
typename T,
size_t N>
574template <
typename T,
size_t N>
636 using type = float64x2_t;
699 using type = float64x1_t;
759template <
typename T,
size_t N = 16 /
sizeof(T)>
772 return *
this = (*
this * other);
775 return *
this = (*
this / other);
778 return *
this = (*
this + other);
781 return *
this = (*
this - other);
784 return *
this = (*
this & other);
787 return *
this = (*
this | other);
790 return *
this = (*
this ^ other);
803template <
typename T,
size_t N = 16 /
sizeof(T)>
824 template <
typename T,
size_t N>
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_CAST_TO_U8: no template header, one by-value vector parameter `v`
// whose `.raw` is forwarded, and a uint8_t result vector whose lane count is
// the input's size in bytes (size * sizeof(type##_t)).
844#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
845#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
846 Vec128<uint8_t, size * sizeof(type##_t)>
847#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
848#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
// Remove the pieces again so they do not leak past their point of use.
873#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
874#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
875#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
876#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
886template <
size_t N, HWY_IF_LE64(
int8_t, N)>
891template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
896template <
size_t N, HWY_IF_LE64(
int16_t, N)>
901template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
906template <
size_t N, HWY_IF_LE64(
int32_t, N)>
911template <
size_t N, HWY_IF_LE64(
float, N)>
987template <
typename T,
size_t N,
typename FromT>
989 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_SET1: no template header; parameters are an unnamed
// Simd<type##_t, size, 0> tag followed by a scalar `t`; only `t` is forwarded
// to the intrinsic; the result is Vec128<type##_t, size>.
996#define HWY_NEON_BUILD_TPL_HWY_SET1
997#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
998#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
999 Simd<type##_t, size, 0> , const type##_t t
1000#define HWY_NEON_BUILD_ARG_HWY_SET1 t
// Remove the pieces again so they do not leak past their point of use.
1004#undef HWY_NEON_BUILD_TPL_HWY_SET1
1005#undef HWY_NEON_BUILD_RET_HWY_SET1
1006#undef HWY_NEON_BUILD_PARAM_HWY_SET1
1007#undef HWY_NEON_BUILD_ARG_HWY_SET1
1010template <
typename T,
size_t N>
1024template <
typename T,
size_t N>
1034template <
typename T,
size_t N,
typename T2>
1037 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
1038 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
1040 return Load(
d, lanes);
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_GET: the generated function is a template on `size_t kLane`,
// takes a vector `v` by value, forwards (v.raw, kLane) to the intrinsic and
// returns a scalar type##_t rather than a vector.
1046#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
1047#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
1048#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
1049#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
// Remove the pieces again so they do not leak past their point of use.
1053#undef HWY_NEON_BUILD_TPL_HWY_GET
1054#undef HWY_NEON_BUILD_RET_HWY_GET
1055#undef HWY_NEON_BUILD_PARAM_HWY_GET
1056#undef HWY_NEON_BUILD_ARG_HWY_GET
1062 return detail::GetLane<0>(
v);
1069template <
typename T>
1073 return detail::GetLane<0>(
v);
1076template <
typename T>
1078#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1079 if (__builtin_constant_p(i)) {
1082 return detail::GetLane<0>(
v);
1084 return detail::GetLane<1>(
v);
1088 alignas(16) T lanes[2];
1093template <
typename T>
1095#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1096 if (__builtin_constant_p(i)) {
1099 return detail::GetLane<0>(
v);
1101 return detail::GetLane<1>(
v);
1103 return detail::GetLane<2>(
v);
1105 return detail::GetLane<3>(
v);
1109 alignas(16) T lanes[4];
1114template <
typename T>
1116#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1117 if (__builtin_constant_p(i)) {
1120 return detail::GetLane<0>(
v);
1122 return detail::GetLane<1>(
v);
1124 return detail::GetLane<2>(
v);
1126 return detail::GetLane<3>(
v);
1128 return detail::GetLane<4>(
v);
1130 return detail::GetLane<5>(
v);
1132 return detail::GetLane<6>(
v);
1134 return detail::GetLane<7>(
v);
1138 alignas(16) T lanes[8];
1143template <
typename T>
1145#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1146 if (__builtin_constant_p(i)) {
1149 return detail::GetLane<0>(
v);
1151 return detail::GetLane<1>(
v);
1153 return detail::GetLane<2>(
v);
1155 return detail::GetLane<3>(
v);
1157 return detail::GetLane<4>(
v);
1159 return detail::GetLane<5>(
v);
1161 return detail::GetLane<6>(
v);
1163 return detail::GetLane<7>(
v);
1165 return detail::GetLane<8>(
v);
1167 return detail::GetLane<9>(
v);
1169 return detail::GetLane<10>(
v);
1171 return detail::GetLane<11>(
v);
1173 return detail::GetLane<12>(
v);
1175 return detail::GetLane<13>(
v);
1177 return detail::GetLane<14>(
v);
1179 return detail::GetLane<15>(
v);
1183 alignas(16) T lanes[16];
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_INSERT: the generated function is a template on `size_t kLane`
// and takes a vector `v` plus a scalar `t`. Note the forwarded argument order
// differs from the parameter order: the scalar comes first, then v.raw, then
// kLane. The result is Vec128<type##_t, size>.
1191#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
1192#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
1193#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1194 Vec128<type##_t, size> v, type##_t t
1195#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
// Remove the pieces again so they do not leak past their point of use.
1199#undef HWY_NEON_BUILD_TPL_HWY_INSERT
1200#undef HWY_NEON_BUILD_RET_HWY_INSERT
1201#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1202#undef HWY_NEON_BUILD_ARG_HWY_INSERT
1209template <
typename T>
1216template <
typename T>
1218#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1219 if (__builtin_constant_p(i)) {
1222 return detail::InsertLane<0>(
v, t);
1224 return detail::InsertLane<1>(
v, t);
1229 alignas(16) T lanes[2];
1232 return Load(
d, lanes);
1235template <
typename T>
1237#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1238 if (__builtin_constant_p(i)) {
1241 return detail::InsertLane<0>(
v, t);
1243 return detail::InsertLane<1>(
v, t);
1245 return detail::InsertLane<2>(
v, t);
1247 return detail::InsertLane<3>(
v, t);
1252 alignas(16) T lanes[4];
1255 return Load(
d, lanes);
1258template <
typename T>
1260#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1261 if (__builtin_constant_p(i)) {
1264 return detail::InsertLane<0>(
v, t);
1266 return detail::InsertLane<1>(
v, t);
1268 return detail::InsertLane<2>(
v, t);
1270 return detail::InsertLane<3>(
v, t);
1272 return detail::InsertLane<4>(
v, t);
1274 return detail::InsertLane<5>(
v, t);
1276 return detail::InsertLane<6>(
v, t);
1278 return detail::InsertLane<7>(
v, t);
1283 alignas(16) T lanes[8];
1286 return Load(
d, lanes);
1289template <
typename T>
1291#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1292 if (__builtin_constant_p(i)) {
1295 return detail::InsertLane<0>(
v, t);
1297 return detail::InsertLane<1>(
v, t);
1299 return detail::InsertLane<2>(
v, t);
1301 return detail::InsertLane<3>(
v, t);
1303 return detail::InsertLane<4>(
v, t);
1305 return detail::InsertLane<5>(
v, t);
1307 return detail::InsertLane<6>(
v, t);
1309 return detail::InsertLane<7>(
v, t);
1311 return detail::InsertLane<8>(
v, t);
1313 return detail::InsertLane<9>(
v, t);
1315 return detail::InsertLane<10>(
v, t);
1317 return detail::InsertLane<11>(
v, t);
1319 return detail::InsertLane<12>(
v, t);
1321 return detail::InsertLane<13>(
v, t);
1323 return detail::InsertLane<14>(
v, t);
1325 return detail::InsertLane<15>(
v, t);
1330 alignas(16) T lanes[16];
1333 return Load(
d, lanes);
// Temporarily replaces HWY_NEON_DEF_FUNCTION: the current definition is saved
// with push_macro and restored by the pop_macro at the end of this block.
1409#pragma push_macro("HWY_NEON_DEF_FUNCTION")
1410#undef HWY_NEON_DEF_FUNCTION
// Replacement generator: emits `template <int kBits>` functions taking a
// single vector `v`. The generated body returns `v` unchanged when kBits == 0;
// otherwise it calls prefix##infix##suffix with v.raw and HWY_MAX(1, kBits).
// The `args` parameter is accepted but not used in the visible lines.
// NOTE(review): HWY_MAX(1, kBits) presumably keeps the immediate operand >= 1
// even in the branch that is not taken when kBits == 0 - confirm. The closing
// line of the macro and its uses are not visible in this excerpt.
1411#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
1412 template <int kBits> \
1413 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
1414 return kBits == 0 ? v \
1415 : Vec128<type##_t, size>(HWY_NEON_EVAL( \
1416 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
1424#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
1428template <
int kBits,
size_t N>
1430 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1431 if (kBits == 0)
return v;
1435template <
int kBits,
size_t N>
1437 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1438 if (kBits == 0)
return v;
1451template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1461template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1471template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1490template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1500template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1510template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1532template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1544template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1556template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1578template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1588template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1598template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1615template <
typename T,
size_t N>
1617 return v << Set(Simd<T, N, 0>(),
static_cast<T
>(bits));
1619template <
typename T,
size_t N>
1636template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1641template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1657template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1662template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1671 int32x4_t rlo = vmull_s16(vget_low_s16(a.
raw), vget_low_s16(b.
raw));
1673 int32x4_t rhi = vmull_high_s16(a.
raw, b.
raw);
1675 int32x4_t rhi = vmull_s16(vget_high_s16(a.
raw), vget_high_s16(b.
raw));
1678 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1682 uint32x4_t rlo = vmull_u16(vget_low_u16(a.
raw), vget_low_u16(b.
raw));
1684 uint32x4_t rhi = vmull_high_u16(a.
raw, b.
raw);
1686 uint32x4_t rhi = vmull_u16(vget_high_u16(a.
raw), vget_high_u16(b.
raw));
1689 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1692template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1695 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.
raw, b.
raw));
1698template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1701 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.
raw, b.
raw));
1708template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1761template <
size_t N, HWY_IF_LE64(
float, N)>
1770#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1771template <
size_t N, HWY_IF_LE64(
float, N)>
1773 const Vec128<float, N> x,
1774 const Vec128<float, N> add) {
1775 return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1777HWY_API Vec128<float>
MulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1778 const Vec128<float> add) {
1779 return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1787 return mul * x + add;
1792HWY_API Vec64<double>
MulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1793 const Vec64<double> add) {
1794 return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
1796HWY_API Vec128<double>
MulAdd(
const Vec128<double> mul,
const Vec128<double> x,
1797 const Vec128<double> add) {
1798 return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1803#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1804template <
size_t N, HWY_IF_LE64(
float, N)>
1806 const Vec128<float, N> x,
1807 const Vec128<float, N> add) {
1808 return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1810HWY_API Vec128<float>
NegMulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1811 const Vec128<float> add) {
1812 return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1820 return add - mul * x;
1825HWY_API Vec64<double>
NegMulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1826 const Vec64<double> add) {
1827 return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
1830 const Vec128<double> x,
1831 const Vec128<double> add) {
1832 return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1854HWY_API Vec128<double, N>
MulSub(
const Vec128<double, N> mul,
1855 const Vec128<double, N> x,
1856 const Vec128<double, N> sub) {
1861 const Vec128<double, N> x,
1862 const Vec128<double, N> sub) {
1905 const auto root =
v * recip;
1915template <
typename T>
1921template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1925 using V8 =
decltype(
Zero(d8));
1948template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
1951 return detail::reversed_andnot(mask, not_mask);
1955template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1957 const Vec128<T, N> mask) {
1958 const DFromV<
decltype(mask)>
d;
1960 VFromD<
decltype(du)> ret =
1961 detail::reversed_andnot(
BitCast(du, mask),
BitCast(du, not_mask));
1991template <
typename T,
size_t N>
1993 return Or(o1,
Or(o2, o3));
1998template <
typename T,
size_t N>
2000 return Or(o,
And(a1, a2));
2005template <
typename T,
size_t N>
2013template <
typename T,
size_t N>
2018template <
typename T,
size_t N>
2023template <
typename T,
size_t N>
2030#ifdef HWY_NATIVE_POPCNT
2031#undef HWY_NATIVE_POPCNT
2033#define HWY_NATIVE_POPCNT
2038template <
typename T>
2043template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2051template <
typename T>
2054 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2057template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2061 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2065template <
typename T>
2068 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2069 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2071template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2075 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2079template <
typename T>
2082 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2083 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2085template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2089 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2090 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2095template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
2119template <
size_t N, HWY_IF_LE64(
int8_t, N)>
2123template <
size_t N, HWY_IF_LE64(
int16_t, N)>
2127template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2131template <
size_t N, HWY_IF_LE64(
float, N)>
2137HWY_API Vec128<double>
Abs(
const Vec128<double>
v) {
2138 return Vec128<double>(vabsq_f64(
v.raw));
2142 return Vec64<double>(vabs_f64(
v.raw));
2148template <
typename T,
size_t N>
2151 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2156template <
typename T,
size_t N>
2159 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2165template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
2175template <
typename T,
size_t N>
2181template <
typename T,
size_t N>
2188template <
typename TFrom,
typename TTo,
size_t N>
2190 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_IF: no template header; parameters are a Mask128 `mask` and two
// Vec128 values `yes` and `no`, all with lane type type##_t and `size` lanes;
// their `.raw` members are forwarded in that order; the result is a Vec128.
2196#define HWY_NEON_BUILD_TPL_HWY_IF
2197#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2198#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2199 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2200 const Vec128<type##_t, size> no
2201#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
// Remove the pieces again so they do not leak past their point of use.
2205#undef HWY_NEON_BUILD_TPL_HWY_IF
2206#undef HWY_NEON_BUILD_RET_HWY_IF
2207#undef HWY_NEON_BUILD_PARAM_HWY_IF
2208#undef HWY_NEON_BUILD_ARG_HWY_IF
2211template <
typename T,
size_t N>
2218template <
typename T,
size_t N>
2224template <
typename T,
size_t N>
2227 static_assert(IsSigned<T>(),
"Only works for signed/float");
2235template <
typename T,
size_t N>
2238 return Max(zero,
v);
2243template <
typename T,
size_t N>
2248template <
typename T,
size_t N>
2254template <
typename T,
size_t N>
2260template <
typename T,
size_t N>
2266template <
typename T,
size_t N>
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_COMPARE: no template header; two const vector parameters `a` and
// `b` whose `.raw` members are forwarded; the result is a Mask128 (not a
// Vec128) with the same lane type and lane count.
2298#define HWY_NEON_BUILD_TPL_HWY_COMPARE
2299#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
2300#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2301 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
2302#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
// Remove the pieces again so they do not leak past their point of use.
2326#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2327#undef HWY_NEON_BUILD_RET_HWY_COMPARE
2328#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2329#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
2337 const Vec128<int64_t, N> b) {
2338 const Simd<int32_t, N * 2, 0> d32;
2339 const Simd<int64_t, N, 0> d64;
2347 const Vec128<uint64_t, N> b) {
2348 const Simd<uint32_t, N * 2, 0> d32;
2349 const Simd<uint64_t, N, 0> d64;
2356 const Vec128<int64_t> b) {
2357 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2361 const Vec64<int64_t> b) {
2362 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2368 const Vec128<uint64_t, N> b) {
2369 const DFromV<
decltype(a)> du;
2371 const Vec128<uint64_t, N> msb =
AndNot(a, b) |
AndNot(a ^ b, a - b);
// Temporarily replaces HWY_NEON_DEF_FUNCTION: the current definition is saved
// with push_macro and restored by the pop_macro at the end of this block.
2380#pragma push_macro("HWY_NEON_DEF_FUNCTION")
2381#undef HWY_NEON_DEF_FUNCTION
// Replacement generator: emits a function `name` taking two vectors `a` and
// `b` by value and returning a Mask128 computed as Not(a == b). It calls no
// intrinsic directly; prefix, infix, suffix and args are accepted but unused
// in the visible lines.
// NOTE(review): the closing line of the macro and its uses are not visible in
// this excerpt.
2385#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2386 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2387 Vec128<type##_t, size> b) { \
2388 return Not(a == b); \
2393#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2397template <
typename T,
size_t N>
2401template <
typename T,
size_t N>
2408template <
typename T,
size_t N>
// Build pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_TESTBIT: no template header; two by-value vector parameters `v`
// and `bit` whose `.raw` members are forwarded; the result is a Mask128 with
// the same lane type and lane count.
2416#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
2417#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
2418#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2419 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
2420#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2432 return (
v & bit) == bit;
2437 return (
v & bit) == bit;
2441#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2442#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2443#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2444#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2471 const
Vec128<uint64_t,
N> b) {
2475 const DFromV<
decltype(a)> du;
2509 const
Vec128<uint64_t,
N> b) {
2513 const DFromV<
decltype(a)> du;
2583 return Vec128<double>(vld1q_f64(unaligned));
2628 return Vec64<double>(vld1_f64(p));
2647template <
typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2651 CopyBytes<4>(p, &buf);
2668template <
typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2672 CopyBytes<2>(p, &buf);
2693 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2700 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2705template <
typename T,
size_t N>
2710template <
typename T,
size_t N>
2717template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2727 vst1q_u8(unaligned,
v.raw);
2731 vst1q_u16(unaligned,
v.raw);
2735 vst1q_u32(unaligned,
v.raw);
2739 vst1q_u64(unaligned,
v.raw);
2743 vst1q_s8(unaligned,
v.raw);
2747 vst1q_s16(unaligned,
v.raw);
2751 vst1q_s32(unaligned,
v.raw);
2755 vst1q_s64(unaligned,
v.raw);
2759 vst1q_f32(unaligned,
v.raw);
2764 vst1q_f64(unaligned,
v.raw);
2817 vst1_lane_u32(p,
v.raw, 0);
2821 vst1_lane_s32(p,
v.raw, 0);
2825 vst1_lane_f32(p,
v.raw, 0);
2828template <
typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
2832 CopyBytes<4>(&buf, p);
2839 vst1_lane_u16(p,
v.raw, 0);
2843 vst1_lane_s16(p,
v.raw, 0);
2846template <
typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
2850 CopyBytes<2>(&buf, p);
2857 vst1_lane_u8(p,
v.raw, 0);
2861 vst1_lane_s8(p,
v.raw, 0);
2869 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2876 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2881template <
typename T,
size_t N>
2886template <
typename T,
size_t N>
2891 const auto blended =
2900template <
typename T,
size_t N>
2917 uint16x8_t a = vmovl_u8(
v.raw);
2932 uint16x8_t a = vmovl_u8(
v.raw);
2940template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
2945template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
2948 uint16x8_t a = vmovl_u8(
v.raw);
2956template <
size_t N, HWY_IF_LE64(u
int64_t, N)>
2961template <
size_t N, HWY_IF_LE64(
int16_t, N)>
2966template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2969 uint16x8_t a = vmovl_u8(
v.raw);
2970 uint32x4_t b = vmovl_u16(vget_low_u16(a));
2973template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2976 uint32x4_t a = vmovl_u16(
v.raw);
2987 int16x8_t a = vmovl_s8(
v.raw);
3008 int16x8_t a = vmovl_s8(
v.raw);
3009 int32x4_t b = vmovl_s16(vget_low_s16(a));
3026 const Vec128<float16_t, 4>
v) {
3027 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(
v.raw));
3028 return Vec128<float>(f32);
3032 const Vec128<float16_t, N>
v) {
3033 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(
v.raw));
3034 return Vec128<float, N>(vget_low_f32(f32));
3046 const auto sign = ShiftRight<15>(bits16);
3047 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3048 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3049 const auto subnormal =
3051 Set(df32, 1.0f / 16384 / 1024));
3053 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3054 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3055 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3056 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3057 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3065 const Vec64<float>
v) {
3066 return Vec128<double>(vcvt_f64_f32(
v.raw));
3070 const Vec32<float>
v) {
3071 return Vec64<double>(vget_low_f64(vcvt_f64_f32(
v.raw)));
3075 const Vec64<int32_t>
v) {
3076 const int64x2_t i64 = vmovl_s32(
v.raw);
3077 return Vec128<double>(vcvtq_f64_s64(i64));
3081 const Vec32<int32_t>
v) {
3082 const int64x1_t i64 = vget_low_s64(vmovl_s32(
v.raw));
3083 return Vec64<double>(vcvt_f64_s64(i64));
3101 const uint16x4_t a = vqmovun_s32(
v.raw);
3110 const int16x4_t a = vqmovn_s32(
v.raw);
3119template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3124template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3129template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3132 const uint16x4_t a = vqmovun_s32(vcombine_s32(
v.raw,
v.raw));
3135template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3140template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3143 const int16x4_t a = vqmovn_s32(vcombine_s32(
v.raw,
v.raw));
3146template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3155 const Vec128<float>
v) {
3156 return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(
v.raw))};
3160 const Vec128<float, N>
v) {
3161 const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(
v.raw,
v.raw));
3162 return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3171 const Rebind<uint32_t,
decltype(du16)> du;
3173 const auto bits32 =
BitCast(du,
v);
3174 const auto sign = ShiftRight<31>(bits32);
3175 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3176 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3178 const auto k15 =
Set(di, 15);
3179 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3180 const auto is_tiny = exp <
Set(di, -24);
3182 const auto is_subnormal = exp <
Set(di, -14);
3183 const auto biased_exp16 =
3185 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3186 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3187 (mantissa32 >> (
Set(du, 13) + sub_exp));
3189 ShiftRight<13>(mantissa32));
3191 const auto sign16 = ShiftLeft<15>(sign);
3192 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3202 const Rebind<int32_t,
decltype(dbf16)> di32;
3203 const Rebind<uint32_t,
decltype(dbf16)> du32;
3204 const Rebind<uint16_t,
decltype(dbf16)> du16;
3205 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
3212 return Vec64<float>(vcvt_f32_f64(
v.raw));
3215 return Vec32<float>(vcvt_f32_f64(vcombine_f64(
v.raw,
v.raw)));
3219 const Vec128<double>
v) {
3220 const int64x2_t i64 = vcvtq_s64_f64(
v.raw);
3221 return Vec64<int32_t>(vqmovn_s64(i64));
3224 const Vec64<double>
v) {
3225 const int64x1_t i64 = vcvt_s64_f64(
v.raw);
3227 const int64x2_t i64x2 = vcombine_s64(i64, i64);
3228 return Vec32<int32_t>(vqmovn_s64(i64x2));
3235 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3238template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3241 const uint8x8_t w = vuzp1_u8(org_v, org_v);
3256 uint16x8_t c = vcombine_u16(a.
raw, b.
raw);
3265 int16x8_t c = vcombine_s16(a.
raw, b.
raw);
3277template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3288template <
size_t N, HWY_IF_LE64(
float, N)>
3297 const Vec128<int64_t>
v) {
3298 return Vec128<double>(vcvtq_f64_s64(
v.raw));
3301 const Vec64<int64_t>
v) {
3302 return Vec64<double>(vcvt_f64_s64(
v.raw));
3307 const Vec128<double>
v) {
3308 return Vec128<int64_t>(vcvtq_s64_f64(
v.raw));
3311 const Vec64<double>
v) {
3312 return Vec64<int64_t>(vcvt_s64_f64(
v.raw));
3357 const auto int_f =
ConvertTo(df, integer);
3372 const auto added = large +
v;
3373 const auto rounded = added - large;
3385 const auto int_f =
ConvertTo(df, integer);
3399 const auto int_f =
ConvertTo(df, integer);
3414 return Vec128<int32_t>(vcvtnq_s32_f32(
v.raw));
3416template <
size_t N, HWY_IF_LE64(
float, N)>
3418 return Vec128<int32_t, N>(vcvtn_s32_f32(
v.raw));
3432template <
typename T,
size_t N>
3437template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3447template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3456 const VFromD<
decltype(di)> exp =
3466template <
typename T,
size_t N, HWY_IF_LE64(u
int8_t, N)>
3500 return Vec64<double>(vget_low_f64(
v.raw));
3504template <
typename T,
size_t N>
3513template <
int kBytes,
typename T,
class V128 = Vec128<T>>
3515 static_assert(0 < kBytes && kBytes < 16,
"kBytes must be in [1, 15]");
3517 uint8x16_t v8 = vextq_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3522template <
int kBytes,
typename T>
3524 static_assert(0 < kBytes && kBytes < 8,
"kBytes must be in [1, 7]");
3526 uint8x8_t v8 = vext_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3538template <
int kBytes>
3548 template <
class T,
size_t N, HWY_IF_LE64(T, N)>
3552 const auto zero64 =
Zero(d64);
3553 const decltype(zero64) v64(
v.raw);
3555 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3560 template <
class T,
size_t N>
3567 template <
class T,
size_t N>
3573template <
int kBytes>
3575 template <
class T,
size_t N>
3579 if (
N *
sizeof(T) < 8) {
3580 constexpr size_t kReg =
N *
sizeof(T) == 16 ? 16 : 8;
3581 const Simd<T, kReg /
sizeof(T), 0> dreg;
3585 return CombineShiftRightBytes<kBytes>(
d,
Zero(
d),
v);
3590 template <
class T,
size_t N>
3597 template <
class T,
size_t N>
3605template <
int kBytes,
typename T,
size_t N>
3611template <
int kBytes,
typename T,
size_t N>
3616template <
int kLanes,
typename T,
size_t N>
3622template <
int kLanes,
typename T,
size_t N>
3628template <
int kBytes,
typename T,
size_t N>
3634template <
int kLanes,
typename T,
size_t N>
3641template <
int kBytes,
typename T,
size_t N, HWY_IF_LE32(T, N)>
3644 constexpr size_t kSize =
N *
sizeof(T);
3645 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
3649 using V64 =
VFromD<
decltype(d_full8)>;
3650 const V64 hi64(
BitCast(d8, hi).raw);
3698 const Vec128<double>
v) {
3699 return Vec64<double>(vget_high_f64(
v.raw));
3704template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3711 return Vec128<T, (
N + 1) / 2>(upper.raw);
3720 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3721 return Vec128<uint16_t>(vdupq_laneq_u16(
v.raw, kLane));
3723template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3725 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3726 return Vec128<uint16_t, N>(vdup_lane_u16(
v.raw, kLane));
3730 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3731 return Vec128<uint32_t>(vdupq_laneq_u32(
v.raw, kLane));
3733template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3735 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3736 return Vec128<uint32_t, N>(vdup_lane_u32(
v.raw, kLane));
3740 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3741 return Vec128<uint64_t>(vdupq_laneq_u64(
v.raw, kLane));
3748 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3749 return Vec128<int16_t>(vdupq_laneq_s16(
v.raw, kLane));
3751template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3753 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3754 return Vec128<int16_t, N>(vdup_lane_s16(
v.raw, kLane));
3758 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3759 return Vec128<int32_t>(vdupq_laneq_s32(
v.raw, kLane));
3761template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3763 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3764 return Vec128<int32_t, N>(vdup_lane_s32(
v.raw, kLane));
3768 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3769 return Vec128<int64_t>(vdupq_laneq_s64(
v.raw, kLane));
3776 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3777 return Vec128<float>(vdupq_laneq_f32(
v.raw, kLane));
3779template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3781 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3782 return Vec128<float, N>(vdup_lane_f32(
v.raw, kLane));
3786 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3787 return Vec128<double>(vdupq_laneq_f64(
v.raw, kLane));
3791 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3801 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3804template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3806 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3811 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3814template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3816 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3821 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3829 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3832template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3834 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3839 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3842template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3844 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3849 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3857 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3860template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3862 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3870 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3875 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3882template <
typename T,
size_t N>
3887template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3889 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3890#if HWY_IS_DEBUG_BUILD
3891 const Rebind<TI,
decltype(
d)> di;
3897 using V8 =
VFromD<
decltype(d8)>;
3901 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
3902 if (
sizeof(T) == 4) {
3903 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
3904 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3905 const V8 lane_indices =
3907 const V8 byte_indices =
3909 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3910 0, 1, 2, 3, 0, 1, 2, 3};
3911 const V8 sum =
Add(byte_indices,
Load(d8, kByteOffsets));
3914 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
3915 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3916 const V8 lane_indices =
3918 const V8 byte_indices =
3920 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
3921 0, 1, 2, 3, 4, 5, 6, 7};
3922 const V8 sum =
Add(byte_indices,
Load(d8, kByteOffsets));
3927template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3929 const Rebind<TI,
decltype(
d)> di;
3933template <
typename T,
size_t N>
3944template <
typename T>
3950template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3955template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3961template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3967template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3975template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
3980template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3986template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
3991template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3997template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4004template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4009template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4015template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4020template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4027template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4032template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4045template <
typename T>
4049template <
typename T>
4055template <
typename T>
4061template <
typename T>
4067template <
typename T>
4083 const Vec128<uint64_t> b) {
4084 return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4087 const Vec128<int64_t> b) {
4088 return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4091 const Vec128<double> b) {
4092 return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4111template <
size_t N, HWY_IF_LE64(
float, N)>
4118template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4124template <
typename T,
size_t N,
class V = Vec128<T, N>>
4139 const Vec128<uint64_t> b) {
4140 return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4143 return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4146 return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4170template <
typename T,
size_t N, HWY_IF_GE64(T, N),
class V = Vec128<T, N>>
4176template <
typename T,
size_t N, HWY_IF_LE32(T, N),
class V = Vec128<T, N>>
4178 const Half<
decltype(
d)> d2;
4186template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4190template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4195template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4263HWY_API Vec128<double>
Combine(Full128<double> , Vec64<double> hi,
4265 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4270template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4283template <
typename T,
size_t N>
4291template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4306#define HWY_NEON_BUILD_TPL_HWY_TRN
4307#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
4310#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4311 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
4312#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4334template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4342 using VU =
VFromD<
decltype(du)>;
4344 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4352template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4361template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4369 using VU =
VFromD<
decltype(du)>;
4371 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4379template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4386template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4389 constexpr size_t kSize =
N *
sizeof(T);
4391 const Full64<uint8_t> d8x8;
4392 const Full64<T> d64;
4393 using V8x8 =
VFromD<
decltype(d8x8)>;
4394 const V8x8 hi8x8(
BitCast(d8, hi).raw);
4399 return Vec128<T, N>(
BitCast(d64, r).raw);
4405template <
typename T,
size_t N>
4420template <typename T,
size_t N,
4428template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4431 const Twice<
decltype(
d)> d2;
4442template <
typename T>
4451template <
typename T,
size_t N,
4459template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4462 const Twice<
decltype(
d)> d2;
4473template <
typename T>
4481template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4484 return detail::InterleaveEven(
v,
v);
4486 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[0]);
4490template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4497template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4500 return detail::InterleaveOdd(
v,
v);
4502 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[1]);
4506template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4513template <
typename T,
size_t N>
4517 alignas(16)
constexpr uint8_t kBytes[16] = {
4518 ((0 /
sizeof(T)) & 1) ? 0 : 0xFF, ((1 /
sizeof(T)) & 1) ? 0 : 0xFF,
4519 ((2 /
sizeof(T)) & 1) ? 0 : 0xFF, ((3 /
sizeof(T)) & 1) ? 0 : 0xFF,
4520 ((4 /
sizeof(T)) & 1) ? 0 : 0xFF, ((5 /
sizeof(T)) & 1) ? 0 : 0xFF,
4521 ((6 /
sizeof(T)) & 1) ? 0 : 0xFF, ((7 /
sizeof(T)) & 1) ? 0 : 0xFF,
4522 ((8 /
sizeof(T)) & 1) ? 0 : 0xFF, ((9 /
sizeof(T)) & 1) ? 0 : 0xFF,
4523 ((10 /
sizeof(T)) & 1) ? 0 : 0xFF, ((11 /
sizeof(T)) & 1) ? 0 : 0xFF,
4524 ((12 /
sizeof(T)) & 1) ? 0 : 0xFF, ((13 /
sizeof(T)) & 1) ? 0 : 0xFF,
4525 ((14 /
sizeof(T)) & 1) ? 0 : 0xFF, ((15 /
sizeof(T)) & 1) ? 0 : 0xFF,
4532template <
typename T,
size_t N>
4539template <
typename T,
size_t N>
4547template <
typename T>
4558 const Repartition<uint32_t,
decltype(dbf16)> du32;
4565#if defined(__ARM_FEATURE_AES)
4568#ifdef HWY_NATIVE_AES
4569#undef HWY_NATIVE_AES
4571#define HWY_NATIVE_AES
4575 Vec128<uint8_t> round_key) {
4580 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4585 Vec128<uint8_t> round_key) {
4586 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4590 return Vec128<uint64_t>((uint64x2_t)vmull_p64(
GetLane(a),
GetLane(b)));
4594 return Vec128<uint64_t>(
4595 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4605 const Rebind<uint16_t,
decltype(df32)> du16;
4619 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4626 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4635 return Vec128<int64_t, (
N + 1) / 2>(
4636 vget_low_s64(vmull_s32(a_packed, b_packed)));
4644 return Vec128<uint64_t, (
N + 1) / 2>(
4645 vget_low_u64(vmull_u32(a_packed, b_packed)));
4650 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 0), vgetq_lane_u64(b.
raw, 0), &hi);
4656 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 1), vgetq_lane_u64(b.
raw, 1), &hi);
4663template <
typename T,
typename TI>
4672 uint8x16_t table0 =
BitCast(d8, bytes).raw;
4674 table.val[0] = vget_low_u8(table0);
4675 table.val[1] = vget_high_u8(table0);
4676 uint8x16_t idx =
BitCast(d8, from).raw;
4677 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4678 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4684template <
typename T,
typename TI,
size_t NI, HWY_IF_LE64(TI, NI)>
4689 const auto idx_full =
Combine(d_full, from64, from64);
4695template <
typename T,
size_t N,
typename TI, HWY_IF_LE64(T, N)>
4703template <
typename T,
size_t N,
typename TI,
size_t NI,
HWY_IF_LE64(T,
N),
4709 const Repartition<uint8_t,
decltype(d_idx)> d_idx8;
4712 const auto from8 =
BitCast(d_idx8, from);
4713 const VFromD<
decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4718template <
class V,
class VI>
4725template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
4729 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
4731 alignas(16) T lanes[
N];
4734 alignas(16) Offset offset_lanes[
N];
4735 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
4737 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
4738 for (
size_t i = 0; i <
N; ++i) {
4739 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4743template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
4746 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
4748 alignas(16) T lanes[
N];
4751 alignas(16) Index index_lanes[
N];
4752 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
4754 for (
size_t i = 0; i <
N; ++i) {
4755 base[index_lanes[i]] = lanes[i];
4761template <
typename T,
size_t N,
typename Offset>
4765 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
4767 alignas(16) Offset offset_lanes[
N];
4768 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
4770 alignas(16) T lanes[
N];
4771 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
4772 for (
size_t i = 0; i <
N; ++i) {
4773 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
4775 return Load(
d, lanes);
4778template <
typename T,
size_t N,
typename Index>
4782 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
4784 alignas(16) Index index_lanes[
N];
4785 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
4787 alignas(16) T lanes[
N];
4788 for (
size_t i = 0; i <
N; ++i) {
4789 lanes[i] = base[index_lanes[i]];
4791 return Load(
d, lanes);
4799template <
typename T>
4803template <
typename T>
4808template <
typename T>
4815template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4819template <
typename T>
4824template <
typename T>
4836 return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(
v.raw)));
4839 return Vec128<float>(vdupq_n_f32(vaddvq_f32(
v.raw)));
4842 return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(
v.raw)));
4845 return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(
v.raw)));
4848 return Vec128<double>(vdupq_n_f64(vaddvq_f64(
v.raw)));
4853 uint32x4x2_t v0 = vuzpq_u32(
v.raw,
v.raw);
4854 uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
4855 uint32x4x2_t v1 = vuzpq_u32(c0, c0);
4859 int32x4x2_t v0 = vuzpq_s32(
v.raw,
v.raw);
4860 int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
4861 int32x4x2_t v1 = vuzpq_s32(c0, c0);
4865 float32x4x2_t v0 = vuzpq_f32(
v.raw,
v.raw);
4866 float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
4867 float32x4x2_t v1 = vuzpq_f32(c0, c0);
4878template <
typename T>
4884 return Min(v20_31_20_31, v31_20_31_20);
4886template <
typename T>
4892 return Max(v20_31_20_31, v31_20_31_20);
4896template <
typename T>
4900 return Min(v10, v01);
4902template <
typename T>
4906 return Max(v10, v01);
4910template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4914 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4919template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
4923 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
4931template <
typename T,
size_t N>
4935template <
typename T,
size_t N>
4939template <
typename T,
size_t N>
4951template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4956template <
typename T>
4961template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
4966 const auto vmask_bits =
Set64(du, mask_bits);
4969 alignas(16)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4970 1, 1, 1, 1, 1, 1, 1, 1};
4973 alignas(16)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4974 1, 2, 4, 8, 16, 32, 64, 128};
4978template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4981 alignas(16)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4982 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
4986template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4989 alignas(16)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4990 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
4994template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4997 alignas(16)
constexpr uint64_t kBit[8] = {1, 2};
5004template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
5007 uint64_t mask_bits = 0;
5018template <
typename T>
5026template <
typename T>
5029 const Twice<
decltype(
d)> d2;
5035template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5040 constexpr size_t kBytes =
sizeof(T) *
N;
5041 return nib & ((1ull << (kBytes * 4)) - 1);
5044template <
typename T>
5047 alignas(16)
constexpr uint8_t kSliceLanes[16] = {
5048 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5056 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.
raw, values.
raw));
5057 const uint8x8_t x4 = vpadd_u8(x2, x2);
5058 const uint8x8_t x8 = vpadd_u8(x4, x4);
5059 return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5062 const uint16x8_t x2 = vpaddlq_u8(values.
raw);
5063 const uint32x4_t x4 = vpaddlq_u16(x2);
5064 const uint64x2_t x8 = vpaddlq_u32(x4);
5065 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5069template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5074 alignas(8)
constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5075 0x10, 0x20, 0x40, 0x80};
5082 return vaddv_u8(values.
raw);
5084 const uint16x4_t x2 = vpaddl_u8(values.
raw);
5085 const uint32x2_t x4 = vpaddl_u16(x2);
5086 const uint64x1_t x8 = vpaddl_u32(x4);
5087 return vget_lane_u64(x8, 0);
5091template <
typename T>
5094 alignas(16)
constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5095 0x10, 0x20, 0x40, 0x80};
5101 return vaddvq_u16(values.
raw);
5103 const uint32x4_t x2 = vpaddlq_u16(values.
raw);
5104 const uint64x2_t x4 = vpaddlq_u32(x2);
5105 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5109template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5114 alignas(8)
constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5120 return vaddv_u16(values.
raw);
5122 const uint32x2_t x2 = vpaddl_u16(values.
raw);
5123 const uint64x1_t x4 = vpaddl_u32(x2);
5124 return vget_lane_u64(x4, 0);
5128template <
typename T>
5131 alignas(16)
constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5137 return vaddvq_u32(values.
raw);
5139 const uint64x2_t x2 = vpaddlq_u32(values.
raw);
5140 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5144template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5149 alignas(8)
constexpr uint32_t kSliceLanes[2] = {1, 2};
5155 return vaddv_u32(values.
raw);
5157 const uint64x1_t x2 = vpaddl_u32(values.
raw);
5158 return vget_lane_u64(x2, 0);
5162template <
typename T>
5164 alignas(16)
constexpr uint64_t kSliceLanes[2] = {1, 2};
5170 return vaddvq_u64(values.
raw);
5172 return vgetq_lane_u64(values.
raw, 0) + vgetq_lane_u64(values.
raw, 1);
5176template <
typename T>
5182 return vget_lane_u64(values.
raw, 0);
5186template <
typename T,
size_t N>
5188 return ((
N *
sizeof(T)) >= 8) ? bits : (bits & ((1ull <<
N) - 1));
5191template <
typename T,
size_t N>
5206template <
typename T>
5209 const int8x16_t ones =
5213 return static_cast<size_t>(vaddvq_s8(ones));
5215 const int16x8_t x2 = vpaddlq_s8(ones);
5216 const int32x4_t x4 = vpaddlq_s16(x2);
5217 const int64x2_t x8 = vpaddlq_s32(x4);
5218 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5221template <
typename T>
5224 const int16x8_t ones =
5228 return static_cast<size_t>(vaddvq_s16(ones));
5230 const int32x4_t x2 = vpaddlq_s16(ones);
5231 const int64x2_t x4 = vpaddlq_s32(x2);
5232 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5236template <
typename T>
5239 const int32x4_t ones =
5243 return static_cast<size_t>(vaddvq_s32(ones));
5245 const int64x2_t x2 = vpaddlq_s32(ones);
5246 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5250template <
typename T>
5254 const int64x2_t ones =
5256 return static_cast<size_t>(vaddvq_s64(ones));
5260 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5261 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5268template <
typename T>
5274template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5276 constexpr int kDiv = 4 *
sizeof(T);
5279template <
typename T,
size_t N>
5283 if (nib == 0)
return -1;
5284 constexpr int kDiv = 4 *
sizeof(T);
5289template <
typename T,
size_t N>
5293 const size_t kNumBytes = (
N + 7) / 8;
5294 CopyBytes<kNumBytes>(&mask_bits, bits);
5298template <
typename T,
size_t N>
5304template <
typename T>
5309template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5311 constexpr size_t kBytes =
sizeof(T) *
N;
5317template <
typename T>
5326 const uint8_t* bytes) {
5328 vld1q_dup_u64(
reinterpret_cast<const uint64_t*
>(bytes))));
5332template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
5334 const uint8_t* bytes) {
5335 return Load(
d, bytes);
5338template <
typename T,
size_t N>
5340 const uint64_t mask_bits) {
5354 alignas(16)
constexpr uint8_t table[256 * 8] = {
5356 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5357 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5358 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5359 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5360 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5361 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5362 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5363 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5364 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5365 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5366 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5367 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5368 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5369 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5370 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5371 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5372 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5373 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5374 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5375 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5376 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5377 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5378 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5379 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5380 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5381 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5382 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5383 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5384 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5385 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5386 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5387 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5388 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5389 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5390 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5391 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5392 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5393 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5394 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5395 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5396 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5397 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5398 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5399 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5400 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5401 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5402 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5403 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5404 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5405 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5406 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5407 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5408 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5409 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5410 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5411 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5412 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5413 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5414 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5415 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5416 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5417 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5418 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5419 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5420 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5421 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5422 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5423 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5424 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5425 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5426 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5427 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5428 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5429 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5430 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5431 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5432 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5433 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5434 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5435 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5436 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5437 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5438 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5439 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5440 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5441 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5442 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5443 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5444 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5445 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5446 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5447 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5448 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5449 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5450 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5451 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5452 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5453 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5454 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5455 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5456 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5457 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5458 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5459 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5460 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5461 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5462 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5463 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5464 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5465 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5466 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5467 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5468 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5469 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5470 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5471 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5472 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5473 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5474 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5475 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5476 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5477 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5478 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5479 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5480 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5481 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5482 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5483 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5490template <
typename T,
size_t N>
5492 const uint64_t mask_bits) {
5506 alignas(16)
constexpr uint8_t table[256 * 8] = {
5508 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
5509 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
5510 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
5511 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5512 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
5513 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
5514 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
5515 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
5516 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
5517 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
5518 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
5519 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
5520 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
5521 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
5522 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
5523 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
5524 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
5525 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
5526 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
5527 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
5528 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
5529 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
5530 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
5531 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
5532 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
5533 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
5534 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
5535 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
5536 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
5537 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
5538 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
5539 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
5540 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
5541 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
5542 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
5543 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
5544 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
5545 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
5546 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
5547 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
5548 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
5549 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
5550 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
5551 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
5552 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
5553 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
5554 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
5555 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
5556 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
5557 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
5558 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
5559 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
5560 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
5561 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
5562 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
5563 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
5564 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
5565 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
5566 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
5567 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
5568 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
5569 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
5570 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
5571 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
5572 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
5573 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
5574 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
5575 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
5576 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
5577 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
5578 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
5579 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
5580 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
5581 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
5582 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
5583 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
5584 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
5585 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
5586 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
5587 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
5588 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
5589 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
5590 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
5591 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
5592 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
5593 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
5594 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
5595 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
5596 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
5597 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
5598 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
5599 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
5600 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
5601 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
5602 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
5603 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
5604 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
5605 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
5606 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
5607 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
5608 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
5609 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
5610 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
5611 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
5612 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
5613 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
5614 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
5615 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
5616 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
5617 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
5618 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
5619 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
5620 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
5621 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
5622 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
5623 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
5624 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
5625 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
5626 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
5627 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
5628 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
5629 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
5630 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
5631 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
5632 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
5633 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
5634 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
5635 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
5642template <
typename T,
size_t N>
5644 const uint64_t mask_bits) {
5648 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
5650 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5651 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5652 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
5653 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5654 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
5655 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
5656 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
5657 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5658 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5659 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
5660 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
5661 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5662 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5663 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
5664 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5665 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5668 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5671template <
typename T,
size_t N>
5673 const uint64_t mask_bits) {
5677 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
5679 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
5680 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
5681 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
5682 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
5683 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
5684 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
5685 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5686 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5687 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
5688 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5689 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
5690 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
5691 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
5692 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5696 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5699#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
5701template <
typename T,
size_t N>
5703 const uint64_t mask_bits) {
5707 alignas(16)
constexpr uint8_t u8_indices[64] = {
5709 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5710 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5711 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5712 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5714 const Simd<T, N, 0>
d;
5716 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5719template <
typename T,
size_t N>
5721 const uint64_t mask_bits) {
5725 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
5727 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5728 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5729 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5730 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5732 const Simd<T, N, 0>
d;
5734 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
5741template <
typename T,
size_t N>
5744 detail::IdxFromBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
5750template <
typename T,
size_t N>
5753 detail::IdxFromNotBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
5762template <
typename T>
5768template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
5780template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5786template <
typename T>
5792template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
5804template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
5808 if (
N < 16 /
sizeof(T)) {
5822template <
typename T,
size_t N>
5825 uint64_t mask_bits = 0;
5826 constexpr size_t kNumBytes = (
N + 7) / 8;
5827 CopyBytes<kNumBytes>(bits, &mask_bits);
5829 mask_bits &= (1ull <<
N) - 1;
5836template <
typename T,
size_t N>
5845template <
typename T,
size_t N>
5850 using TU =
TFromD<
decltype(du)>;
5852 const size_t count =
PopCount(mask_bits);
5861template <
typename T,
size_t N>
5865 uint64_t mask_bits = 0;
5866 constexpr size_t kNumBytes = (
N + 7) / 8;
5867 CopyBytes<kNumBytes>(bits, &mask_bits);
5869 mask_bits &= (1ull <<
N) - 1;
5879#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5880#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
5882#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
5886#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5887#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
5890#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
5891#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
5894#define HWY_IF_LOAD_INT(T, N) \
5895 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
5896#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
5897 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
5898 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
5899 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
5900 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
5901 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
5907#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5908 decltype(Tuple2<type##_t, size>().raw)
5910#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5911 const type##_t *from, Tuple2<type##_t, size>
5913#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5914#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5916#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5917 decltype(Tuple3<type##_t, size>().raw)
5918#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5919 const type##_t *from, Tuple3<type##_t, size>
5921#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5922#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5924#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
5925 decltype(Tuple4<type##_t, size>().raw)
5926#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
5927 const type##_t *from, Tuple4<type##_t, size>
5929#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
5930#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
5932#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
5933#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
5934#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
5937template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
5947template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5952 alignas(16) T buf[2 * 8 /
sizeof(T)] = {};
5953 CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
5955 v0 = Vec128<T, N>(raw.val[0]);
5956 v1 = Vec128<T, N>(raw.val[1]);
5961template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
5963 Vec128<T>& v0, Vec128<T>& v1) {
5964 const Half<
decltype(
d)> dh;
5965 VFromD<
decltype(dh)> v00, v10, v01, v11;
5975template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
5986template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5989 Vec128<T, N>& v1, Vec128<T, N>& v2) {
5991 alignas(16) T buf[3 * 8 /
sizeof(T)] = {};
5992 CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
5994 v0 = Vec128<T, N>(raw.val[0]);
5995 v1 = Vec128<T, N>(raw.val[1]);
5996 v2 = Vec128<T, N>(raw.val[2]);
6001template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6003 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6004 const Half<
decltype(
d)> dh;
6005 VFromD<
decltype(dh)> v00, v10, v20, v01, v11, v21;
6016template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6029template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6032 Vec128<T, N>& v1, Vec128<T, N>& v2,
6034 alignas(16) T buf[4 * 8 /
sizeof(T)] = {};
6035 CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6037 v0 = Vec128<T, N>(raw.val[0]);
6038 v1 = Vec128<T, N>(raw.val[1]);
6039 v2 = Vec128<T, N>(raw.val[2]);
6040 v3 = Vec128<T, N>(raw.val[3]);
6045template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6047 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6049 const Half<
decltype(
d)> dh;
6050 VFromD<
decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6060#undef HWY_IF_LOAD_INT
6065#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6066#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6067#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6070#define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6071#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6074#define HWY_IF_STORE_INT(T, N) \
6075 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6076#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6077 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6078 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6079 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6080 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6081 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6084#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6085 Tuple2<type##_t, size> tup, type##_t *to
6087#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6089#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6090 Tuple3<type##_t, size> tup, type##_t *to
6092#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6094#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6095 Tuple4<type##_t, size> tup, type##_t *to
6097#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6099#undef HWY_NEON_DEF_FUNCTION_STORE_INT
6100#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6101#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6102#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6105template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6114template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6118 alignas(16) T buf[2 * 8 /
sizeof(T)];
6119 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6121 CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6126template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6129 const Half<
decltype(
d)> dh;
6137template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6146template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6148 const Vec128<T, N> v2, Simd<T, N, 0> ,
6150 alignas(16) T buf[3 * 8 /
sizeof(T)];
6151 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6153 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6158template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6160 const Vec128<T> v2, Full128<T>
d,
6162 const Half<
decltype(
d)> dh;
6172template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6182template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6184 const Vec128<T, N> v2,
const Vec128<T, N> v3,
6187 alignas(16) T buf[4 * 8 /
sizeof(T)];
6188 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6190 CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6195template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6197 const Vec128<T> v2,
const Vec128<T> v3,
6199 const Half<
decltype(
d)> dh;
6207#undef HWY_IF_STORE_INT
6211template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6214 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"Use u64");
6239template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6387#undef HWY_NEON_BUILD_ARG_1
6388#undef HWY_NEON_BUILD_ARG_2
6389#undef HWY_NEON_BUILD_ARG_3
6390#undef HWY_NEON_BUILD_PARAM_1
6391#undef HWY_NEON_BUILD_PARAM_2
6392#undef HWY_NEON_BUILD_PARAM_3
6393#undef HWY_NEON_BUILD_RET_1
6394#undef HWY_NEON_BUILD_RET_2
6395#undef HWY_NEON_BUILD_RET_3
6396#undef HWY_NEON_BUILD_TPL_1
6397#undef HWY_NEON_BUILD_TPL_2
6398#undef HWY_NEON_BUILD_TPL_3
6399#undef HWY_NEON_DEF_FUNCTION
6400#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6401#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6402#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6403#undef HWY_NEON_DEF_FUNCTION_INTS
6404#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6405#undef HWY_NEON_DEF_FUNCTION_INT_16
6406#undef HWY_NEON_DEF_FUNCTION_INT_32
6407#undef HWY_NEON_DEF_FUNCTION_INT_8
6408#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6409#undef HWY_NEON_DEF_FUNCTION_TPL
6410#undef HWY_NEON_DEF_FUNCTION_UIF81632
6411#undef HWY_NEON_DEF_FUNCTION_UINTS
6412#undef HWY_NEON_DEF_FUNCTION_UINT_16
6413#undef HWY_NEON_DEF_FUNCTION_UINT_32
6414#undef HWY_NEON_DEF_FUNCTION_UINT_8
6415#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition: arm_neon-inl.h:159
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:182
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition: arm_neon-inl.h:192
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:138
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:133
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:6076
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:91
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:187
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:121
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:165
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition: arm_neon-inl.h:2385
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:107
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition: arm_neon-inl.h:196
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:177
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:99
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:127
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition: arm_neon-inl.h:5896
#define HWY_IF_FLOAT(T)
Definition: base.h:343
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:69
#define HWY_IF_LE64(T, N)
Definition: base.h:333
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:70
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_ASSERT(condition)
Definition: base.h:145
Definition: arm_neon-inl.h:804
HWY_INLINE Mask128()
Definition: arm_neon-inl.h:809
Mask128(const Mask128 &)=default
Mask128 & operator=(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition: arm_neon-inl.h:812
Raw raw
Definition: arm_neon-inl.h:814
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:806
Definition: arm_neon-inl.h:760
HWY_INLINE Vec128()
Definition: arm_neon-inl.h:764
HWY_INLINE Vec128(const Raw raw)
Definition: arm_neon-inl.h:767
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: arm_neon-inl.h:774
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:761
Raw raw
Definition: arm_neon-inl.h:793
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: arm_neon-inl.h:780
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: arm_neon-inl.h:789
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: arm_neon-inl.h:786
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: arm_neon-inl.h:771
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: arm_neon-inl.h:783
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: arm_neon-inl.h:777
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2425
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:721
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5045
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1520
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1733
HWY_INLINE bool AllTrue(hwy::SizeTag< 1 >, const Mask128< T > m)
Definition: wasm_128-inl.h:3578
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:818
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2510
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:5325
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3219
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition: arm_neon-inl.h:4952
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5491
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:3035
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5742
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition: wasm_128-inl.h:1856
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3208
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition: arm_neon-inl.h:5019
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
HWY_INLINE bool AllFalse(hwy::SizeTag< 1 >, const Mask256< T > mask)
Definition: x86_256-inl.h:4283
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5751
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5207
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3345
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:2039
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:5339
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:5187
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4150
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1574
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4962
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2502
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1356
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition: arm_neon-inl.h:1884
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1616
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2398
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1080
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition: arm_neon-inl.h:1388
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2014
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2019
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1104
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2024
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1746
HWY_API Vec128< T, 2 > ConcatEven(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4474
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1447
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1627
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2402
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1620
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1121
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, 2 > ConcatOdd(Simd< T, 2, 0 > d, Vec128< T, 2 > hi, Vec128< T, 2 > lo)
Definition: arm_neon-inl.h:4443
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:5318
Definition: arm_neon-inl.h:3883
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3884
Definition: ops/shared-inl.h:40
Definition: arm_neon-inl.h:823
Simd< T, N, 0 > operator()(Vec128< T, N >) const
Definition: arm_neon-inl.h:825
uint16x4_t type
Definition: arm_neon-inl.h:688
uint16x8_t type
Definition: arm_neon-inl.h:625
uint16x4_t type
Definition: arm_neon-inl.h:683
uint16x8_t type
Definition: arm_neon-inl.h:620
float32x2_t type
Definition: arm_neon-inl.h:693
float32x4_t type
Definition: arm_neon-inl.h:630
int16x4_t type
Definition: arm_neon-inl.h:668
int16x8_t type
Definition: arm_neon-inl.h:605
int32x2_t type
Definition: arm_neon-inl.h:673
int32x4_t type
Definition: arm_neon-inl.h:610
int64x1_t type
Definition: arm_neon-inl.h:678
int64x2_t type
Definition: arm_neon-inl.h:615
int8x16_t type
Definition: arm_neon-inl.h:600
int8x8_t type
Definition: arm_neon-inl.h:663
uint16x4_t type
Definition: arm_neon-inl.h:648
uint16x8_t type
Definition: arm_neon-inl.h:585
uint32x2_t type
Definition: arm_neon-inl.h:653
uint32x4_t type
Definition: arm_neon-inl.h:590
uint64x1_t type
Definition: arm_neon-inl.h:658
uint64x2_t type
Definition: arm_neon-inl.h:595
uint8x16_t type
Definition: arm_neon-inl.h:580
uint8x8_t type
Definition: arm_neon-inl.h:643
Definition: x86_128-inl.h:55
__v128_u type
Definition: wasm_128-inl.h:56
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3561
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3568
Definition: arm_neon-inl.h:3539
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3549
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition: arm_neon-inl.h:3542
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3591
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3598
Definition: arm_neon-inl.h:3574
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition: arm_neon-inl.h:3576
uint16x8x2_t raw
Definition: arm_neon-inl.h:346
uint16x4x2_t raw
Definition: arm_neon-inl.h:350
uint16x8x2_t raw
Definition: arm_neon-inl.h:338
uint16x4x2_t raw
Definition: arm_neon-inl.h:342
float32x4x2_t raw
Definition: arm_neon-inl.h:355
float32x2x2_t raw
Definition: arm_neon-inl.h:359
int16x8x2_t raw
Definition: arm_neon-inl.h:297
int16x4x2_t raw
Definition: arm_neon-inl.h:301
int32x4x2_t raw
Definition: arm_neon-inl.h:313
int32x2x2_t raw
Definition: arm_neon-inl.h:317
int64x2x2_t raw
Definition: arm_neon-inl.h:329
int64x1x2_t raw
Definition: arm_neon-inl.h:333
int8x16x2_t raw
Definition: arm_neon-inl.h:281
int8x8x2_t raw
Definition: arm_neon-inl.h:285
uint16x8x2_t raw
Definition: arm_neon-inl.h:289
uint16x4x2_t raw
Definition: arm_neon-inl.h:293
uint32x4x2_t raw
Definition: arm_neon-inl.h:305
uint32x2x2_t raw
Definition: arm_neon-inl.h:309
uint64x2x2_t raw
Definition: arm_neon-inl.h:321
uint64x1x2_t raw
Definition: arm_neon-inl.h:325
uint8x16x2_t raw
Definition: arm_neon-inl.h:273
uint8x8x2_t raw
Definition: arm_neon-inl.h:277
Definition: arm_neon-inl.h:265
uint16x8x3_t raw
Definition: arm_neon-inl.h:447
uint16x4x3_t raw
Definition: arm_neon-inl.h:451
uint16x8x3_t raw
Definition: arm_neon-inl.h:439
uint16x4x3_t raw
Definition: arm_neon-inl.h:443
float32x4x3_t raw
Definition: arm_neon-inl.h:456
float32x2x3_t raw
Definition: arm_neon-inl.h:460
int16x8x3_t raw
Definition: arm_neon-inl.h:398
int16x4x3_t raw
Definition: arm_neon-inl.h:402
int32x4x3_t raw
Definition: arm_neon-inl.h:414
int32x2x3_t raw
Definition: arm_neon-inl.h:418
int64x2x3_t raw
Definition: arm_neon-inl.h:430
int64x1x3_t raw
Definition: arm_neon-inl.h:434
int8x16x3_t raw
Definition: arm_neon-inl.h:382
int8x8x3_t raw
Definition: arm_neon-inl.h:386
uint16x8x3_t raw
Definition: arm_neon-inl.h:390
uint16x4x3_t raw
Definition: arm_neon-inl.h:394
uint32x4x3_t raw
Definition: arm_neon-inl.h:406
uint32x2x3_t raw
Definition: arm_neon-inl.h:410
uint64x2x3_t raw
Definition: arm_neon-inl.h:422
uint64x1x3_t raw
Definition: arm_neon-inl.h:426
uint8x16x3_t raw
Definition: arm_neon-inl.h:374
uint8x8x3_t raw
Definition: arm_neon-inl.h:378
Definition: arm_neon-inl.h:267
uint16x8x4_t raw
Definition: arm_neon-inl.h:548
uint16x4x4_t raw
Definition: arm_neon-inl.h:552
uint16x8x4_t raw
Definition: arm_neon-inl.h:540
uint16x4x4_t raw
Definition: arm_neon-inl.h:544
float32x4x4_t raw
Definition: arm_neon-inl.h:557
float32x2x4_t raw
Definition: arm_neon-inl.h:561
int16x8x4_t raw
Definition: arm_neon-inl.h:499
int16x4x4_t raw
Definition: arm_neon-inl.h:503
int32x4x4_t raw
Definition: arm_neon-inl.h:515
int32x2x4_t raw
Definition: arm_neon-inl.h:519
int64x2x4_t raw
Definition: arm_neon-inl.h:531
int64x1x4_t raw
Definition: arm_neon-inl.h:535
int8x16x4_t raw
Definition: arm_neon-inl.h:483
int8x8x4_t raw
Definition: arm_neon-inl.h:487
uint16x8x4_t raw
Definition: arm_neon-inl.h:491
uint16x4x4_t raw
Definition: arm_neon-inl.h:495
uint32x4x4_t raw
Definition: arm_neon-inl.h:507
uint32x2x4_t raw
Definition: arm_neon-inl.h:511
uint64x2x4_t raw
Definition: arm_neon-inl.h:523
uint64x1x4_t raw
Definition: arm_neon-inl.h:527
uint8x16x4_t raw
Definition: arm_neon-inl.h:475
uint8x8x4_t raw
Definition: arm_neon-inl.h:479
Definition: arm_neon-inl.h:269