#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type, size>

#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type, size> a, const Vec128<type, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                \
  const Vec128<type, size> a, const Vec128<type, size> b, \
      const Vec128<type, size> c

#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// Evaluates the intrinsic name formed from prefix##infix##suffix before
// applying it to the arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Defines one overload for the given type/size: the wrapper forwards the
// unwrapped .raw arguments to the intrinsic and rewraps the result.
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }
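// Illustrative expansion (not part of the header): for a hypothetical
// one-argument op "Foo", HWY_NEON_DEF_FUNCTION(uint8_t, 16, Foo, vabsq, _,
// u8, 1) would generate approximately:
//
//   HWY_API Vec128<uint8_t, 16> Foo(const Vec128<uint8_t, 16> a) {
//     return Vec128<uint8_t, 16>(vabsq_u8(a.raw));
//   }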
// uint8_t: full (q) 16-lane vector plus partial vectors of 8/4/2/1 lanes.
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint8_t, 16, name, prefix##q, infix, u8, args) \
  HWY_NEON_DEF_FUNCTION(uint8_t, 8, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8_t, 4, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8_t, 2, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8_t, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int8_t, 16, name, prefix##q, infix, s8, args) \
  HWY_NEON_DEF_FUNCTION(int8_t, 8, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8_t, 4, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8_t, 2, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8_t, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint16_t, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16_t, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16_t, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16_t, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int16_t, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16_t, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16_t, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16_t, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint32_t, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32_t, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32_t, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int32_t, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32_t, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32_t, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint64_t, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64_t, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int64_t, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args)
// float and double: f64 intrinsics are only available on AArch64, so the
// two definitions are split by an architecture guard (the original redefined
// the same macro twice, which is ill-formed without it).
#if HWY_ARCH_ARM_A64
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args)  \
  HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args)     \
  HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args)     \
  HWY_NEON_DEF_FUNCTION(double, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(double, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args)
#endif
// Helper macros that define the desired op for multiple types at once.
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
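// Illustrative use (hypothetical op name): a single invocation such as
//   HWY_NEON_DEF_FUNCTION_ALL_TYPES(SaturatedFoo, vqadd, _, 2)
// stamps out two-argument SaturatedFoo overloads for every lane type and
// vector size, each forwarding to the matching vqadd*_<suffix> intrinsic.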
// Emulation of AArch64-only intrinsics on ARMv7: vuzp/vzip return both
// halves, from which we select val[0] (even/lower) or val[1] (odd/upper).
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif
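// For illustration: given a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3},
// vuzp1q_u32(a, b) yields the even lanes {a0,a2,b0,b2}, vuzp2q_u32 the odd
// lanes {a1,a3,b1,b3}, and vzip1q_u32 interleaves the lower halves into
// {a0,b0,a1,b1}.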
template <typename T, size_t N>
struct Raw128;

template <>
struct Raw128<double, 2> {
  using type = float64x2_t;
};

template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename Raw128<T, N>::type;

 public:
  // Compound assignment; only usable if there is a corresponding non-member
  // binary operator overload.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T, size_t N = 16 / sizeof(T)>
class Mask128;

template <typename T, size_t N>
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
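// Between the #define and #undef groups above, the header stamps out
// byte-reinterpret wrappers (presumably via HWY_NEON_DEF_FUNCTION_* with a
// vreinterpret prefix); HWY_CAST_TO_U8 makes them return
// Vec128<uint8_t, size * sizeof(type)>, i.e. the same bits viewed as bytes.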
template <size_t N, HWY_IF_LE64(int8_t, N)>

template <size_t N, HWY_IF_LE64(uint16_t, N)>

template <size_t N, HWY_IF_LE64(int16_t, N)>

template <size_t N, HWY_IF_LE64(uint32_t, N)>

template <size_t N, HWY_IF_LE64(int32_t, N)>

template <size_t N, HWY_IF_LE64(float, N)>

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
// Returns a vector with all lanes set to "t".
#define HWY_NEON_BUILD_TPL_HWY_SET1
#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
  Simd<type, size> /* tag */, const type t
#define HWY_NEON_BUILD_ARG_HWY_SET1 t

#undef HWY_NEON_BUILD_TPL_HWY_SET1
#undef HWY_NEON_BUILD_RET_HWY_SET1
#undef HWY_NEON_BUILD_PARAM_HWY_SET1
#undef HWY_NEON_BUILD_ARG_HWY_SET1
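// Usage (for illustration): Set(Simd<uint32_t, 4>(), 7u) yields a
// Vec128<uint32_t, 4> with all four lanes equal to 7; the unnamed Simd tag
// only selects the overload, and the HWY_SET1 argument list forwards "t" to
// the vdup*_n_* intrinsic.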
template <typename T, size_t N>

template <typename T, size_t N>

// Returns a vector with lane i = first + i.
template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
  alignas(16) T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
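// For example, Iota(Simd<int32_t, 4>(), 10) returns {10, 11, 12, 13}.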
// GetLane: the bodies below extract lane 0 of each vector type
// (vgetq_lane_* for full vectors, vget_lane_* for 64-bit and smaller).
  return vgetq_lane_u8(v.raw, 0);
  return vget_lane_u8(v.raw, 0);
  return vgetq_lane_s8(v.raw, 0);
  return vget_lane_s8(v.raw, 0);
  return vgetq_lane_u16(v.raw, 0);
  return vget_lane_u16(v.raw, 0);
  return vgetq_lane_s16(v.raw, 0);
  return vget_lane_s16(v.raw, 0);
  return vgetq_lane_u32(v.raw, 0);
  return vget_lane_u32(v.raw, 0);
  return vgetq_lane_s32(v.raw, 0);
  return vget_lane_s32(v.raw, 0);
  return vgetq_lane_u64(v.raw, 0);
  return vget_lane_u64(v.raw, 0);
  return vgetq_lane_s64(v.raw, 0);
  return vget_lane_s64(v.raw, 0);
  return vgetq_lane_f32(v.raw, 0);
  return vget_lane_f32(v.raw, 0);
  return vget_lane_f32(v.raw, 0);
  return vgetq_lane_f64(v.raw, 0);
  return vget_lane_f64(v.raw, 0);
// Shifts by a compile-time constant: redefine HWY_NEON_DEF_FUNCTION so each
// generated function takes the shift amount as a template argument. Some NEON
// shift-by-immediate intrinsics disallow a count of 0, so kBits == 0 returns
// the input and HWY_MAX(1, kBits) keeps the expansion well-formed.
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
  template <int kBits>                                                         \
  HWY_API Vec128<type, size> name(const Vec128<type, size> v) {                \
    return kBits == 0 ? v                                                      \
                      : Vec128<type, size>(HWY_NEON_EVAL(                      \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
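// Usage (for illustration): for a Vec128<uint32_t, 4> v, ShiftLeft<3>(v)
// expands to vshlq_n_u32(v.raw, 3), and a shift by 0 simply returns v.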
template <size_t N, HWY_IF_LE64(uint8_t, N)>
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>

template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>

template <size_t N, HWY_IF_LE64(uint8_t, N)>
template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>

template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
  return v << Set(Simd<T, N>(), static_cast<T>(bits));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
  return v >> Set(Simd<T, N>(), static_cast<T>(bits));
}

template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(uint32_t, N)>

template <size_t N, HWY_IF_LE64(uint16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
                                const Vec128<int16_t> b) {
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
#if HWY_ARCH_ARM_A64
  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
#else
  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
#endif
  return Vec128<int16_t>(
      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
}
HWY_API Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
                                 const Vec128<uint16_t> b) {
  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
#if HWY_ARCH_ARM_A64
  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
#else
  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
#endif
  return Vec128<uint16_t>(
      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
}

template <size_t N, HWY_IF_LE64(int16_t, N)>
  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));

template <size_t N, HWY_IF_LE64(uint16_t, N)>
  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));

#if HWY_ARCH_ARM_A64

template <size_t N, HWY_IF_LE64(float, N)>
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
                             const Vec128<float> add) {
  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
}
#else
// Emulated FMA (the compiler may still fuse the two ops).
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  return mul * x + add;
}
#endif

#if HWY_ARCH_ARM_A64
HWY_API Vec128<double, 1> MulAdd(const Vec128<double, 1> mul,
                                 const Vec128<double, 1> x,
                                 const Vec128<double, 1> add) {
  return Vec128<double, 1>(vfma_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
                              const Vec128<double> add) {
  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
}
#endif

#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
                                const Vec128<float> add) {
  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
}
#else
// Emulated.
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  return add - mul * x;
}
#endif

#if HWY_ARCH_ARM_A64
HWY_API Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul,
                                    const Vec128<double, 1> x,
                                    const Vec128<double, 1> add) {
  return Vec128<double, 1>(vfms_f64(add.raw, mul.raw, x.raw));
}
HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
                                 const Vec128<double> x,
                                 const Vec128<double> add) {
  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
}
#endif

#if HWY_ARCH_ARM_A64
template <size_t N>
HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
                                 const Vec128<double, N> x,
                                 const Vec128<double, N> sub) {
  return MulAdd(mul, x, Neg(sub));
}
template <size_t N>
HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
                                    const Vec128<double, N> x,
                                    const Vec128<double, N> sub) {
  return Neg(MulAdd(mul, x, sub));
}
#endif
#if HWY_ARCH_ARM_A64

  const auto root = v * recip;

template <typename T>

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  using V8 = decltype(Zero(d8));

namespace internal {

}  // namespace internal

// Returns ~not_mask & mask.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  return internal::reversed_andnot(mask, not_mask);
}

// For floats, bit-cast to unsigned and use the overloads above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  const Simd<MakeUnsigned<T>, N> du;
  Vec128<MakeUnsigned<T>, N> ret =
      internal::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
  return BitCast(Simd<T, N>(), ret);
}
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
// PopulationCount: vcnt gives per-byte bit counts; vpaddl(q) repeatedly sums
// adjacent pairs to widen the counts to the lane size.
template <typename T>

template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T>
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);

template <typename T>
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);

template <typename T>
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>

template <size_t N, HWY_IF_LE64(int8_t, N)>
template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(float, N)>
#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> Abs(const Vec128<double> v) {
  return Vec128<double>(vabsq_f64(v.raw));
}
HWY_API Vec128<double, 1> Abs(const Vec128<double, 1> v) {
  return Vec128<double, 1>(vabs_f64(v.raw));
}
#endif

template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

template <typename T, size_t N>
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename TFrom, typename TTo, size_t N>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

#define HWY_NEON_BUILD_TPL_HWY_IF
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size)                 \
  const Mask128<type, size> mask, const Vec128<type, size> yes, \
      const Vec128<type, size> no
#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw

#undef HWY_NEON_BUILD_TPL_HWY_IF
#undef HWY_NEON_BUILD_RET_HWY_IF
#undef HWY_NEON_BUILD_PARAM_HWY_IF
#undef HWY_NEON_BUILD_ARG_HWY_IF
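// Between the #define and #undef groups above, IfThenElse is generated for
// all types (presumably via vbsl, a bitwise select): lanes where the mask
// bits are set come from "yes", the rest from "no".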
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
  return Max(zero, v);

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
#define HWY_NEON_BUILD_TPL_HWY_COMPARE
#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
  const Vec128<type, size> a, const Vec128<type, size> b
#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw

#if HWY_ARCH_ARM_A64

template <typename T, size_t N>

#if HWY_ARCH_ARM_A64

#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
#undef HWY_NEON_BUILD_RET_HWY_COMPARE
#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
#undef HWY_NEON_BUILD_ARG_HWY_COMPARE

                               const Vec128<int64_t, N> b) {
  const Simd<int32_t, N * 2> d32;
  const Simd<int64_t, N> d64;

                               const Vec128<uint64_t, N> b) {
  const Simd<uint32_t, N * 2> d32;
  const Simd<uint64_t, N> d64;

                               const Vec128<int64_t> b) {
  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);

                               const Vec128<int64_t, 1> b) {
  const int64x1_t sub = vqsub_s64(a.raw, b.raw);

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
  Vec128<type, size> v, Vec128<type, size> bit
#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw

#if HWY_ARCH_ARM_A64

  return (v & bit) == bit;

  return (v & bit) == bit;

#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT

#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64
HWY_API Mask128<uint64_t> operator<(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Mask128<uint64_t>(vcltq_u64(a.raw, b.raw));
}
HWY_API Mask128<uint64_t, 1> operator<(Vec128<uint64_t, 1> a,
                                       Vec128<uint64_t, 1> b) {
  return Mask128<uint64_t, 1>(vclt_u64(a.raw, b.raw));
}
                                 const Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64

                                 const Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64

                                 const Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64

                                 const Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64

#if HWY_ARCH_ARM_A64
  return Vec128<double>(vld1q_f64(unaligned));

#if HWY_ARCH_ARM_A64
  return Vec128<double, 1>(vld1_f64(p));
  uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);

  uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);

  uint32x2_t b = vld1_lane_u32(p, a, 0);

  int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);

  int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);

  int32x2_t b = vld1_lane_s32(p, a, 0);

  float32x2_t b = vld1_lane_f32(p, a, 0);

  uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0);

  uint16x4_t b = vld1_lane_u16(p, a, 0);

  int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0);

  int16x4_t b = vld1_lane_s16(p, a, 0);

  uint8x8_t b = vld1_lane_u8(p, a, 0);

  int8x8_t b = vld1_lane_s8(p, a, 0);

  const auto pu16 = reinterpret_cast<const uint16_t*>(p);

  const auto pu16 = reinterpret_cast<const uint16_t*>(p);

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_LE128(T, N)>
  vst1q_u8(unaligned, v.raw);
  vst1q_u16(unaligned, v.raw);
  vst1q_u32(unaligned, v.raw);
  vst1q_u64(unaligned, v.raw);
  vst1q_s8(unaligned, v.raw);
  vst1q_s16(unaligned, v.raw);
  vst1q_s32(unaligned, v.raw);
  vst1q_s64(unaligned, v.raw);
  vst1q_f32(unaligned, v.raw);

#if HWY_ARCH_ARM_A64
HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
                    double* HWY_RESTRICT unaligned) {
  vst1q_f64(unaligned, v.raw);
}

#if HWY_ARCH_ARM_A64
HWY_API void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
                    double* HWY_RESTRICT p) {
  vst1_f64(p, v.raw);
}
  uint32x2_t a = vreinterpret_u32_u8(v.raw);
  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);

  uint32x2_t a = vreinterpret_u32_u16(v.raw);
  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);

  vst1_lane_u32(p, v.raw, 0);

  int32x2_t a = vreinterpret_s32_s8(v.raw);
  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);

  int32x2_t a = vreinterpret_s32_s16(v.raw);
  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);

  vst1_lane_s32(p, v.raw, 0);

  vst1_lane_f32(p, v.raw, 0);

  uint16x4_t a = vreinterpret_u16_u8(v.raw);
  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);

  vst1_lane_u16(p, v.raw, 0);

  int16x4_t a = vreinterpret_s16_s8(v.raw);
  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);

  vst1_lane_s16(p, v.raw, 0);

  vst1_lane_u8(p, v.raw, 0);

  vst1_lane_s8(p, v.raw, 0);

  const auto pu16 = reinterpret_cast<uint16_t*>(p);

  const auto pu16 = reinterpret_cast<uint16_t*>(p);

template <typename T, size_t N>

template <typename T, size_t N>
  Store(v, d, aligned);
  uint16x8_t a = vmovl_u8(v.raw);

  uint16x8_t a = vmovl_u8(v.raw);

template <size_t N, HWY_IF_LE64(uint16_t, N)>

template <size_t N, HWY_IF_LE64(uint32_t, N)>
  uint16x8_t a = vmovl_u8(v.raw);

template <size_t N, HWY_IF_LE64(uint64_t, N)>

template <size_t N, HWY_IF_LE64(int16_t, N)>

template <size_t N, HWY_IF_LE64(int32_t, N)>
  uint16x8_t a = vmovl_u8(v.raw);
  uint32x4_t b = vmovl_u16(vget_low_u16(a));

template <size_t N, HWY_IF_LE64(int32_t, N)>
  uint32x4_t a = vmovl_u16(v.raw);

  int16x8_t a = vmovl_s8(v.raw);

  int16x8_t a = vmovl_s8(v.raw);
  int32x4_t b = vmovl_s16(vget_low_s16(a));
HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
                                const Vec128<float16_t, 4> v) {
  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
  return Vec128<float>(f32);
}
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
                                   const Vec128<float16_t, N> v) {
  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
  return Vec128<float, N>(vget_low_f32(f32));
}

// Software decode of binary16 when vcvt_f32_f16 is unavailable:
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
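// Worked example: the binary16 value 0x3C00 has sign 0, biased_exp 0x0F and
// mantissa 0; biased_exp32 = 15 + (127 - 15) = 127, so the result bits are
// 127 << 23, i.e. 1.0f.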
#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
                                 const Vec128<float, 2> v) {
  return Vec128<double>(vcvt_f64_f32(v.raw));
}
HWY_API Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */,
                                    const Vec128<float, 1> v) {
  return Vec128<double, 1>(vget_low_f64(vcvt_f64_f32(v.raw)));
}
HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
                                 const Vec128<int32_t, 2> v) {
  const int64x2_t i64 = vmovl_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}
HWY_API Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */,
                                    const Vec128<int32_t, 1> v) {
  const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
  return Vec128<double, 1>(vcvt_f64_s64(i64));
}
#endif

  const uint16x4_t a = vqmovun_s32(v.raw);

  const int16x4_t a = vqmovn_s32(v.raw);

template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));

template <size_t N, HWY_IF_LE64(int16_t, N)>
template <size_t N, HWY_IF_LE64(int32_t, N)>
  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));

template <size_t N, HWY_IF_LE64(int16_t, N)>

HWY_API Vec128<float16_t, 4> DemoteTo(Simd<float16_t, 4> /* tag */,
                                      const Vec128<float> v) {
  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
}
template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
                                      const Vec128<float, N> v) {
  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
}
// Software encode of binary16:
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
      BitCast(du, IfThenZeroElse(is_subnormal, exp + Set(di, 15)));
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
                                     ShiftRight<13>(mantissa32));

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
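// Worked example: for 1.0f, biased_exp32 = 127, so exp = min(0, 15) = 0 and
// biased_exp16 = 0 + 15 = 15; the half bits are 15 << 10 = 0x3C00, i.e. 1.0
// in binary16.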
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));

  const Repartition<uint32_t, decltype(dbf16)> du32;
#if HWY_ARCH_ARM_A64
HWY_API Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */,
                                  const Vec128<double> v) {
  return Vec128<float, 2>(vcvt_f32_f64(v.raw));
}
HWY_API Vec128<float, 1> DemoteTo(Simd<float, 1> /* tag */,
                                  const Vec128<double, 1> v) {
  return Vec128<float, 1>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
}
HWY_API Vec128<int32_t, 2> DemoteTo(Simd<int32_t, 2> /* tag */,
                                    const Vec128<double> v) {
  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
  return Vec128<int32_t, 2>(vqmovn_s64(i64));
}
HWY_API Vec128<int32_t, 1> DemoteTo(Simd<int32_t, 1> /* tag */,
                                    const Vec128<double, 1> v) {
  const int64x1_t i64 = vcvt_s64_f64(v.raw);
  // There is no direct i64x1 -> i32x1 narrowing, so widen to int64x2_t first.
  const int64x2_t i64x2 = vcombine_s64(i64, i64);
  return Vec128<int32_t, 1>(vqmovn_s64(i64x2));
}
#endif

  const uint8x16_t w = vuzp1q_u8(org_v, org_v);

template <size_t N, HWY_IF_LE64(uint32_t, N)>
  const uint8x8_t w = vuzp1_u8(org_v, org_v);

  uint16x8_t c = vcombine_u16(a.raw, b.raw);

  int16x8_t c = vcombine_s16(a.raw, b.raw);

                                 const Vec128<int32_t> v) {

template <size_t N, HWY_IF_LE64(int32_t, N)>

template <size_t N, HWY_IF_LE64(float, N)>
#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
                                 const Vec128<int64_t> v) {
  return Vec128<double>(vcvtq_f64_s64(v.raw));
}
HWY_API Vec128<double, 1> ConvertTo(Simd<double, 1> /* tag */,
                                    const Vec128<int64_t, 1> v) {
  return Vec128<double, 1>(vcvt_f64_s64(v.raw));
}
// Truncates (rounds toward zero).
HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
                                  const Vec128<double> v) {
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
}
HWY_API Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> /* tag */,
                                     const Vec128<double, 1> v) {
  return Vec128<int64_t, 1>(vcvt_s64_f64(v.raw));
}
#endif

#if HWY_ARCH_ARM_A64

  const auto int_f = ConvertTo(df, integer);

  const auto added = large + v;
  const auto rounded = added - large;

  const auto int_f = ConvertTo(df, integer);

  const auto int_f = ConvertTo(df, integer);

#if HWY_ARCH_ARM_A64
  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));

template <size_t N, HWY_IF_LE64(float, N)>
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>

#if HWY_ARCH_ARM_A64
  return Vec128<double, 1>(vget_low_f64(v.raw));

template <typename T, size_t N>

template <int kBytes, typename T, class V128 = Vec128<T>>
HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec128<uint8_t>(v8));
}

template <int kBytes, typename T, class V64 = Vec128<T, 8 / sizeof(T)>>
HWY_API V64 CombineShiftRightBytes(Simd<T, 8 / sizeof(T)> d, V64 hi, V64 lo) {
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, VFromD<decltype(d8)>(v8));
}
template <int kBytes>

template <class T, size_t N, HWY_IF_LE64(T, N)>
  // For partial vectors, clear the upper lanes so zeros are shifted in.
  const Simd<T, 8 / sizeof(T)> d64;
  const auto zero64 = Zero(d64);
  const decltype(zero64) v64(v.raw);
  return Vec128<T, N>(
      CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);

template <class T, size_t N>

template <class T, size_t N>

template <int kBytes>

template <class T, size_t N>
  if (N * sizeof(T) < 8) {
    constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
    const Simd<T, kReg / sizeof(T)> dreg;

  return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
template <class T, size_t N>

template <class T, size_t N>

template <int kBytes, typename T, size_t N>

template <int kBytes, typename T, size_t N>
  return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);

template <int kLanes, typename T, size_t N>

template <int kLanes, typename T, size_t N>
  return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);

template <int kBytes, typename T, size_t N>

template <int kLanes, typename T, size_t N>

template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
  constexpr size_t kSize = N * sizeof(T);
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");

  using V64 = VFromD<decltype(d_full8)>;
  const V64 hi64(BitCast(d8, hi).raw);
#if HWY_ARCH_ARM_A64
HWY_API Vec128<double, 1> UpperHalf(Simd<double, 1> /* tag */,
                                    const Vec128<double> v) {
  return Vec128<double, 1>(vget_high_f64(v.raw));
}

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  return Vec128<T, (N + 1) / 2>(upper.raw);
#if HWY_ARCH_ARM_A64
// Unsigned
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
}

// Signed
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
}

// Float
template <int kLane>
HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(float, N)>
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<double, 1> Broadcast(const Vec128<double, 1> v) {
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
  return v;
}
#else
// ARMv7 lacks vdupq_laneq_*; only the lane-range checks are retained here.
template <int kLane>
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");

template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

// Signed
template <int kLane>
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");

template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

// Float
template <int kLane>
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

template <int kLane, size_t N, HWY_IF_LE64(float, N)>
  static_assert(0 <= kLane && kLane < N, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");

template <int kLane>
  static_assert(0 <= kLane && kLane < 1, "Invalid lane");

#endif
template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_LE128(T, N)>
#if HWY_IS_DEBUG_BUILD
  for (size_t i = 0; i < N; ++i) {
    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
  }
#endif

  alignas(16) uint8_t control[16] = {0};
  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
    for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
      control[idx_lane * sizeof(T) + idx_byte] = static_cast<uint8_t>(
          static_cast<size_t>(idx[idx_lane]) * sizeof(T) + idx_byte);
    }
  }
template <typename T>

template <typename T>

template <typename T>

// Swap 64-bit halves.
template <typename T>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
}
template <typename T>
HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
}

// Rotate right 32 bits.
template <typename T>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
  return CombineShiftRightBytes<4>(Full128<T>(), v, v);
}

// Rotate left 32 bits.
template <typename T>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
  return CombineShiftRightBytes<12>(Full128<T>(), v, v);
}

template <typename T>
#if HWY_ARCH_ARM_A64
// For 64-bit lanes, interleave the lower halves via zip1.
HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
                                         const Vec128<uint64_t> b) {
  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
}
HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
                                        const Vec128<int64_t> b) {
  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
}
HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
                                       const Vec128<double> b) {
  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
}

template <size_t N, HWY_IF_LE64(float, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>
  using V64 = Vec128<T, 8 / sizeof(T)>;

template <typename T, size_t N, class V = Vec128<T, N>>

#if HWY_ARCH_ARM_A64
HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
                                         const Vec128<uint64_t> b) {
  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
}
HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
                                        const Vec128<int64_t> b) {
  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
}
HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
                                       const Vec128<double> b) {
  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
}
template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>

template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
  const Half<decltype(d)> d2;

template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>

template <typename T, size_t N, class D = Simd<T, N>,
          class DW = RepartitionToWide<D>>

template <typename T, size_t N, class D = Simd<T, N>,
          class DW = RepartitionToWide<D>>

#if HWY_ARCH_ARM_A64
HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec128<double, 1> hi,
                               Vec128<double, 1> lo) {
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_GE64(T, N)>

#if HWY_ARCH_ARM_A64
HWY_API Vec128<uint8_t, 2> ConcatEven(Simd<uint8_t, 2> /* tag */,
                                      Vec128<uint8_t, 2> hi,
                                      Vec128<uint8_t, 2> lo) {
  return Vec128<uint8_t, 2>(vtrn1_u8(lo.raw, hi.raw));
}
HWY_API Vec128<uint16_t, 2> ConcatEven(Simd<uint16_t, 2> /* tag */,
                                       Vec128<uint16_t, 2> hi,
                                       Vec128<uint16_t, 2> lo) {
  return Vec128<uint16_t, 2>(vtrn1_u16(lo.raw, hi.raw));
}

template <typename T, size_t N, HWY_IF_LE32(T, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>
  const Half<decltype(d)> d2;

template <typename T, size_t N, HWY_IF_GE64(T, N)>

#if HWY_ARCH_ARM_A64
HWY_API Vec128<uint8_t, 2> ConcatOdd(Simd<uint8_t, 2> /* tag */,
                                     Vec128<uint8_t, 2> hi,
                                     Vec128<uint8_t, 2> lo) {
  return Vec128<uint8_t, 2>(vtrn2_u8(lo.raw, hi.raw));
}
HWY_API Vec128<uint16_t, 2> ConcatOdd(Simd<uint16_t, 2> /* tag */,
                                      Vec128<uint16_t, 2> hi,
                                      Vec128<uint16_t, 2> lo) {
  return Vec128<uint16_t, 2>(vtrn2_u16(lo.raw, hi.raw));
}
template <typename T, size_t N, HWY_IF_LE32(T, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>
  const Half<decltype(d)> d2;

template <typename T, size_t N, HWY_IF_GE64(T, N)>

template <typename T, size_t N, HWY_IF_LE32(T, N)>
  constexpr size_t kSize = N * sizeof(T);

  const Simd<uint8_t, 8> d8x8;
  const Simd<T, 8 / sizeof(T)> d64;
  using V8x8 = VFromD<decltype(d8x8)>;
  const V8x8 hi8x8(BitCast(d8, hi).raw);

  return Vec128<T, N>(BitCast(d64, r).raw);

template <typename T, size_t N>
template <size_t N, HWY_IF_LE64(uint32_t, N)>

template <size_t N, HWY_IF_LE64(int32_t, N)>

template <size_t N, HWY_IF_LE64(float, N)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <size_t N, HWY_IF_LE64(uint32_t, N)>

template <size_t N, HWY_IF_LE64(int32_t, N)>

template <size_t N, HWY_IF_LE64(float, N)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  alignas(16) constexpr uint8_t kBytes[16] = {
      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
  };
#if defined(__ARM_FEATURE_AES)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
}

HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>(
      (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
}
  const Rebind<uint16_t, decltype(df32)> du16;

  int32x4_t a_packed = ConcatEven(d, a, a).raw;
  int32x4_t b_packed = ConcatEven(d, b, b).raw;
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));

  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));

  int32x2_t a_packed = ConcatEven(d, a, a).raw;
  int32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int64_t, (N + 1) / 2>(
      vget_low_s64(vmull_s32(a_packed, b_packed)));

  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint64_t, (N + 1) / 2>(
      vget_low_u64(vmull_u32(a_packed, b_packed)));

  uint64_t hi;
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);

  uint64_t hi;
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
template <typename T, typename TI>

#if HWY_ARCH_ARM_A64

  // ARMv7 lacks vqtbl1q: split the table into two 64-bit halves for vtbl2.
  uint8x16_t table0 = BitCast(d8, bytes).raw;
  uint8x8x2_t table;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  uint8x16_t idx = BitCast(d8, from).raw;
  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));

template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
  const Vec128<TI, 8 / sizeof(T)> from64(from.raw);
  const auto idx_full = Combine(d_full, from64, from64);

template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>

template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
          HWY_IF_LE64(TI, NI)>
  const Repartition<uint8_t, decltype(d_idx)> d_idx8;

  const auto from8 = BitCast(d_idx8, from);
  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));

template <class V, class VI>
template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Simd<Offset, N>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Simd<Index, N>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) Offset offset_lanes[N];
  Store(offset, Simd<Offset, N>(), offset_lanes);

  alignas(16) T lanes[N];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) Index index_lanes[N];
  Store(index, Simd<Index, N>(), index_lanes);

  alignas(16) T lanes[N];
  for (size_t i = 0; i < N; ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}
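// Usage (for illustration): with base = {10, 11, 12, 13} and
// index = {3, 0, 2, 1}, GatherIndex returns {13, 10, 12, 11}.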
template <typename T>

template <typename T>

template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T>

template <typename T>
#if HWY_ARCH_ARM_A64
  return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));

  return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));

  return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));

  return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));

  return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
#else
// ARMv7 version for everything except doubles.
  uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
  uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
  uint32x4x2_t v1 = vuzpq_u32(c0, c0);

  int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
  int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
  int32x4x2_t v1 = vuzpq_s32(c0, c0);

  float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
  float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
  float32x4x2_t v1 = vuzpq_f32(c0, c0);
#endif

template <typename T>
  return Min(v20_31_20_31, v31_20_31_20);

template <typename T>
  return Max(v20_31_20_31, v31_20_31_20);

template <typename T>
  return Min(v10, v01);

template <typename T>
  return Max(v10, v01);
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T>

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
  const auto vmask_bits = Set64(du, mask_bits);

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             1, 1, 1, 1, 1, 1, 1, 1};

  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
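// For example, mask_bits = 0b0000'0101 selects lanes 0 and 2: after the 8x
// byte replication, byte 0 ANDs with kBit[0] = 1 (nonzero) and byte 2 with
// kBit[2] = 4 (nonzero), while all other lanes test as zero.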
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t kBit[8] = {1, 2};

template <typename T, size_t N, HWY_IF_LE128(T, N)>
  uint64_t mask_bits = 0;
  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
template <typename T>
  alignas(16) constexpr uint8_t kSliceLanes[16] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
  };

#if HWY_ARCH_ARM_A64
  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
  const uint8x8_t x4 = vpadd_u8(x2, x2);
  const uint8x8_t x8 = vpadd_u8(x4, x4);
  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
#else
  const uint16x8_t x2 = vpaddlq_u8(values.raw);
  const uint32x4_t x4 = vpaddlq_u16(x2);
  const uint64x2_t x8 = vpaddlq_u32(x4);
  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
#endif
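// Note: each active lane contributes a distinct power of two, so the
// pairwise additions (vpadd/vpaddl) act like OR and accumulate the per-lane
// bits into a single mask word.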
template <typename T, size_t N, HWY_IF_LE64(T, N)>
  alignas(8) constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
                                                 0x10, 0x20, 0x40, 0x80};

#if HWY_ARCH_ARM_A64
  return vaddv_u8(values.raw);
#else
  const uint16x4_t x2 = vpaddl_u8(values.raw);
  const uint32x2_t x4 = vpaddl_u16(x2);
  const uint64x1_t x8 = vpaddl_u32(x4);
  return vget_lane_u64(x8, 0);
#endif

template <typename T>
  alignas(16) constexpr uint16_t kSliceLanes[8] = {1,    2,    4,    8,
                                                   0x10, 0x20, 0x40, 0x80};

#if HWY_ARCH_ARM_A64
  return vaddvq_u16(values.raw);
#else
  const uint32x4_t x2 = vpaddlq_u16(values.raw);
  const uint64x2_t x4 = vpaddlq_u32(x2);
  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
#endif

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};

#if HWY_ARCH_ARM_A64
  return vaddv_u16(values.raw);
#else
  const uint32x2_t x2 = vpaddl_u16(values.raw);
  const uint64x1_t x4 = vpaddl_u32(x2);
  return vget_lane_u64(x4, 0);
#endif

template <typename T>
  alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};

#if HWY_ARCH_ARM_A64
  return vaddvq_u32(values.raw);
#else
  const uint64x2_t x2 = vpaddlq_u32(values.raw);
  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
#endif

template <typename T, size_t N, HWY_IF_LE64(T, N)>
  alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};

#if HWY_ARCH_ARM_A64
  return vaddv_u32(values.raw);
#else
  const uint64x1_t x2 = vpaddl_u32(values.raw);
  return vget_lane_u64(x2, 0);
#endif

template <typename T>
  alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};

#if HWY_ARCH_ARM_A64
  return vaddvq_u64(values.raw);
#else
  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
#endif

template <typename T>
  return vget_lane_u64(values.raw, 0);

template <typename T, size_t N>
  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));

template <typename T, size_t N>
// CountTrue: negating the all-ones mask lanes yields 1 per active lane,
// which pairwise additions then sum.
template <typename T>
  const int8x16_t ones =
      vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s8(ones));
#else
  const int16x8_t x2 = vpaddlq_s8(ones);
  const int32x4_t x4 = vpaddlq_s16(x2);
  const int64x2_t x8 = vpaddlq_s32(x4);
  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
#endif

template <typename T>
  const int16x8_t ones =
      vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s16(ones));
#else
  const int32x4_t x2 = vpaddlq_s16(ones);
  const int64x2_t x4 = vpaddlq_s32(x2);
  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
#endif

template <typename T>
  const int32x4_t ones =
      vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s32(ones));
#else
  const int64x2_t x2 = vpaddlq_s32(ones);
  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
#endif

template <typename T>
#if HWY_ARCH_ARM_A64
  const int64x2_t ones =
      vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
  return static_cast<size_t>(vaddvq_s64(ones));
#else
  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
#endif
template <typename T>

template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N>

template <typename T, size_t N>
  const size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);

template <typename T>
#if HWY_ARCH_ARM_A64
  return (vmaxvq_u32(m32.raw) == 0);
#else
  uint32x2_t a = vqmovn_u64(v64.raw);
  return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
#endif

template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N>
                                        const uint8_t* bytes) {
      vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));

template <size_t N, HWY_IF_LE64(uint8_t, N)>
                                        const uint8_t* bytes) {
  return Load(d, bytes);

template <typename T, size_t N>
                                const uint64_t mask_bits) {
4595 alignas(16) constexpr uint8_t table[256 * 8] = {
4596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
4597 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
4598 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4599 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
4600 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
4601 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
4602 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
4603 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
4604 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
4605 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
4606 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
4607 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
4608 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
4609 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
4610 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
4611 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
4612 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
4613 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
4614 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
4615 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
4616 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
4617 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
4618 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
4619 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
4620 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
4621 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
4622 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
4623 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
4624 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
4625 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
4626 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
4627 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
4628 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
4629 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
4630 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
4631 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
4632 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
4633 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
4634 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
4635 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
4636 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
4637 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
4638 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
4639 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
4640 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
4641 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
4642 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
4643 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
4644 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
4645 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
4646 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
4647 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
4648 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
4649 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
4650 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
4651 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
4652 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
4653 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
4654 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
4655 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
4656 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
4657 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
4658 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
4659 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
4660 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
4661 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
4662 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
4663 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
4664 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
4665 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
4666 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
4667 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
4668 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
4669 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
4670 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
4671 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
4672 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
4673 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
4674 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
4675 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
4676 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
4677 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
4678 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
4679 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
4680 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
4681 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
4682 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
4683 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
4684 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
4685 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
4686 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
4687 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
4688 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
4689 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
4690 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
4691 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
4692 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
4693 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
4694 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
4695 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
4696 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
4697 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
4698 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
4699 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
4700 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
4701 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
4702 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
4703 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
4704 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
4705 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
4706 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
4707 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
4708 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
4709 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
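// For 32-bit lanes there are at most 4 mask bits (16 combinations), so the
// helper below can afford one full 16-byte shuffle row per mask. Example
// (illustrative): mask_bits = 5 (0b0101) selects lanes 0 and 2, and row 5 of
// the table begins 0,1,2,3, 8,9,10,11: exactly the bytes of those two lanes,
// moved to the front of the vector.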
4716 template <typename T, size_t N>
4717 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4>,
4718                                     const uint64_t mask_bits) {
4722 alignas(16) constexpr uint8_t packed_array[16 * 16] = {
4723 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4724 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4725 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4726 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3,
4727 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4728 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
4729 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3,
4730 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
4731 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4732 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
4733 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
4734 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3,
4735 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
4736 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
4737 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
4738 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4740   const Simd<T, N> d;
4741   const Repartition<uint8_t, decltype(d)> d8;
4742   return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
4743 }
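// The 64-bit-lane variant below is compiled only when the target supports
// 64-bit integer or double lanes; with two lanes there are just 2^2 = 4 mask
// patterns, hence a table of only 4 rows of 16 bytes.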
4745 #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64
4747 template <typename T, size_t N>
4748 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8>,
4749                                     const uint64_t mask_bits) {
4753 alignas(16) constexpr uint8_t packed_array[4 * 16] = {
4754 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
4755 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
4756 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
4757 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4759   const Simd<T, N> d;
4760   const Repartition<uint8_t, decltype(d)> d8;
4761   return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
4762 }
4764 #endif
4768 template <typename T, size_t N>
4769 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
4770   const auto idx =
4771       detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
4772   using D = Simd<T, N>;
4773   const RebindToSigned<D> di;
4774   return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4775 }
4779 template <typename T, size_t N>
4780 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
4781   return detail::Compress(v, detail::BitsFromMask(mask));
4782 }
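// Usage sketch (illustrative values only):
//   const Simd<uint16_t, 8> d;
//   const auto v = Iota(d, 0);           // {0, 1, ..., 7}
//   const auto m = Lt(v, Set(d, 3));     // lanes 0..2 active
//   const auto packed = Compress(v, m);  // 0, 1, 2 in the low lanes
// Only the first CountTrue(d, m) lanes of the result are meaningful.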
4786 template <typename T, size_t N>
4787 HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
4788                                      const uint8_t* HWY_RESTRICT bits) {
4789   uint64_t mask_bits = 0;
4790   constexpr size_t kNumBytes = (N + 7) / 8;
4791   CopyBytes<kNumBytes>(bits, &mask_bits);
4792   if (N < 8) {
4793     mask_bits &= (1ull << N) - 1;
4794   }
4795   return detail::Compress(v, mask_bits);
4796 }
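// CompressBits is the bit-array form: it reads kNumBytes = ceil(N/8) bytes of
// mask bits and clears any stray bits at or above lane N before compressing.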
4801 template <typename T, size_t N>
4802 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
4803                              Simd<T, N> d, T* HWY_RESTRICT unaligned) {
4804   const uint64_t mask_bits = detail::BitsFromMask(mask);
4805   StoreU(detail::Compress(v, mask_bits), d, unaligned);
4806   return PopCount(mask_bits);
4807 }
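// CompressStore performs Compress and StoreU in one call; the return value is
// the number of lanes written, i.e. the PopCount of the mask bits.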
4811 template <typename T, size_t N>
4812 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4813                                  const uint8_t* HWY_RESTRICT bits,
4814                                  Simd<T, N> d, T* HWY_RESTRICT unaligned) {
4815   uint64_t mask_bits = 0;
4816   constexpr size_t kNumBytes = (N + 7) / 8;
4817   CopyBytes<kNumBytes>(bits, &mask_bits);
4818   if (N < 8) {
4819     mask_bits &= (1ull << N) - 1;
4820   }
4821   StoreU(detail::Compress(v, mask_bits), d, unaligned);
4822   return PopCount(mask_bits);
4823 }
4829 HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
4830                                const Vec128<uint8_t> v1,
4831                                const Vec128<uint8_t> v2, Full128<uint8_t>,
4832                                uint8_t* HWY_RESTRICT unaligned) {
4834   const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw};
4835   vst3q_u8(unaligned, triple);
4836 }
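// Interleaves three registers while storing, e.g. planar-to-packed RGB.
// Rough sketch (hypothetical buffers r, g, b, rgb):
//   const Full128<uint8_t> d;
//   StoreInterleaved3(LoadU(d, r), LoadU(d, g), LoadU(d, b), d, rgb);
// vst3q_u8 lowers to ST3 and writes 48 bytes: r0,g0,b0,r1,g1,b1,...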
4840 HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
4841                                const Vec128<uint8_t, 8> v1,
4842                                const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8>,
4843                                uint8_t* HWY_RESTRICT unaligned) {
4844   const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
4845   vst3_u8(unaligned, triple);
4846 }
4849 template <size_t N, HWY_IF_LE32(uint8_t, N)>
4850 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
4851                                const Vec128<uint8_t, N> v1,
4852                                const Vec128<uint8_t, N> v2, Simd<uint8_t, N>,
4853                                uint8_t* HWY_RESTRICT unaligned) {
4855   alignas(16) uint8_t buf[24];
4856   const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
4857   vst3_u8(buf, triple);
4858   CopyBytes<N * 3>(buf, unaligned);
4859 }
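// For partial vectors (N <= 4 lanes), vst3_u8 would still write all 24 bytes,
// so the data is staged in a stack buffer and only the valid 3 * N bytes are
// copied to the destination.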
4864 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
4865                                const Vec128<uint8_t> v1,
4866                                const Vec128<uint8_t> v2,
4867                                const Vec128<uint8_t> v3, Full128<uint8_t>,
4868                                uint8_t* HWY_RESTRICT unaligned) {
4870   const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
4871   vst4q_u8(unaligned, quad);
4872 }
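// The 4-way variant suits RGBA pixels; vst4q_u8 lowers to ST4 and writes 64
// bytes in the order r0,g0,b0,a0,r1,...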
4876 HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> v0,
4877                                const Vec128<uint8_t, 8> v1,
4878                                const Vec128<uint8_t, 8> v2,
4879                                const Vec128<uint8_t, 8> v3, Simd<uint8_t, 8>,
4880                                uint8_t* HWY_RESTRICT unaligned) {
4881   const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
4882   vst4_u8(unaligned, quad);
4883 }
4886 template <size_t N, HWY_IF_LE32(uint8_t, N)>
4887 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> v0,
4888                                const Vec128<uint8_t, N> v1,
4889                                const Vec128<uint8_t, N> v2,
4890                                const Vec128<uint8_t, N> v3, Simd<uint8_t, N>,
4891                                uint8_t* HWY_RESTRICT unaligned) {
4893   alignas(16) uint8_t buf[32];
4894   const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
4895   vst4_u8(buf, quad);
4896   CopyBytes<N * 4>(buf, unaligned);
4897 }
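// Deprecated compatibility overloads: these take no Simd<> tag and simply
// forward to the tag-taking definitions earlier in this file.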
4901 template <typename T, size_t N>
4902 HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
4903   return StoreMaskBits(Simd<T, N>(), mask, bits);
4904 }
4906 template <typename T, size_t N>
4907 HWY_API bool AllTrue(const Mask128<T, N> mask) {
4908   return AllTrue(Simd<T, N>(), mask);
4909 }
4911 template <typename T, size_t N>
4912 HWY_API bool AllFalse(const Mask128<T, N> mask) {
4913   return AllFalse(Simd<T, N>(), mask);
4914 }
4916 template <typename T, size_t N>
4917 HWY_API size_t CountTrue(const Mask128<T, N> mask) {
4918   return CountTrue(Simd<T, N>(), mask);
4919 }
4921 template <typename T, size_t N>
4922 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
4923   return SumOfLanes(Simd<T, N>(), v);
4924 }
4925 template <typename T, size_t N>
4926 HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
4927   return MinOfLanes(Simd<T, N>(), v);
4928 }
4929 template <typename T, size_t N>
4930 HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
4931   return MaxOfLanes(Simd<T, N>(), v);
4932 }
4934 template <typename T, size_t N>
4935 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Vec128<T, N> v) {
4936   return UpperHalf(Half<Simd<T, N>>(), v);
4937 }
4939 template <int kBytes, typename T, size_t N>
4940 HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
4941   return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
4942 }
4944 template <int kLanes, typename T, size_t N>
4945 HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
4946   return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
4947 }
4949 template <size_t kBytes, typename T, size_t N>
4950 HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
4951   return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
4952 }
4954 template <typename T, size_t N>
4955 HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
4956   return InterleaveUpper(Simd<T, N>(), a, b);
4957 }
4959 template <typename T, size_t N, class D = Simd<T, N>>
4960 HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
4961   return ZipUpper(RepartitionToWide<D>(), a, b);
4962 }
4964 template <typename T, size_t N2>
4965 HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
4966   return Combine(Simd<T, N2 * 2>(), hi2, lo2);
4967 }
4969 template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
4970 HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
4971   return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
4972 }
4974 template <typename T, size_t N>
4975 HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
4976   return ConcatLowerLower(Simd<T, N>(), hi, lo);
4977 }
4979 template <typename T, size_t N>
4980 HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
4981   return ConcatUpperUpper(Simd<T, N>(), hi, lo);
4982 }
4984 template <typename T, size_t N>
4985 HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
4986                                       const Vec128<T, N> lo) {
4987   return ConcatLowerUpper(Simd<T, N>(), hi, lo);
4988 }
4990 template <typename T, size_t N>
4991 HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
4992   return ConcatUpperLower(Simd<T, N>(), hi, lo);
4993 }
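// Clean up the helper macros defined at the top of this file so they do not
// leak into code that includes this header.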
5113 #undef HWY_NEON_BUILD_ARG_1
5114 #undef HWY_NEON_BUILD_ARG_2
5115 #undef HWY_NEON_BUILD_ARG_3
5116 #undef HWY_NEON_BUILD_PARAM_1
5117 #undef HWY_NEON_BUILD_PARAM_2
5118 #undef HWY_NEON_BUILD_PARAM_3
5119 #undef HWY_NEON_BUILD_RET_1
5120 #undef HWY_NEON_BUILD_RET_2
5121 #undef HWY_NEON_BUILD_RET_3
5122 #undef HWY_NEON_BUILD_TPL_1
5123 #undef HWY_NEON_BUILD_TPL_2
5124 #undef HWY_NEON_BUILD_TPL_3
5125 #undef HWY_NEON_DEF_FUNCTION
5126 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
5127 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
5128 #undef HWY_NEON_DEF_FUNCTION_INT_8
5129 #undef HWY_NEON_DEF_FUNCTION_INT_16
5130 #undef HWY_NEON_DEF_FUNCTION_INT_32
5131 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
5132 #undef HWY_NEON_DEF_FUNCTION_INTS
5133 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
5134 #undef HWY_NEON_DEF_FUNCTION_TPL
5135 #undef HWY_NEON_DEF_FUNCTION_UINT_8
5136 #undef HWY_NEON_DEF_FUNCTION_UINT_16
5137 #undef HWY_NEON_DEF_FUNCTION_UINT_32
5138 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
5139 #undef HWY_NEON_DEF_FUNCTION_UINTS
5140 #undef HWY_NEON_EVAL