arm_neon-inl.h
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // 128-bit ARM64 NEON vectors and operations.
16 // External include guard in highway.h - see comment there.
17 
18 #include <arm_neon.h>
19 #include <stddef.h>
20 #include <stdint.h>
21 
22 #include "hwy/base.h"
23 #include "hwy/ops/shared-inl.h"
24 
26 namespace hwy {
27 namespace HWY_NAMESPACE {
28 
29 template <typename T>
30 using Full128 = Simd<T, 16 / sizeof(T)>;
31 
32 namespace detail { // for code folding and Raw128
33 
34 // Macros used to define single and double function calls for multiple types
35 // for full and half vectors. These macros are undefined at the end of the file.
36 
37 // HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
38 #define HWY_NEON_BUILD_TPL_1
39 #define HWY_NEON_BUILD_TPL_2
40 #define HWY_NEON_BUILD_TPL_3
41 
42 // HWY_NEON_BUILD_RET_* is return type.
43 #define HWY_NEON_BUILD_RET_1(type, size) Vec128<type, size>
44 #define HWY_NEON_BUILD_RET_2(type, size) Vec128<type, size>
45 #define HWY_NEON_BUILD_RET_3(type, size) Vec128<type, size>
46 
47 // HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
48 #define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type, size> a
49 #define HWY_NEON_BUILD_PARAM_2(type, size) \
50  const Vec128<type, size> a, const Vec128<type, size> b
51 #define HWY_NEON_BUILD_PARAM_3(type, size) \
52  const Vec128<type, size> a, const Vec128<type, size> b, \
53  const Vec128<type, size> c
54 
55 // HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
56 // function.
57 #define HWY_NEON_BUILD_ARG_1 a.raw
58 #define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
59 #define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
60 
61 // We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
62 // the __VA_ARGS__ have been expanded. This allows "func" to itself be a
63 // macro, as is the case for some of the library "functions" such as
64 // vshlq_u8. For example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS
65 // is defined as "a, b" (without the quotes), expands to "vshlq_u8(a, b)".
66 // Directly writing vshlq_u8(MY_PARAMS) would fail because the vshlq_u8()
67 // macro expects two arguments.
68 #define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
69 
70 // Main macro definition that defines a single function for the given type and
71 // size of vector, using the underlying (prefix##infix##suffix) function and
72 // the template, return type, parameters and arguments defined by the "args"
73 // parameters passed here (see HWY_NEON_BUILD_* macros defined before).
74 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
75  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
76  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
77  name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
78  return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
79  HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
80  }
81 
82 // The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
83 // called "name" using the set of neon functions starting with the given
84 // "prefix" for all the variants of certain types, as specified next to each
85 // macro. For example, the prefix "vsub" can be used to define the operator-
86 // using args=2.
87 
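// For illustration (a sketch of the generated code, not an additional
// definition): HWY_NEON_DEF_FUNCTION(uint8_t, 16, operator-, vsubq, _, u8, 2)
// expands to roughly
//   HWY_API Vec128<uint8_t, 16> operator-(const Vec128<uint8_t, 16> a,
//                                         const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vsubq_u8(a.raw, b.raw));
//   }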
88 // uint8_t
89 #define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
90  HWY_NEON_DEF_FUNCTION(uint8_t, 16, name, prefix##q, infix, u8, args) \
91  HWY_NEON_DEF_FUNCTION(uint8_t, 8, name, prefix, infix, u8, args) \
92  HWY_NEON_DEF_FUNCTION(uint8_t, 4, name, prefix, infix, u8, args) \
93  HWY_NEON_DEF_FUNCTION(uint8_t, 2, name, prefix, infix, u8, args) \
94  HWY_NEON_DEF_FUNCTION(uint8_t, 1, name, prefix, infix, u8, args)
95 
96 // int8_t
97 #define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
98  HWY_NEON_DEF_FUNCTION(int8_t, 16, name, prefix##q, infix, s8, args) \
99  HWY_NEON_DEF_FUNCTION(int8_t, 8, name, prefix, infix, s8, args) \
100  HWY_NEON_DEF_FUNCTION(int8_t, 4, name, prefix, infix, s8, args) \
101  HWY_NEON_DEF_FUNCTION(int8_t, 2, name, prefix, infix, s8, args) \
102  HWY_NEON_DEF_FUNCTION(int8_t, 1, name, prefix, infix, s8, args)
103 
104 // uint16_t
105 #define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
106  HWY_NEON_DEF_FUNCTION(uint16_t, 8, name, prefix##q, infix, u16, args) \
107  HWY_NEON_DEF_FUNCTION(uint16_t, 4, name, prefix, infix, u16, args) \
108  HWY_NEON_DEF_FUNCTION(uint16_t, 2, name, prefix, infix, u16, args) \
109  HWY_NEON_DEF_FUNCTION(uint16_t, 1, name, prefix, infix, u16, args)
110 
111 // int16_t
112 #define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
113  HWY_NEON_DEF_FUNCTION(int16_t, 8, name, prefix##q, infix, s16, args) \
114  HWY_NEON_DEF_FUNCTION(int16_t, 4, name, prefix, infix, s16, args) \
115  HWY_NEON_DEF_FUNCTION(int16_t, 2, name, prefix, infix, s16, args) \
116  HWY_NEON_DEF_FUNCTION(int16_t, 1, name, prefix, infix, s16, args)
117 
118 // uint32_t
119 #define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
120  HWY_NEON_DEF_FUNCTION(uint32_t, 4, name, prefix##q, infix, u32, args) \
121  HWY_NEON_DEF_FUNCTION(uint32_t, 2, name, prefix, infix, u32, args) \
122  HWY_NEON_DEF_FUNCTION(uint32_t, 1, name, prefix, infix, u32, args)
123 
124 // int32_t
125 #define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
126  HWY_NEON_DEF_FUNCTION(int32_t, 4, name, prefix##q, infix, s32, args) \
127  HWY_NEON_DEF_FUNCTION(int32_t, 2, name, prefix, infix, s32, args) \
128  HWY_NEON_DEF_FUNCTION(int32_t, 1, name, prefix, infix, s32, args)
129 
130 // uint64_t
131 #define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
132  HWY_NEON_DEF_FUNCTION(uint64_t, 2, name, prefix##q, infix, u64, args) \
133  HWY_NEON_DEF_FUNCTION(uint64_t, 1, name, prefix, infix, u64, args)
134 
135 // int64_t
136 #define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
137  HWY_NEON_DEF_FUNCTION(int64_t, 2, name, prefix##q, infix, s64, args) \
138  HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args)
139 
140 // float and double
141 #if HWY_ARCH_ARM_A64
142 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
143  HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
144  HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \
145  HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args) \
146  HWY_NEON_DEF_FUNCTION(double, 2, name, prefix##q, infix, f64, args) \
147  HWY_NEON_DEF_FUNCTION(double, 1, name, prefix, infix, f64, args)
148 #else
149 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
150  HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
151  HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \
152  HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args)
153 #endif
154 
155 // Helper macros to define for more than one type.
156 // uint8_t, uint16_t and uint32_t
157 #define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
158  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
159  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
160  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
161 
162 // int8_t, int16_t and int32_t
163 #define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
164  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
165  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
166  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
167 
168 // uint8_t, uint16_t, uint32_t and uint64_t
169 #define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
170  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
171  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
172 
173 // int8_t, int16_t, int32_t and int64_t
174 #define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
175  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
176  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
177 
178 // All int*_t and uint*_t up to 64
179 #define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
180  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
181  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
182 
183 // All previous types.
184 #define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
185  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
186  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
187 
188 // Emulation of some intrinsics on armv7.
189 #if HWY_ARCH_ARM_V7
190 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
191 #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
192 #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
193 #define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
194 #define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
195 #define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
196 #define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
197 #define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
198 #define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
199 #define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
200 #define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
201 #define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
202 #define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
203 #define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
204 #define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
205 #define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
206 #define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
207 #define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
208 #define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
209 #define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
210 #define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
211 #define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
212 #define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
213 #define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
214 #define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
215 #define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
216 #define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
217 #define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
218 #define vzip1_s8(x, y) vzip_s8(x, y).val[0]
219 #define vzip1_u8(x, y) vzip_u8(x, y).val[0]
220 #define vzip1_s16(x, y) vzip_s16(x, y).val[0]
221 #define vzip1_u16(x, y) vzip_u16(x, y).val[0]
222 #define vzip1_f32(x, y) vzip_f32(x, y).val[0]
223 #define vzip1_u32(x, y) vzip_u32(x, y).val[0]
224 #define vzip1_s32(x, y) vzip_s32(x, y).val[0]
225 #define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
226 #define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
227 #define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
228 #define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
229 #define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
230 #define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
231 #define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
232 #define vzip2_s8(x, y) vzip_s8(x, y).val[1]
233 #define vzip2_u8(x, y) vzip_u8(x, y).val[1]
234 #define vzip2_s16(x, y) vzip_s16(x, y).val[1]
235 #define vzip2_u16(x, y) vzip_u16(x, y).val[1]
236 #define vzip2_s32(x, y) vzip_s32(x, y).val[1]
237 #define vzip2_u32(x, y) vzip_u32(x, y).val[1]
238 #define vzip2_f32(x, y) vzip_f32(x, y).val[1]
239 #define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
240 #define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
241 #define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
242 #define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
243 #define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
244 #define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
245 #define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
246 #endif
247 
248 template <typename T, size_t N>
249 struct Raw128;
250 
251 // 128
252 template <>
253 struct Raw128<uint8_t, 16> {
254  using type = uint8x16_t;
255 };
256 
257 template <>
258 struct Raw128<uint16_t, 8> {
259  using type = uint16x8_t;
260 };
261 
262 template <>
263 struct Raw128<uint32_t, 4> {
264  using type = uint32x4_t;
265 };
266 
267 template <>
268 struct Raw128<uint64_t, 2> {
269  using type = uint64x2_t;
270 };
271 
272 template <>
273 struct Raw128<int8_t, 16> {
274  using type = int8x16_t;
275 };
276 
277 template <>
278 struct Raw128<int16_t, 8> {
279  using type = int16x8_t;
280 };
281 
282 template <>
283 struct Raw128<int32_t, 4> {
284  using type = int32x4_t;
285 };
286 
287 template <>
288 struct Raw128<int64_t, 2> {
289  using type = int64x2_t;
290 };
291 
292 template <>
293 struct Raw128<float16_t, 8> {
294  using type = uint16x8_t;
295 };
296 
297 template <>
298 struct Raw128<bfloat16_t, 8> {
299  using type = uint16x8_t;
300 };
301 
302 template <>
303 struct Raw128<float, 4> {
304  using type = float32x4_t;
305 };
306 
307 #if HWY_ARCH_ARM_A64
308 template <>
309 struct Raw128<double, 2> {
310  using type = float64x2_t;
311 };
312 #endif
313 
314 // 64
315 template <>
316 struct Raw128<uint8_t, 8> {
317  using type = uint8x8_t;
318 };
319 
320 template <>
321 struct Raw128<uint16_t, 4> {
322  using type = uint16x4_t;
323 };
324 
325 template <>
326 struct Raw128<uint32_t, 2> {
327  using type = uint32x2_t;
328 };
329 
330 template <>
331 struct Raw128<uint64_t, 1> {
332  using type = uint64x1_t;
333 };
334 
335 template <>
336 struct Raw128<int8_t, 8> {
337  using type = int8x8_t;
338 };
339 
340 template <>
341 struct Raw128<int16_t, 4> {
342  using type = int16x4_t;
343 };
344 
345 template <>
346 struct Raw128<int32_t, 2> {
347  using type = int32x2_t;
348 };
349 
350 template <>
351 struct Raw128<int64_t, 1> {
352  using type = int64x1_t;
353 };
354 
355 template <>
356 struct Raw128<float16_t, 4> {
357  using type = uint16x4_t;
358 };
359 
360 template <>
361 struct Raw128<bfloat16_t, 4> {
362  using type = uint16x4_t;
363 };
364 
365 template <>
366 struct Raw128<float, 2> {
367  using type = float32x2_t;
368 };
369 
370 #if HWY_ARCH_ARM_A64
371 template <>
372 struct Raw128<double, 1> {
373  using type = float64x1_t;
374 };
375 #endif
376 
377 // 32 (same as 64)
378 template <>
379 struct Raw128<uint8_t, 4> {
380  using type = uint8x8_t;
381 };
382 
383 template <>
384 struct Raw128<uint16_t, 2> {
385  using type = uint16x4_t;
386 };
387 
388 template <>
389 struct Raw128<uint32_t, 1> {
390  using type = uint32x2_t;
391 };
392 
393 template <>
394 struct Raw128<int8_t, 4> {
395  using type = int8x8_t;
396 };
397 
398 template <>
399 struct Raw128<int16_t, 2> {
400  using type = int16x4_t;
401 };
402 
403 template <>
404 struct Raw128<int32_t, 1> {
405  using type = int32x2_t;
406 };
407 
408 template <>
409 struct Raw128<float16_t, 2> {
410  using type = uint16x4_t;
411 };
412 
413 template <>
414 struct Raw128<bfloat16_t, 2> {
415  using type = uint16x4_t;
416 };
417 
418 template <>
419 struct Raw128<float, 1> {
420  using type = float32x2_t;
421 };
422 
423 // 16 (same as 64)
424 template <>
425 struct Raw128<uint8_t, 2> {
426  using type = uint8x8_t;
427 };
428 
429 template <>
430 struct Raw128<uint16_t, 1> {
431  using type = uint16x4_t;
432 };
433 
434 template <>
435 struct Raw128<int8_t, 2> {
436  using type = int8x8_t;
437 };
438 
439 template <>
440 struct Raw128<int16_t, 1> {
441  using type = int16x4_t;
442 };
443 
444 template <>
445 struct Raw128<float16_t, 1> {
446  using type = uint16x4_t;
447 };
448 
449 template <>
450 struct Raw128<bfloat16_t, 1> {
451  using type = uint16x4_t;
452 };
453 
454 // 8 (same as 64)
455 template <>
456 struct Raw128<uint8_t, 1> {
457  using type = uint8x8_t;
458 };
459 
460 template <>
461 struct Raw128<int8_t, 1> {
462  using type = int8x8_t;
463 };
464 
465 } // namespace detail
466 
467 template <typename T, size_t N = 16 / sizeof(T)>
468 class Vec128 {
469  using Raw = typename detail::Raw128<T, N>::type;
470 
471  public:
472  HWY_INLINE Vec128() {}
473  Vec128(const Vec128&) = default;
474  Vec128& operator=(const Vec128&) = default;
475  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
476 
477  // Compound assignment. Only usable if there is a corresponding non-member
478  // binary operator overload. For example, only f32 and f64 support division.
479  HWY_INLINE Vec128& operator*=(const Vec128 other) {
480  return *this = (*this * other);
481  }
482  HWY_INLINE Vec128& operator/=(const Vec128 other) {
483  return *this = (*this / other);
484  }
485  HWY_INLINE Vec128& operator+=(const Vec128 other) {
486  return *this = (*this + other);
487  }
488  HWY_INLINE Vec128& operator-=(const Vec128 other) {
489  return *this = (*this - other);
490  }
491  HWY_INLINE Vec128& operator&=(const Vec128 other) {
492  return *this = (*this & other);
493  }
494  HWY_INLINE Vec128& operator|=(const Vec128 other) {
495  return *this = (*this | other);
496  }
497  HWY_INLINE Vec128& operator^=(const Vec128 other) {
498  return *this = (*this ^ other);
499  }
500 
501  Raw raw;
502 };
503 
504 // FF..FF or 0.
505 template <typename T, size_t N = 16 / sizeof(T)>
506 class Mask128 {
507  // ARM C Language Extensions return and expect unsigned type.
508  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
509 
510  public:
511  HWY_INLINE Mask128() {}
512  Mask128(const Mask128&) = default;
513  Mask128& operator=(const Mask128&) = default;
514  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
515 
516  Raw raw;
517 };
518 
519 namespace detail {
520 
521 // Deduce Simd<T, N> from Vec128<T, N>
522 struct DeduceD {
523  template <typename T, size_t N>
524  Simd<T, N> operator()(Vec128<T, N>) const {
525  return Simd<T, N>();
526  }
527 };
528 
529 } // namespace detail
530 
531 template <class V>
532 using DFromV = decltype(detail::DeduceD()(V()));
533 
534 template <class V>
535 using TFromV = TFromD<DFromV<V>>;
536 
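// Example (sketch; AddOne is a hypothetical helper, not part of this header).
// It shows how DFromV/TFromV recover the descriptor and lane type from a
// vector type, and how the compound-assignment operators above forward to the
// non-member operators. Set and operator+ are defined further down in this
// header; the template is only instantiated after the whole header is seen.
template <class V>
HWY_INLINE V AddOne(V v) {
  const DFromV<V> d;           // Simd<T, N> deduced from V
  v += Set(d, TFromV<V>{1});   // operator+= -> operator+ -> vadd*
  return v;
}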
537 // ------------------------------ BitCast
538 
539 namespace detail {
540 
541 // Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
542 // vreinterpret*_u8_*() set of functions.
543 #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
544 #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
545  Vec128<uint8_t, size * sizeof(type)>
546 #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type, size> v
547 #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
548 
549 // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
550 template <size_t N>
552  return v;
553 }
554 
556  HWY_CAST_TO_U8)
557 HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
558 HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
559 HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
560 HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
561 
562 // Special cases for [b]float16_t, which have the same Raw as uint16_t.
563 template <size_t N>
564 HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
565  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
566 }
567 template <size_t N>
568 HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
569  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
570 }
571 
572 #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
573 #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
574 #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
575 #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
576 
577 template <size_t N>
578 HWY_INLINE Vec128<uint8_t, N> BitCastFromByte(Simd<uint8_t, N> /* tag */,
579  Vec128<uint8_t, N> v) {
580  return v;
581 }
582 
583 // 64-bit or less:
584 
585 template <size_t N, HWY_IF_LE64(int8_t, N)>
587  Vec128<uint8_t, N> v) {
588  return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
589 }
590 template <size_t N, HWY_IF_LE64(uint16_t, N)>
593  return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
594 }
595 template <size_t N, HWY_IF_LE64(int16_t, N)>
598  return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
599 }
600 template <size_t N, HWY_IF_LE64(uint32_t, N)>
603  return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
604 }
605 template <size_t N, HWY_IF_LE64(int32_t, N)>
608  return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
609 }
610 template <size_t N, HWY_IF_LE64(float, N)>
613  return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
614 }
617  return Vec128<uint64_t, 1>(vreinterpret_u64_u8(v.raw));
618 }
621  return Vec128<int64_t, 1>(vreinterpret_s64_u8(v.raw));
622 }
623 #if HWY_ARCH_ARM_A64
626  return Vec128<double, 1>(vreinterpret_f64_u8(v.raw));
627 }
628 #endif
629 
630 // 128-bit full:
631 
633  Vec128<uint8_t> v) {
634  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
635 }
637  Vec128<uint8_t> v) {
638  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
639 }
641  Vec128<uint8_t> v) {
642  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
643 }
645  Vec128<uint8_t> v) {
646  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
647 }
649  Vec128<uint8_t> v) {
650  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
651 }
653  Vec128<uint8_t> v) {
654  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
655 }
657  Vec128<uint8_t> v) {
658  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
659 }
661  Vec128<uint8_t> v) {
662  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
663 }
664 
665 #if HWY_ARCH_ARM_A64
667  Vec128<uint8_t> v) {
668  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
669 }
670 #endif
671 
672 // Special cases for [b]float16_t, which have the same Raw as uint16_t.
673 template <size_t N>
674 HWY_INLINE Vec128<float16_t, N> BitCastFromByte(Simd<float16_t, N> /* tag */,
675  Vec128<uint8_t, N * 2> v) {
676  return Vec128<float16_t, N>(BitCastFromByte(Simd<uint16_t, N>(), v).raw);
677 }
678 template <size_t N>
679 HWY_INLINE Vec128<bfloat16_t, N> BitCastFromByte(Simd<bfloat16_t, N> /* tag */,
680  Vec128<uint8_t, N * 2> v) {
681  return Vec128<bfloat16_t, N>(BitCastFromByte(Simd<uint16_t, N>(), v).raw);
682 }
683 
684 } // namespace detail
685 
686 template <typename T, size_t N, typename FromT>
687 HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
688  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
689  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
690 }
691 
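// Example (sketch; F32BitsToU32 is a hypothetical helper): BitCast
// reinterprets the same 128 bits under a different lane type, with no value
// conversion.
HWY_INLINE Vec128<uint32_t, 4> F32BitsToU32(Vec128<float, 4> v) {
  const Simd<uint32_t, 4> du;
  return BitCast(du, v);  // e.g. 1.0f lanes become 0x3F800000
}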
692 // ------------------------------ Set
693 
694 // Returns a vector with all lanes set to "t".
695 #define HWY_NEON_BUILD_TPL_HWY_SET1
696 #define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type, size>
697 #define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
698  Simd<type, size> /* tag */, const type t
699 #define HWY_NEON_BUILD_ARG_HWY_SET1 t
700 
701 HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)
702 
703 #undef HWY_NEON_BUILD_TPL_HWY_SET1
704 #undef HWY_NEON_BUILD_RET_HWY_SET1
705 #undef HWY_NEON_BUILD_PARAM_HWY_SET1
706 #undef HWY_NEON_BUILD_ARG_HWY_SET1
707 
708 // Returns an all-zero vector.
709 template <typename T, size_t N>
710 HWY_API Vec128<T, N> Zero(Simd<T, N> d) {
711  return Set(d, 0);
712 }
713 
714 template <size_t N>
715 HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N> /* tag */) {
716  return Vec128<bfloat16_t, N>(Zero(Simd<uint16_t, N>()).raw);
717 }
718 
719 template <class D>
720 using VFromD = decltype(Zero(D()));
721 
722 // Returns a vector with uninitialized elements.
723 template <typename T, size_t N>
724 HWY_API Vec128<T, N> Undefined(Simd<T, N> /*d*/) {
725  HWY_DIAGNOSTICS(push)
726  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
727  typename detail::Raw128<T, N>::type a;
728  return Vec128<T, N>(a);
729  HWY_DIAGNOSTICS(pop)
730 }
731 
732 // Returns a vector with lane i=[0, N) set to "first" + i.
733 template <typename T, size_t N, typename T2>
734 Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
735  HWY_ALIGN T lanes[16 / sizeof(T)];
736  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
737  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
738  }
739  return Load(d, lanes);
740 }
741 
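// Example (sketch; hypothetical values): Set broadcasts one value, Zero is
// Set(d, 0), and Iota fills lanes with first, first+1, ...
//   const Simd<int32_t, 4> d;
//   const auto k5 = Set(d, 5);      // {5, 5, 5, 5}
//   const auto zero = Zero(d);      // {0, 0, 0, 0}
//   const auto ramp = Iota(d, 10);  // {10, 11, 12, 13}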
742 // ------------------------------ GetLane
743 
745  return vgetq_lane_u8(v.raw, 0);
746 }
747 template <size_t N>
749  return vget_lane_u8(v.raw, 0);
750 }
751 
753  return vgetq_lane_s8(v.raw, 0);
754 }
755 template <size_t N>
757  return vget_lane_s8(v.raw, 0);
758 }
759 
761  return vgetq_lane_u16(v.raw, 0);
762 }
763 template <size_t N>
765  return vget_lane_u16(v.raw, 0);
766 }
767 
769  return vgetq_lane_s16(v.raw, 0);
770 }
771 template <size_t N>
773  return vget_lane_s16(v.raw, 0);
774 }
775 
777  return vgetq_lane_u32(v.raw, 0);
778 }
779 template <size_t N>
781  return vget_lane_u32(v.raw, 0);
782 }
783 
785  return vgetq_lane_s32(v.raw, 0);
786 }
787 template <size_t N>
789  return vget_lane_s32(v.raw, 0);
790 }
791 
793  return vgetq_lane_u64(v.raw, 0);
794 }
796  return vget_lane_u64(v.raw, 0);
797 }
799  return vgetq_lane_s64(v.raw, 0);
800 }
802  return vget_lane_s64(v.raw, 0);
803 }
804 
806  return vgetq_lane_f32(v.raw, 0);
807 }
809  return vget_lane_f32(v.raw, 0);
810 }
812  return vget_lane_f32(v.raw, 0);
813 }
814 #if HWY_ARCH_ARM_A64
815 HWY_API double GetLane(const Vec128<double, 2> v) {
816  return vgetq_lane_f64(v.raw, 0);
817 }
818 HWY_API double GetLane(const Vec128<double, 1> v) {
819  return vget_lane_f64(v.raw, 0);
820 }
821 #endif
822 
823 // ================================================== ARITHMETIC
824 
825 // ------------------------------ Addition
826 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
827 
828 // ------------------------------ Subtraction
829 HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
830 
831 // ------------------------------ Saturating addition and subtraction
832 // Only defined for uint8_t, uint16_t and their signed versions, as in other
833 // architectures.
834 
835 // Returns a + b clamped to the destination range.
836 HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2)
837 HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2)
838 HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2)
839 HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2)
840 
841 // Returns a - b clamped to the destination range.
842 HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2)
843 HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2)
844 HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2)
845 HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2)
846 
847 // Not part of API, used in implementation.
848 namespace detail {
849 HWY_NEON_DEF_FUNCTION_UINT_32(SaturatedSub, vqsub, _, 2)
850 HWY_NEON_DEF_FUNCTION_UINT_64(SaturatedSub, vqsub, _, 2)
851 HWY_NEON_DEF_FUNCTION_INT_32(SaturatedSub, vqsub, _, 2)
852 HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSub, vqsub, _, 2)
853 } // namespace detail
854 
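// Example (sketch; hypothetical values): with uint8_t lanes,
// SaturatedAdd(Set(d, 250), Set(d, 10)) yields 255 in every lane instead of
// wrapping to 4, and SaturatedSub(Set(d, 3), Set(d, 10)) yields 0.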
855 // ------------------------------ Average
856 
857 // Returns (a + b + 1) / 2
858 HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
859 HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
860 
861 // ------------------------------ Neg
862 
863 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
864 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
865 
866 HWY_API Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) {
867 #if HWY_ARCH_ARM_A64
868  return Vec128<int64_t, 1>(vneg_s64(v.raw));
869 #else
870  return Zero(Simd<int64_t, 1>()) - v;
871 #endif
872 }
873 
874 HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
875 #if HWY_ARCH_ARM_A64
876  return Vec128<int64_t>(vnegq_s64(v.raw));
877 #else
878  return Zero(Full128<int64_t>()) - v;
879 #endif
880 }
881 
882 // ------------------------------ ShiftLeft
883 
884 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
885 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
886 #undef HWY_NEON_DEF_FUNCTION
887 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
888  template <int kBits> \
889  HWY_API Vec128<type, size> name(const Vec128<type, size> v) { \
890  return kBits == 0 ? v \
891  : Vec128<type, size>(HWY_NEON_EVAL( \
892  prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
893  }
894 
895 HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, HWY_SHIFT)
896 
897 HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, HWY_SHIFT)
898 HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, HWY_SHIFT)
899 
900 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
901 
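// Example (sketch): the shift amount is a compile-time template argument, so
// ShiftLeft<3>(Set(d, 1)) yields 8 in every lane, and ShiftRight<1>(Set(di, -4))
// is an arithmetic shift for signed lane types, yielding -2.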
902 // ------------------------------ Shl
903 
905  const Vec128<uint8_t> bits) {
906  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
907 }
908 template <size_t N, HWY_IF_LE64(uint8_t, N)>
910  const Vec128<uint8_t, N> bits) {
911  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
912 }
913 
915  const Vec128<uint16_t> bits) {
916  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
917 }
918 template <size_t N, HWY_IF_LE64(uint16_t, N)>
920  const Vec128<uint16_t, N> bits) {
921  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
922 }
923 
925  const Vec128<uint32_t> bits) {
926  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
927 }
928 template <size_t N, HWY_IF_LE64(uint32_t, N)>
930  const Vec128<uint32_t, N> bits) {
931  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
932 }
933 
935  const Vec128<uint64_t> bits) {
936  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
937 }
939  const Vec128<uint64_t, 1> bits) {
940  return Vec128<uint64_t, 1>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
941 }
942 
944  const Vec128<int8_t> bits) {
945  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
946 }
947 template <size_t N, HWY_IF_LE64(int8_t, N)>
949  const Vec128<int8_t, N> bits) {
950  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
951 }
952 
954  const Vec128<int16_t> bits) {
955  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
956 }
957 template <size_t N, HWY_IF_LE64(int16_t, N)>
959  const Vec128<int16_t, N> bits) {
960  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
961 }
962 
964  const Vec128<int32_t> bits) {
965  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
966 }
967 template <size_t N, HWY_IF_LE64(int32_t, N)>
969  const Vec128<int32_t, N> bits) {
970  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
971 }
972 
974  const Vec128<int64_t> bits) {
975  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
976 }
978  const Vec128<int64_t, 1> bits) {
979  return Vec128<int64_t, 1>(vshl_s64(v.raw, bits.raw));
980 }
981 
982 // ------------------------------ Shr (Neg)
983 
985  const Vec128<uint8_t> bits) {
986  const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
987  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
988 }
989 template <size_t N, HWY_IF_LE64(uint8_t, N)>
991  const Vec128<uint8_t, N> bits) {
992  const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N>(), bits)).raw;
993  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
994 }
995 
997  const Vec128<uint16_t> bits) {
998  const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
999  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
1000 }
1001 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1003  const Vec128<uint16_t, N> bits) {
1004  const int16x4_t neg_bits = Neg(BitCast(Simd<int16_t, N>(), bits)).raw;
1005  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
1006 }
1007 
1009  const Vec128<uint32_t> bits) {
1010  const int32x4_t neg_bits = Neg(BitCast(Full128<int32_t>(), bits)).raw;
1011  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
1012 }
1013 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1015  const Vec128<uint32_t, N> bits) {
1016  const int32x2_t neg_bits = Neg(BitCast(Simd<int32_t, N>(), bits)).raw;
1017  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
1018 }
1019 
1021  const Vec128<uint64_t> bits) {
1022  const int64x2_t neg_bits = Neg(BitCast(Full128<int64_t>(), bits)).raw;
1023  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
1024 }
1026  const Vec128<uint64_t, 1> bits) {
1027  const int64x1_t neg_bits = Neg(BitCast(Simd<int64_t, 1>(), bits)).raw;
1028  return Vec128<uint64_t, 1>(vshl_u64(v.raw, neg_bits));
1029 }
1030 
1032  const Vec128<int8_t> bits) {
1033  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
1034 }
1035 template <size_t N, HWY_IF_LE64(int8_t, N)>
1037  const Vec128<int8_t, N> bits) {
1038  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
1039 }
1040 
1042  const Vec128<int16_t> bits) {
1043  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
1044 }
1045 template <size_t N, HWY_IF_LE64(int16_t, N)>
1047  const Vec128<int16_t, N> bits) {
1048  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
1049 }
1050 
1052  const Vec128<int32_t> bits) {
1053  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
1054 }
1055 template <size_t N, HWY_IF_LE64(int32_t, N)>
1057  const Vec128<int32_t, N> bits) {
1058  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
1059 }
1060 
1062  const Vec128<int64_t> bits) {
1063  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
1064 }
1066  const Vec128<int64_t, 1> bits) {
1067  return Vec128<int64_t, 1>(vshl_s64(v.raw, Neg(bits).raw));
1068 }
1069 
1070 // ------------------------------ ShiftLeftSame (Shl)
1071 
1072 template <typename T, size_t N>
1074  return v << Set(Simd<T, N>(), static_cast<T>(bits));
1075 }
1076 template <typename T, size_t N>
1078  return v >> Set(Simd<T, N>(), static_cast<T>(bits));
1079 }
1080 
1081 // ------------------------------ Integer multiplication
1082 
1083 // Unsigned
1085  const Vec128<uint16_t> b) {
1086  return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
1087 }
1089  const Vec128<uint32_t> b) {
1090  return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
1091 }
1092 
1093 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1095  const Vec128<uint16_t, N> b) {
1096  return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
1097 }
1098 template <size_t N, HWY_IF_LE64(uint32_t, N)>
1100  const Vec128<uint32_t, N> b) {
1101  return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
1102 }
1103 
1104 // Signed
1106  const Vec128<int16_t> b) {
1107  return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
1108 }
1110  const Vec128<int32_t> b) {
1111  return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
1112 }
1113 
1114 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1116  const Vec128<int16_t, N> b) {
1117  return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
1118 }
1119 template <size_t N, HWY_IF_LE64(int32_t, N)>
1121  const Vec128<int32_t, N> b) {
1122  return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
1123 }
1124 
1125 // Returns the upper 16 bits of a * b in each lane.
1127  const Vec128<int16_t> b) {
1128  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
1129 #if HWY_ARCH_ARM_A64
1130  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
1131 #else
1132  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
1133 #endif
1134  return Vec128<int16_t>(
1135  vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1136 }
1138  const Vec128<uint16_t> b) {
1139  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
1140 #if HWY_ARCH_ARM_A64
1141  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
1142 #else
1143  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
1144 #endif
1145  return Vec128<uint16_t>(
1146  vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1147 }
1148 
1149 template <size_t N, HWY_IF_LE64(int16_t, N)>
1151  const Vec128<int16_t, N> b) {
1152  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
1153  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
1154 }
1155 template <size_t N, HWY_IF_LE64(uint16_t, N)>
1157  const Vec128<uint16_t, N> b) {
1158  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
1159  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
1160 }
1161 
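// Example (sketch; hypothetical Q15 fixed-point use): MulHigh returns the
// upper 16 bits of the 32-bit product, i.e. (a * b) >> 16 per lane, so
// MulHigh(Set(d, int16_t{16384}), Set(d, int16_t{16384})) yields 4096.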
1162 // ------------------------------ Floating-point mul / div
1163 
1164 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
1165 
1166 // Approximate reciprocal
1167 HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v) {
1168  return Vec128<float>(vrecpeq_f32(v.raw));
1169 }
1170 template <size_t N>
1172  return Vec128<float, N>(vrecpe_f32(v.raw));
1173 }
1174 
1175 #if HWY_ARCH_ARM_A64
1176 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
1177 #else
1178 // Not defined on armv7: approximate
1179 namespace detail {
1180 
1181 HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
1182  const Vec128<float> recip, const Vec128<float> divisor) {
1183  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
1184 }
1185 template <size_t N>
1186 HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
1187  const Vec128<float, N> recip, Vec128<float, N> divisor) {
1188  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
1189 }
1190 
1191 } // namespace detail
1192 
1193 template <size_t N>
1194 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
1195  const Vec128<float, N> b) {
1196  auto x = ApproximateReciprocal(b);
1197  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1198  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1199  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1200  return a * x;
1201 }
1202 #endif
1203 
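// Note (sketch of the math above): vrecps_f32(x, b) computes 2 - x*b, so each
// x *= ReciprocalNewtonRaphsonStep(x, b) performs the Newton-Raphson update
// x_{n+1} = x_n * (2 - b * x_n), roughly doubling the number of correct bits
// of the 1/b estimate per step; a/b is then approximated as a * x.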
1204 // ------------------------------ Absolute value of difference.
1205 
1207  return Vec128<float>(vabdq_f32(a.raw, b.raw));
1208 }
1209 template <size_t N, HWY_IF_LE64(float, N)>
1211  const Vec128<float, N> b) {
1212  return Vec128<float, N>(vabd_f32(a.raw, b.raw));
1213 }
1214 
1215 // ------------------------------ Floating-point multiply-add variants
1216 
1217 // Returns add + mul * x
1218 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1219 template <size_t N, HWY_IF_LE64(float, N)>
1220 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
1221  const Vec128<float, N> x,
1222  const Vec128<float, N> add) {
1223  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
1224 }
1225 HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
1226  const Vec128<float> add) {
1227  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1228 }
1229 #else
1230 // Emulate FMA for floats.
1231 template <size_t N>
1233  const Vec128<float, N> x,
1234  const Vec128<float, N> add) {
1235  return mul * x + add;
1236 }
1237 #endif
1238 
1239 #if HWY_ARCH_ARM_A64
1240 HWY_API Vec128<double, 1> MulAdd(const Vec128<double, 1> mul,
1241  const Vec128<double, 1> x,
1242  const Vec128<double, 1> add) {
1243  return Vec128<double, 1>(vfma_f64(add.raw, mul.raw, x.raw));
1244 }
1245 HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
1246  const Vec128<double> add) {
1247  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1248 }
1249 #endif
1250 
1251 // Returns add - mul * x
1252 #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1253 template <size_t N, HWY_IF_LE64(float, N)>
1254 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
1255  const Vec128<float, N> x,
1256  const Vec128<float, N> add) {
1257  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
1258 }
1259 HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
1260  const Vec128<float> add) {
1261  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1262 }
1263 #else
1264 // Emulate FMA for floats.
1265 template <size_t N>
1267  const Vec128<float, N> x,
1268  const Vec128<float, N> add) {
1269  return add - mul * x;
1270 }
1271 #endif
1272 
1273 #if HWY_ARCH_ARM_A64
1274 HWY_API Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul,
1275  const Vec128<double, 1> x,
1276  const Vec128<double, 1> add) {
1277  return Vec128<double, 1>(vfms_f64(add.raw, mul.raw, x.raw));
1278 }
1279 HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
1280  const Vec128<double> x,
1281  const Vec128<double> add) {
1282  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
1283 }
1284 #endif
1285 
1286 // Returns mul * x - sub
1287 template <size_t N>
1289  const Vec128<float, N> x,
1290  const Vec128<float, N> sub) {
1291  return MulAdd(mul, x, Neg(sub));
1292 }
1293 
1294 // Returns -mul * x - sub
1295 template <size_t N>
1297  const Vec128<float, N> x,
1298  const Vec128<float, N> sub) {
1299  return Neg(MulAdd(mul, x, sub));
1300 }
1301 
1302 #if HWY_ARCH_ARM_A64
1303 template <size_t N>
1304 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
1305  const Vec128<double, N> x,
1306  const Vec128<double, N> sub) {
1307  return MulAdd(mul, x, Neg(sub));
1308 }
1309 template <size_t N>
1310 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
1311  const Vec128<double, N> x,
1312  const Vec128<double, N> sub) {
1313  return Neg(MulAdd(mul, x, sub));
1314 }
1315 #endif
1316 
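// Example (sketch; EvalPoly2 is a hypothetical helper): MulAdd(mul, x, add)
// computes mul * x + add (fused where FMA is available), which is the natural
// building block for Horner-style polynomial evaluation.
template <size_t N>
HWY_INLINE Vec128<float, N> EvalPoly2(Vec128<float, N> x, Vec128<float, N> c0,
                                      Vec128<float, N> c1,
                                      Vec128<float, N> c2) {
  return MulAdd(MulAdd(c2, x, c1), x, c0);  // c2*x^2 + c1*x + c0
}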
1317 // ------------------------------ Floating-point square root (IfThenZeroElse)
1318 
1319 // Approximate reciprocal square root
1321  return Vec128<float>(vrsqrteq_f32(v.raw));
1322 }
1323 template <size_t N>
1325  return Vec128<float, N>(vrsqrte_f32(v.raw));
1326 }
1327 
1328 // Full precision square root
1329 #if HWY_ARCH_ARM_A64
1330 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
1331 #else
1332 namespace detail {
1333 
1334 HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
1335  const Vec128<float> recip) {
1336  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
1337 }
1338 template <size_t N>
1339 HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(Vec128<float, N> root,
1340  Vec128<float, N> recip) {
1341  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
1342 }
1343 
1344 } // namespace detail
1345 
1346 // Not defined on armv7: approximate
1347 template <size_t N>
1348 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
1349  auto recip = ApproximateReciprocalSqrt(v);
1350 
1351  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1352  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1353  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
1354 
1355  const auto root = v * recip;
1356  return IfThenZeroElse(v == Zero(Simd<float, N>()), root);
1357 }
1358 #endif
1359 
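// Note (sketch of the math above): vrsqrts_f32(a, b) computes (3 - a*b) / 2,
// so each recip *= ReciprocalSqrtStep(v * recip, recip) is one Newton-Raphson
// refinement of the 1/sqrt(v) estimate; sqrt(v) is then v * (1/sqrt(v)), with
// IfThenZeroElse guarding the v == 0 case (0 * inf would be NaN).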
1360 // ================================================== LOGICAL
1361 
1362 // ------------------------------ Not
1363 
1364 // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
1365 template <typename T>
1366 HWY_API Vec128<T> Not(const Vec128<T> v) {
1367  const Full128<T> d;
1368  const Repartition<uint8_t, decltype(d)> d8;
1369  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
1370 }
1371 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1372 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
1373  const Simd<T, N> d;
1374  const Repartition<uint8_t, decltype(d)> d8;
1375  using V8 = decltype(Zero(d8));
1376  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
1377 }
1378 
1379 // ------------------------------ And
1380 HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)
1381 
1382 // Uses the u32/64 defined above.
1383 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1384 HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
1385  const Simd<MakeUnsigned<T>, N> d;
1386  return BitCast(Simd<T, N>(), BitCast(d, a) & BitCast(d, b));
1387 }
1388 
1389 // ------------------------------ AndNot
1390 
1391 namespace internal {
1392 // reversed_andnot returns a & ~b.
1393 HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
1394 } // namespace internal
1395 
1396 // Returns ~not_mask & mask.
1397 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1398 HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
1399  const Vec128<T, N> mask) {
1400  return internal::reversed_andnot(mask, not_mask);
1401 }
1402 
1403 // Uses the u32/64 defined above.
1404 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1405 HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
1406  const Vec128<T, N> mask) {
1407  const Simd<MakeUnsigned<T>, N> du;
1408  Vec128<MakeUnsigned<T>, N> ret =
1409  internal::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
1410  return BitCast(Simd<T, N>(), ret);
1411 }
1412 
1413 // ------------------------------ Or
1414 
1415 HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)
1416 
1417 // Uses the u32/64 defined above.
1418 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1419 HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
1420  const Simd<MakeUnsigned<T>, N> d;
1421  return BitCast(Simd<T, N>(), BitCast(d, a) | BitCast(d, b));
1422 }
1423 
1424 // ------------------------------ Xor
1425 
1426 HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)
1427 
1428 // Uses the u32/64 defined above.
1429 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1430 HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
1431  const Simd<MakeUnsigned<T>, N> d;
1432  return BitCast(Simd<T, N>(), BitCast(d, a) ^ BitCast(d, b));
1433 }
1434 
1435 // ------------------------------ Operator overloads (internal-only if float)
1436 
1437 template <typename T, size_t N>
1438 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
1439  return And(a, b);
1440 }
1441 
1442 template <typename T, size_t N>
1443 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
1444  return Or(a, b);
1445 }
1446 
1447 template <typename T, size_t N>
1448 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
1449  return Xor(a, b);
1450 }
1451 
1452 // ------------------------------ PopulationCount
1453 
1454 #ifdef HWY_NATIVE_POPCNT
1455 #undef HWY_NATIVE_POPCNT
1456 #else
1457 #define HWY_NATIVE_POPCNT
1458 #endif
1459 
1460 namespace detail {
1461 
1462 template <typename T>
1464  const Full128<uint8_t> d8;
1465  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
1466 }
1467 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1469  Vec128<T, N> v) {
1470  const Simd<uint8_t, N> d8;
1471  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
1472 }
1473 
1474 // ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
1475 template <typename T>
1477  const Full128<uint8_t> d8;
1478  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
1479  return Vec128<T>(vpaddlq_u8(bytes));
1480 }
1481 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1483  Vec128<T, N> v) {
1485  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
1486  return Vec128<T, N>(vpaddl_u8(bytes));
1487 }
1488 
1489 template <typename T>
1491  const Full128<uint8_t> d8;
1492  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
1493  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
1494 }
1495 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1497  Vec128<T, N> v) {
1499  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
1500  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
1501 }
1502 
1503 template <typename T>
1505  const Full128<uint8_t> d8;
1506  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
1507  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
1508 }
1509 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1511  Vec128<T, N> v) {
1513  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
1514  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
1515 }
1516 
1517 } // namespace detail
1518 
1519 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1520 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
1521  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
1522 }
1523 
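// Example (sketch; hypothetical values): for uint32_t lanes,
// PopulationCount(Set(d, 0xF0F0u)) yields 8 in every lane; lane sizes > 1 are
// handled by summing per-byte counts with vpaddl as shown above.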
1524 // ================================================== SIGN
1525 
1526 // ------------------------------ Abs
1527 
1528 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1530  return Vec128<int8_t>(vabsq_s8(v.raw));
1531 }
1533  return Vec128<int16_t>(vabsq_s16(v.raw));
1534 }
1536  return Vec128<int32_t>(vabsq_s32(v.raw));
1537 }
1538 // i64 is implemented after BroadcastSignBit.
1540  return Vec128<float>(vabsq_f32(v.raw));
1541 }
1542 
1543 template <size_t N, HWY_IF_LE64(int8_t, N)>
1545  return Vec128<int8_t, N>(vabs_s8(v.raw));
1546 }
1547 template <size_t N, HWY_IF_LE64(int16_t, N)>
1549  return Vec128<int16_t, N>(vabs_s16(v.raw));
1550 }
1551 template <size_t N, HWY_IF_LE64(int32_t, N)>
1553  return Vec128<int32_t, N>(vabs_s32(v.raw));
1554 }
1555 template <size_t N, HWY_IF_LE64(float, N)>
1557  return Vec128<float, N>(vabs_f32(v.raw));
1558 }
1559 
1560 #if HWY_ARCH_ARM_A64
1561 HWY_API Vec128<double> Abs(const Vec128<double> v) {
1562  return Vec128<double>(vabsq_f64(v.raw));
1563 }
1564 
1565 HWY_API Vec128<double, 1> Abs(const Vec128<double, 1> v) {
1566  return Vec128<double, 1>(vabs_f64(v.raw));
1567 }
1568 #endif
1569 
1570 // ------------------------------ CopySign
1571 
1572 template <typename T, size_t N>
1574  const Vec128<T, N> sign) {
1575  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1576  const auto msb = SignBit(Simd<T, N>());
1577  return Or(AndNot(msb, magn), And(msb, sign));
1578 }
1579 
1580 template <typename T, size_t N>
1582  const Vec128<T, N> sign) {
1583  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1584  return Or(abs, And(SignBit(Simd<T, N>()), sign));
1585 }
1586 
1587 // ------------------------------ BroadcastSignBit
1588 
1589 template <typename T, size_t N>
1591  return ShiftRight<sizeof(T) * 8 - 1>(v);
1592 }
1593 
1594 // ================================================== MASK
1595 
1596 // ------------------------------ To/from vector
1597 
1598 // Mask and Vec have the same representation (true = FF..FF).
1599 template <typename T, size_t N>
1601  const Simd<MakeUnsigned<T>, N> du;
1602  return Mask128<T, N>(BitCast(du, v).raw);
1603 }
1604 
1605 // DEPRECATED
1606 template <typename T, size_t N>
1608  return BitCast(Simd<T, N>(), Vec128<MakeUnsigned<T>, N>(v.raw));
1609 }
1610 
1611 template <typename T, size_t N>
1613  return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
1614 }
1615 
1616 // ------------------------------ RebindMask
1617 
1618 template <typename TFrom, typename TTo, size_t N>
1620  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1621  return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N>(), m)));
1622 }
1623 
1624 // ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a.
1625 
1626 #define HWY_NEON_BUILD_TPL_HWY_IF
1627 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size>
1628 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
1629  const Mask128<type, size> mask, const Vec128<type, size> yes, \
1630  const Vec128<type, size> no
1631 #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
1632 
1633 HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
1634 
1635 #undef HWY_NEON_BUILD_TPL_HWY_IF
1636 #undef HWY_NEON_BUILD_RET_HWY_IF
1637 #undef HWY_NEON_BUILD_PARAM_HWY_IF
1638 #undef HWY_NEON_BUILD_ARG_HWY_IF
1639 
1640 // mask ? yes : 0
1641 template <typename T, size_t N>
1642 HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
1643  const Vec128<T, N> yes) {
1644  return yes & VecFromMask(Simd<T, N>(), mask);
1645 }
1646 
1647 // mask ? 0 : no
1648 template <typename T, size_t N>
1649 HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
1650  const Vec128<T, N> no) {
1651  return AndNot(VecFromMask(Simd<T, N>(), mask), no);
1652 }
1653 
1654 template <typename T, size_t N>
1655 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
1656  const auto zero = Zero(Simd<T, N>());
1657  return Max(zero, v);
1658 }
1659 
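// Example (sketch; ClampToZero is a hypothetical helper): select lanes with a
// comparison mask and IfThenElse. ZeroIfNegative above is a shortcut for the
// same idea; operator< for floats is defined in the COMPARE section below and
// is found when this template is instantiated.
template <size_t N>
HWY_INLINE Vec128<float, N> ClampToZero(Vec128<float, N> v) {
  const Simd<float, N> d;
  return IfThenElse(v < Zero(d), Zero(d), v);
}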
1660 // ------------------------------ Mask logical
1661 
1662 template <typename T, size_t N>
1664  return MaskFromVec(Not(VecFromMask(Simd<T, N>(), m)));
1665 }
1666 
1667 template <typename T, size_t N>
1669  const Simd<T, N> d;
1670  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1671 }
1672 
1673 template <typename T, size_t N>
1675  const Simd<T, N> d;
1676  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1677 }
1678 
1679 template <typename T, size_t N>
1681  const Simd<T, N> d;
1682  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1683 }
1684 
1685 template <typename T, size_t N>
1687  const Simd<T, N> d;
1688  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1689 }
1690 
1691 // ================================================== COMPARE
1692 
1693 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1694 
1695 // ------------------------------ Shuffle2301 (for i64 compares)
1696 
1697 // Swap 32-bit halves in 64-bits
1699  return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
1700 }
1702  return Vec128<int32_t, 2>(vrev64_s32(v.raw));
1703 }
1705  return Vec128<float, 2>(vrev64_f32(v.raw));
1706 }
1708  return Vec128<uint32_t>(vrev64q_u32(v.raw));
1709 }
1711  return Vec128<int32_t>(vrev64q_s32(v.raw));
1712 }
1714  return Vec128<float>(vrev64q_f32(v.raw));
1715 }
1716 
1717 #define HWY_NEON_BUILD_TPL_HWY_COMPARE
1718 #define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
1719 #define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
1720  const Vec128<type, size> a, const Vec128<type, size> b
1721 #define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
1722 
1723 // ------------------------------ Equality
1724 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
1725 #if HWY_ARCH_ARM_A64
1726 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
1727 #else
1728 // No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
1729 HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
1730 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
1731 #endif
1732 
1733 // ------------------------------ Inequality
1734 template <typename T, size_t N>
1736  return Not(a == b);
1737 }
1738 
1739 // ------------------------------ Strict inequality (signed, float)
1740 #if HWY_ARCH_ARM_A64
1741 HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE)
1742 #else
1743 HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
1744 #endif
1745 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
1746 
1747 // ------------------------------ Weak inequality (float)
1748 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
1749 
1750 #undef HWY_NEON_BUILD_TPL_HWY_COMPARE
1751 #undef HWY_NEON_BUILD_RET_HWY_COMPARE
1752 #undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
1753 #undef HWY_NEON_BUILD_ARG_HWY_COMPARE
1754 
1755 // ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
1756 
1757 #if HWY_ARCH_ARM_V7
1758 
1759 template <size_t N>
1760 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1761  const Vec128<int64_t, N> b) {
1762  const Simd<int32_t, N * 2> d32;
1763  const Simd<int64_t, N> d64;
1764  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1765  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1766  return MaskFromVec(BitCast(d64, cmp64));
1767 }
1768 
1769 template <size_t N>
1770 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1771  const Vec128<uint64_t, N> b) {
1772  const Simd<uint32_t, N * 2> d32;
1773  const Simd<uint64_t, N> d64;
1774  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1775  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1776  return MaskFromVec(BitCast(d64, cmp64));
1777 }
1778 
1779 HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
1780  const Vec128<int64_t> b) {
1781  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
1782  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
1783 }
1784 HWY_API Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
1785  const Vec128<int64_t, 1> b) {
1786  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
1787  return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
1788 }
1789 
1790 #endif
1791 
1792 // ------------------------------ Reversed comparisons
1793 
1794 template <typename T, size_t N>
1796  return operator<(b, a);
1797 }
1798 template <typename T, size_t N>
1800  return operator<=(b, a);
1801 }
1802 
1803 // ------------------------------ FirstN (Iota, Lt)
1804 
1805 template <typename T, size_t N>
1806 HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
1807  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1808  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1809 }
1810 
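// Example (sketch; hypothetical values): FirstN(d, 2) with four int32_t lanes
// produces the mask {true, true, false, false}, i.e. Iota(di, 0) < Set(di, 2).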
1811 // ------------------------------ TestBit (Eq)
1812 
1813 #define HWY_NEON_BUILD_TPL_HWY_TESTBIT
1814 #define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type, size>
1815 #define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
1816  Vec128<type, size> v, Vec128<type, size> bit
1817 #define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
1818 
1819 #if HWY_ARCH_ARM_A64
1820 HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
1821 #else
1822 // No 64-bit versions on armv7
1823 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
1824 HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
1825 
1826 template <size_t N>
1828  Vec128<uint64_t, N> bit) {
1829  return (v & bit) == bit;
1830 }
1831 template <size_t N>
1833  Vec128<int64_t, N> bit) {
1834  return (v & bit) == bit;
1835 }
1836 
1837 #endif
1838 #undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
1839 #undef HWY_NEON_BUILD_RET_HWY_TESTBIT
1840 #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
1841 #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
1842 
1843 // ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
1844 HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
1845 #if HWY_ARCH_ARM_A64
1846  return Vec128<int64_t>(vabsq_s64(v.raw));
1847 #else
1848  const auto zero = Zero(Full128<int64_t>());
1849  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1850 #endif
1851 }
1852 HWY_API Vec128<int64_t, 1> Abs(const Vec128<int64_t, 1> v) {
1853 #if HWY_ARCH_ARM_A64
1854  return Vec128<int64_t, 1>(vabs_s64(v.raw));
1855 #else
1856  const auto zero = Zero(Simd<int64_t, 1>());
1857  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1858 #endif
1859 }
1860 
1861 // ------------------------------ Min (IfThenElse, BroadcastSignBit)
1862 
1863 #if HWY_ARCH_ARM_A64
1864 
1865 HWY_API Mask128<uint64_t> operator<(Vec128<uint64_t> a, Vec128<uint64_t> b) {
1866  return Mask128<uint64_t>(vcltq_u64(a.raw, b.raw));
1867 }
1868 HWY_API Mask128<uint64_t, 1> operator<(Vec128<uint64_t, 1> a,
1869  Vec128<uint64_t, 1> b) {
1870  return Mask128<uint64_t, 1>(vclt_u64(a.raw, b.raw));
1871 }
1872 
1873 #endif
1874 
1875 // Unsigned
1876 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
1877 
1878 template <size_t N>
1879 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
1880  const Vec128<uint64_t, N> b) {
1881 #if HWY_ARCH_ARM_A64
1882  return IfThenElse(b < a, b, a);
1883 #else
1884  const Simd<uint64_t, N> du;
1885  const Simd<int64_t, N> di;
1886  return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b)));
1887 #endif
1888 }
1889 
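// Note (sketch of the armv7 path above): for unsigned lanes,
// SaturatedSub(a, b) = max(a - b, 0), so a - SaturatedSub(a, b) equals b when
// a > b and a otherwise, i.e. Min(a, b). Max below uses the analogous identity
// b + SaturatedSub(a, b). E.g. a = 7, b = 3: 7 - 4 = 3 and 3 + 4 = 7.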
1890 // Signed
1891 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
1892 
1893 template <size_t N>
1894 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
1895  const Vec128<int64_t, N> b) {
1896 #if HWY_ARCH_ARM_A64
1897  return IfThenElse(b < a, b, a);
1898 #else
1899  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
1900  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
1901 #endif
1902 }
1903 
1904 // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
1905 #if HWY_ARCH_ARM_A64
1906 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
1907 #else
1908 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
1909 #endif
1910 
1911 // ------------------------------ Max (IfThenElse, BroadcastSignBit)
1912 
1913 // Unsigned (no u64)
1914 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
1915 
1916 template <size_t N>
1917 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
1918  const Vec128<uint64_t, N> b) {
1919 #if HWY_ARCH_ARM_A64
1920  return IfThenElse(b < a, a, b);
1921 #else
1922  const Simd<uint64_t, N> du;
1923  const Simd<int64_t, N> di;
1924  return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b)));
1925 #endif
1926 }
1927 
1928 // Signed (no i64)
1929 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
1930 
1931 template <size_t N>
1932 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
1933  const Vec128<int64_t, N> b) {
1934 #if HWY_ARCH_ARM_A64
1935  return IfThenElse(b < a, a, b);
1936 #else
1937  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
1938  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
1939 #endif
1940 }
1941 
1942 // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
1943 #if HWY_ARCH_ARM_A64
1944 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
1945 #else
1946 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
1947 #endif
1948 
1949 // ================================================== MEMORY
1950 
1951 // ------------------------------ Load 128
1952 
1953 HWY_API Vec128<uint8_t> LoadU(Full128<uint8_t> /* tag */,
1954  const uint8_t* HWY_RESTRICT unaligned) {
1955  return Vec128<uint8_t>(vld1q_u8(unaligned));
1956 }
1957 HWY_API Vec128<uint16_t> LoadU(Full128<uint16_t> /* tag */,
1958  const uint16_t* HWY_RESTRICT unaligned) {
1959  return Vec128<uint16_t>(vld1q_u16(unaligned));
1960 }
1961 HWY_API Vec128<uint32_t> LoadU(Full128<uint32_t> /* tag */,
1962  const uint32_t* HWY_RESTRICT unaligned) {
1963  return Vec128<uint32_t>(vld1q_u32(unaligned));
1964 }
1965 HWY_API Vec128<uint64_t> LoadU(Full128<uint64_t> /* tag */,
1966  const uint64_t* HWY_RESTRICT unaligned) {
1967  return Vec128<uint64_t>(vld1q_u64(unaligned));
1968 }
1969 HWY_API Vec128<int8_t> LoadU(Full128<int8_t> /* tag */,
1970  const int8_t* HWY_RESTRICT unaligned) {
1971  return Vec128<int8_t>(vld1q_s8(unaligned));
1972 }
1973 HWY_API Vec128<int16_t> LoadU(Full128<int16_t> /* tag */,
1974  const int16_t* HWY_RESTRICT unaligned) {
1975  return Vec128<int16_t>(vld1q_s16(unaligned));
1976 }
1977 HWY_API Vec128<int32_t> LoadU(Full128<int32_t> /* tag */,
1978  const int32_t* HWY_RESTRICT unaligned) {
1979  return Vec128<int32_t>(vld1q_s32(unaligned));
1980 }
1981 HWY_API Vec128<int64_t> LoadU(Full128<int64_t> /* tag */,
1982  const int64_t* HWY_RESTRICT unaligned) {
1983  return Vec128<int64_t>(vld1q_s64(unaligned));
1984 }
1985 HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1986  const float* HWY_RESTRICT unaligned) {
1987  return Vec128<float>(vld1q_f32(unaligned));
1988 }
1989 #if HWY_ARCH_ARM_A64
1990 HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
1991  const double* HWY_RESTRICT unaligned) {
1992  return Vec128<double>(vld1q_f64(unaligned));
1993 }
1994 #endif
1995 
1996 // ------------------------------ Load 64
1997 
1998 HWY_API Vec128<uint8_t, 8> LoadU(Simd<uint8_t, 8> /* tag */,
1999  const uint8_t* HWY_RESTRICT p) {
2000  return Vec128<uint8_t, 8>(vld1_u8(p));
2001 }
2002 HWY_API Vec128<uint16_t, 4> LoadU(Simd<uint16_t, 4> /* tag */,
2003  const uint16_t* HWY_RESTRICT p) {
2004  return Vec128<uint16_t, 4>(vld1_u16(p));
2005 }
2006 HWY_API Vec128<uint32_t, 2> LoadU(Simd<uint32_t, 2> /* tag */,
2007  const uint32_t* HWY_RESTRICT p) {
2008  return Vec128<uint32_t, 2>(vld1_u32(p));
2009 }
2010 HWY_API Vec128<uint64_t, 1> LoadU(Simd<uint64_t, 1> /* tag */,
2011  const uint64_t* HWY_RESTRICT p) {
2012  return Vec128<uint64_t, 1>(vld1_u64(p));
2013 }
2014 HWY_API Vec128<int8_t, 8> LoadU(Simd<int8_t, 8> /* tag */,
2015  const int8_t* HWY_RESTRICT p) {
2016  return Vec128<int8_t, 8>(vld1_s8(p));
2017 }
2018 HWY_API Vec128<int16_t, 4> LoadU(Simd<int16_t, 4> /* tag */,
2019  const int16_t* HWY_RESTRICT p) {
2020  return Vec128<int16_t, 4>(vld1_s16(p));
2021 }
2022 HWY_API Vec128<int32_t, 2> LoadU(Simd<int32_t, 2> /* tag */,
2023  const int32_t* HWY_RESTRICT p) {
2024  return Vec128<int32_t, 2>(vld1_s32(p));
2025 }
2026 HWY_API Vec128<int64_t, 1> LoadU(Simd<int64_t, 1> /* tag */,
2027  const int64_t* HWY_RESTRICT p) {
2028  return Vec128<int64_t, 1>(vld1_s64(p));
2029 }
2030 HWY_API Vec128<float, 2> LoadU(Simd<float, 2> /* tag */,
2031  const float* HWY_RESTRICT p) {
2032  return Vec128<float, 2>(vld1_f32(p));
2033 }
2034 #if HWY_ARCH_ARM_A64
2035 HWY_API Vec128<double, 1> LoadU(Simd<double, 1> /* tag */,
2036  const double* HWY_RESTRICT p) {
2037  return Vec128<double, 1>(vld1_f64(p));
2038 }
2039 #endif
2040 
2041 // ------------------------------ Load 32
2042 
2043 // In the following load functions, |a| is purposely undefined.
2044 // It is a required parameter to the intrinsic, however
2045 // we don't actually care what is in it, and we don't want
2046 // to introduce extra overhead by initializing it to something.
2047 
2048 HWY_API Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> /* tag */,
2049  const uint8_t* HWY_RESTRICT p) {
2050  uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
2051  uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
2052  return Vec128<uint8_t, 4>(vreinterpret_u8_u32(b));
2053 }
2054 HWY_API Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> /* tag */,
2055  const uint16_t* HWY_RESTRICT p) {
2056  uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
2057  uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
2058  return Vec128<uint16_t, 2>(vreinterpret_u16_u32(b));
2059 }
2060 HWY_API Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> /* tag */,
2061  const uint32_t* HWY_RESTRICT p) {
2062  uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
2063  uint32x2_t b = vld1_lane_u32(p, a, 0);
2064  return Vec128<uint32_t, 1>(b);
2065 }
2066 HWY_API Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> /* tag */,
2067  const int8_t* HWY_RESTRICT p) {
2068  int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
2069  int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
2070  return Vec128<int8_t, 4>(vreinterpret_s8_s32(b));
2071 }
2072 HWY_API Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> /* tag */,
2073  const int16_t* HWY_RESTRICT p) {
2074  int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
2075  int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
2076  return Vec128<int16_t, 2>(vreinterpret_s16_s32(b));
2077 }
2078 HWY_API Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> /* tag */,
2079  const int32_t* HWY_RESTRICT p) {
2080  int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
2081  int32x2_t b = vld1_lane_s32(p, a, 0);
2082  return Vec128<int32_t, 1>(b);
2083 }
2084 HWY_API Vec128<float, 1> LoadU(Simd<float, 1> /* tag */,
2085  const float* HWY_RESTRICT p) {
2086  float32x2_t a = Undefined(Simd<float, 2>()).raw;
2087  float32x2_t b = vld1_lane_f32(p, a, 0);
2088  return Vec128<float, 1>(b);
2089 }
2090 
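// Usage sketch for the partial loads above (illustrative only; the array and
// descriptor here are examples, not part of this header):
#if 0
alignas(16) const uint8_t bytes[4] = {1, 2, 3, 4};
const Simd<uint8_t, 4> d32;                      // 4 x u8 = 32 bits
const Vec128<uint8_t, 4> v = LoadU(d32, bytes);  // only the low 32 bits are valid
#endif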
2091 // ------------------------------ Load 16
2092 
2093 HWY_API Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> /* tag */,
2094  const uint8_t* HWY_RESTRICT p) {
2095  uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
2096  uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0);
2097  return Vec128<uint8_t, 2>(vreinterpret_u8_u16(b));
2098 }
2099 HWY_API Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> /* tag */,
2100  const uint16_t* HWY_RESTRICT p) {
2101  uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
2102  uint16x4_t b = vld1_lane_u16(p, a, 0);
2103  return Vec128<uint16_t, 1>(b);
2104 }
2105 HWY_API Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> /* tag */,
2106  const int8_t* HWY_RESTRICT p) {
2107  int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
2108  int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0);
2109  return Vec128<int8_t, 2>(vreinterpret_s8_s16(b));
2110 }
2111 HWY_API Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> /* tag */,
2112  const int16_t* HWY_RESTRICT p) {
2113  int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
2114  int16x4_t b = vld1_lane_s16(p, a, 0);
2115  return Vec128<int16_t, 1>(b);
2116 }
2117 
2118 // ------------------------------ Load 8
2119 
2120 HWY_API Vec128<uint8_t, 1> LoadU(Simd<uint8_t, 1> d,
2121  const uint8_t* HWY_RESTRICT p) {
2122  uint8x8_t a = Undefined(d).raw;
2123  uint8x8_t b = vld1_lane_u8(p, a, 0);
2124  return Vec128<uint8_t, 1>(b);
2125 }
2126 
2127 HWY_API Vec128<int8_t, 1> LoadU(Simd<int8_t, 1> d,
2128  const int8_t* HWY_RESTRICT p) {
2129  int8x8_t a = Undefined(d).raw;
2130  int8x8_t b = vld1_lane_s8(p, a, 0);
2131  return Vec128<int8_t, 1>(b);
2132 }
2133 
2134 // [b]float16_t use the same Raw as uint16_t, so forward to that.
2135 template <size_t N>
2136 HWY_API Vec128<float16_t, N> LoadU(Simd<float16_t, N> /* tag */,
2137  const float16_t* HWY_RESTRICT p) {
2138  const Simd<uint16_t, N> du16;
2139  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2140  return Vec128<float16_t, N>(LoadU(du16, pu16).raw);
2141 }
2142 template <size_t N>
2143 HWY_API Vec128<bfloat16_t, N> LoadU(Simd<bfloat16_t, N> /* tag */,
2144  const bfloat16_t* HWY_RESTRICT p) {
2145  const Simd<uint16_t, N> du16;
2146  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
2147  return Vec128<bfloat16_t, N>(LoadU(du16, pu16).raw);
2148 }
2149 
2150 // On ARM, Load is the same as LoadU.
2151 template <typename T, size_t N>
2152 HWY_API Vec128<T, N> Load(Simd<T, N> d, const T* HWY_RESTRICT p) {
2153  return LoadU(d, p);
2154 }
2155 
2156 template <typename T, size_t N>
2157 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d,
2158  const T* HWY_RESTRICT aligned) {
2159  return IfThenElseZero(m, Load(d, aligned));
2160 }
2161 
2162 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2163 template <typename T, size_t N, HWY_IF_LE128(T, N)>
2164 HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* HWY_RESTRICT p) {
2165  return LoadU(d, p);
2166 }
2167 
2168 // ------------------------------ Store 128
2169 
2170 HWY_API void StoreU(const Vec128<uint8_t> v, Full128<uint8_t> /* tag */,
2171  uint8_t* HWY_RESTRICT unaligned) {
2172  vst1q_u8(unaligned, v.raw);
2173 }
2174 HWY_API void StoreU(const Vec128<uint16_t> v, Full128<uint16_t> /* tag */,
2175  uint16_t* HWY_RESTRICT unaligned) {
2176  vst1q_u16(unaligned, v.raw);
2177 }
2178 HWY_API void StoreU(const Vec128<uint32_t> v, Full128<uint32_t> /* tag */,
2179  uint32_t* HWY_RESTRICT unaligned) {
2180  vst1q_u32(unaligned, v.raw);
2181 }
2182 HWY_API void StoreU(const Vec128<uint64_t> v, Full128<uint64_t> /* tag */,
2183  uint64_t* HWY_RESTRICT unaligned) {
2184  vst1q_u64(unaligned, v.raw);
2185 }
2186 HWY_API void StoreU(const Vec128<int8_t> v, Full128<int8_t> /* tag */,
2187  int8_t* HWY_RESTRICT unaligned) {
2188  vst1q_s8(unaligned, v.raw);
2189 }
2190 HWY_API void StoreU(const Vec128<int16_t> v, Full128<int16_t> /* tag */,
2191  int16_t* HWY_RESTRICT unaligned) {
2192  vst1q_s16(unaligned, v.raw);
2193 }
2194 HWY_API void StoreU(const Vec128<int32_t> v, Full128<int32_t> /* tag */,
2195  int32_t* HWY_RESTRICT unaligned) {
2196  vst1q_s32(unaligned, v.raw);
2197 }
2198 HWY_API void StoreU(const Vec128<int64_t> v, Full128<int64_t> /* tag */,
2199  int64_t* HWY_RESTRICT unaligned) {
2200  vst1q_s64(unaligned, v.raw);
2201 }
2202 HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
2203  float* HWY_RESTRICT unaligned) {
2204  vst1q_f32(unaligned, v.raw);
2205 }
2206 #if HWY_ARCH_ARM_A64
2207 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2208  double* HWY_RESTRICT unaligned) {
2209  vst1q_f64(unaligned, v.raw);
2210 }
2211 #endif
2212 
2213 // ------------------------------ Store 64
2214 
2215 HWY_API void StoreU(const Vec128<uint8_t, 8> v, Simd<uint8_t, 8> /* tag */,
2216  uint8_t* HWY_RESTRICT p) {
2217  vst1_u8(p, v.raw);
2218 }
2219 HWY_API void StoreU(const Vec128<uint16_t, 4> v, Simd<uint16_t, 4> /* tag */,
2220  uint16_t* HWY_RESTRICT p) {
2221  vst1_u16(p, v.raw);
2222 }
2223 HWY_API void StoreU(const Vec128<uint32_t, 2> v, Simd<uint32_t, 2> /* tag */,
2224  uint32_t* HWY_RESTRICT p) {
2225  vst1_u32(p, v.raw);
2226 }
2227 HWY_API void StoreU(const Vec128<uint64_t, 1> v, Simd<uint64_t, 1> /* tag */,
2228  uint64_t* HWY_RESTRICT p) {
2229  vst1_u64(p, v.raw);
2230 }
2231 HWY_API void StoreU(const Vec128<int8_t, 8> v, Simd<int8_t, 8> /* tag */,
2232  int8_t* HWY_RESTRICT p) {
2233  vst1_s8(p, v.raw);
2234 }
2235 HWY_API void StoreU(const Vec128<int16_t, 4> v, Simd<int16_t, 4> /* tag */,
2236  int16_t* HWY_RESTRICT p) {
2237  vst1_s16(p, v.raw);
2238 }
2239 HWY_API void StoreU(const Vec128<int32_t, 2> v, Simd<int32_t, 2> /* tag */,
2240  int32_t* HWY_RESTRICT p) {
2241  vst1_s32(p, v.raw);
2242 }
2243 HWY_API void StoreU(const Vec128<int64_t, 1> v, Simd<int64_t, 1> /* tag */,
2244  int64_t* HWY_RESTRICT p) {
2245  vst1_s64(p, v.raw);
2246 }
2247 HWY_API void StoreU(const Vec128<float, 2> v, Simd<float, 2> /* tag */,
2248  float* HWY_RESTRICT p) {
2249  vst1_f32(p, v.raw);
2250 }
2251 #if HWY_ARCH_ARM_A64
2252 HWY_API void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
2253  double* HWY_RESTRICT p) {
2254  vst1_f64(p, v.raw);
2255 }
2256 #endif
2257 
2258 // ------------------------------ Store 32
2259 
2260 HWY_API void StoreU(const Vec128<uint8_t, 4> v, Simd<uint8_t, 4> /* tag */,
2261  uint8_t* HWY_RESTRICT p) {
2262  uint32x2_t a = vreinterpret_u32_u8(v.raw);
2263  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
2264 }
2265 HWY_API void StoreU(const Vec128<uint16_t, 2> v, Simd<uint16_t, 2> /* tag */,
2266  uint16_t* HWY_RESTRICT p) {
2267  uint32x2_t a = vreinterpret_u32_u16(v.raw);
2268  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
2269 }
2270 HWY_API void StoreU(const Vec128<uint32_t, 1> v, Simd<uint32_t, 1> /* tag */,
2271  uint32_t* HWY_RESTRICT p) {
2272  vst1_lane_u32(p, v.raw, 0);
2273 }
2274 HWY_API void StoreU(const Vec128<int8_t, 4> v, Simd<int8_t, 4> /* tag */,
2275  int8_t* HWY_RESTRICT p) {
2276  int32x2_t a = vreinterpret_s32_s8(v.raw);
2277  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
2278 }
2279 HWY_API void StoreU(const Vec128<int16_t, 2> v, Simd<int16_t, 2> /* tag */,
2280  int16_t* HWY_RESTRICT p) {
2281  int32x2_t a = vreinterpret_s32_s16(v.raw);
2282  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
2283 }
2284 HWY_API void StoreU(const Vec128<int32_t, 1> v, Simd<int32_t, 1> /* tag */,
2285  int32_t* HWY_RESTRICT p) {
2286  vst1_lane_s32(p, v.raw, 0);
2287 }
2288 HWY_API void StoreU(const Vec128<float, 1> v, Simd<float, 1> /* tag */,
2289  float* HWY_RESTRICT p) {
2290  vst1_lane_f32(p, v.raw, 0);
2291 }
2292 
2293 // ------------------------------ Store 16
2294 
2295 HWY_API void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2> /* tag */,
2296  uint8_t* HWY_RESTRICT p) {
2297  uint16x4_t a = vreinterpret_u16_u8(v.raw);
2298  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
2299 }
2300 HWY_API void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1> /* tag */,
2301  uint16_t* HWY_RESTRICT p) {
2302  vst1_lane_u16(p, v.raw, 0);
2303 }
2304 HWY_API void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2> /* tag */,
2305  int8_t* HWY_RESTRICT p) {
2306  int16x4_t a = vreinterpret_s16_s8(v.raw);
2307  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
2308 }
2309 HWY_API void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1> /* tag */,
2310  int16_t* HWY_RESTRICT p) {
2311  vst1_lane_s16(p, v.raw, 0);
2312 }
2313 
2314 // ------------------------------ Store 8
2315 
2316 HWY_API void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1> /* tag */,
2317  uint8_t* HWY_RESTRICT p) {
2318  vst1_lane_u8(p, v.raw, 0);
2319 }
2320 HWY_API void StoreU(const Vec128<int8_t, 1> v, Simd<int8_t, 1> /* tag */,
2321  int8_t* HWY_RESTRICT p) {
2322  vst1_lane_s8(p, v.raw, 0);
2323 }
2324 
2325 // [b]float16_t use the same Raw as uint16_t, so forward to that.
2326 template <size_t N>
2327 HWY_API void StoreU(Vec128<float16_t, N> v, Simd<float16_t, N> /* tag */,
2328  float16_t* HWY_RESTRICT p) {
2329  const Simd<uint16_t, N> du16;
2330  const auto pu16 = reinterpret_cast<uint16_t*>(p);
2331  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2332 }
2333 template <size_t N>
2334 HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N> /* tag */,
2335  bfloat16_t* HWY_RESTRICT p) {
2336  const Simd<uint16_t, N> du16;
2337  const auto pu16 = reinterpret_cast<uint16_t*>(p);
2338  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
2339 }
2340 
2341 // On ARM, Store is the same as StoreU.
2342 template <typename T, size_t N>
2343 HWY_API void Store(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT aligned) {
2344  StoreU(v, d, aligned);
2345 }
2346 
2347 // ------------------------------ Non-temporal stores
2348 
2349 // Same as aligned stores on non-x86.
2350 
2351 template <typename T, size_t N>
2352 HWY_API void Stream(const Vec128<T, N> v, Simd<T, N> d,
2353  T* HWY_RESTRICT aligned) {
2354  Store(v, d, aligned);
2355 }
2356 
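// Minimal end-to-end sketch of the memory ops above (illustrative only;
// AddOne and the buffers are examples, not part of this header):
#if 0
void AddOne(const float* HWY_RESTRICT in, float* HWY_RESTRICT out) {
  const Full128<float> d;                  // 4 x f32
  const Vec128<float> v = LoadU(d, in);    // unaligned load (same as Load on ARM)
  StoreU(v + Set(d, 1.0f), d, out);        // unaligned store
}
#endif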
2357 // ================================================== CONVERT
2358 
2359 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2360 
2361 // Unsigned: zero-extend to full vector.
2362 HWY_API Vec128<uint16_t> PromoteTo(Full128<uint16_t> /* tag */,
2363  const Vec128<uint8_t, 8> v) {
2364  return Vec128<uint16_t>(vmovl_u8(v.raw));
2365 }
2366 HWY_API Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
2367  const Vec128<uint8_t, 4> v) {
2368  uint16x8_t a = vmovl_u8(v.raw);
2369  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
2370 }
2371 HWY_API Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
2372  const Vec128<uint16_t, 4> v) {
2373  return Vec128<uint32_t>(vmovl_u16(v.raw));
2374 }
2375 HWY_API Vec128<uint64_t> PromoteTo(Full128<uint64_t> /* tag */,
2376  const Vec128<uint32_t, 2> v) {
2377  return Vec128<uint64_t>(vmovl_u32(v.raw));
2378 }
2379 HWY_API Vec128<int16_t> PromoteTo(Full128<int16_t> d,
2380  const Vec128<uint8_t, 8> v) {
2381  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
2382 }
2383 HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> d,
2384  const Vec128<uint8_t, 4> v) {
2385  uint16x8_t a = vmovl_u8(v.raw);
2386  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
2387 }
2388 HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> d,
2389  const Vec128<uint16_t, 4> v) {
2390  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
2391 }
2392 
2393 // Unsigned: zero-extend to half vector.
2394 template <size_t N, HWY_IF_LE64(uint16_t, N)>
2395 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
2396  const Vec128<uint8_t, N> v) {
2397  return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
2398 }
2399 template <size_t N, HWY_IF_LE64(uint32_t, N)>
2400 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
2401  const Vec128<uint8_t, N> v) {
2402  uint16x8_t a = vmovl_u8(v.raw);
2403  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
2404 }
2405 template <size_t N>
2406 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
2407  const Vec128<uint16_t, N> v) {
2408  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
2409 }
2410 template <size_t N, HWY_IF_LE64(uint64_t, N)>
2411 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N> /* tag */,
2412  const Vec128<uint32_t, N> v) {
2413  return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
2414 }
2415 template <size_t N, HWY_IF_LE64(int16_t, N)>
2416 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> d,
2417  const Vec128<uint8_t, N> v) {
2418  return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
2419 }
2420 template <size_t N, HWY_IF_LE64(int32_t, N)>
2421 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2422  const Vec128<uint8_t, N> v) {
2423  uint16x8_t a = vmovl_u8(v.raw);
2424  uint32x4_t b = vmovl_u16(vget_low_u16(a));
2425  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
2426 }
2427 template <size_t N, HWY_IF_LE64(int32_t, N)>
2428 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2429  const Vec128<uint16_t, N> v) {
2430  uint32x4_t a = vmovl_u16(v.raw);
2431  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
2432 }
2433 
2434 // Signed: replicate sign bit to full vector.
2435 HWY_API Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
2436  const Vec128<int8_t, 8> v) {
2437  return Vec128<int16_t>(vmovl_s8(v.raw));
2438 }
2439 HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
2440  const Vec128<int8_t, 4> v) {
2441  int16x8_t a = vmovl_s8(v.raw);
2442  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
2443 }
2444 HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
2445  const Vec128<int16_t, 4> v) {
2446  return Vec128<int32_t>(vmovl_s16(v.raw));
2447 }
2448 HWY_API Vec128<int64_t> PromoteTo(Full128<int64_t> /* tag */,
2449  const Vec128<int32_t, 2> v) {
2450  return Vec128<int64_t>(vmovl_s32(v.raw));
2451 }
2452 
2453 // Signed: replicate sign bit to half vector.
2454 template <size_t N>
2455 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
2456  const Vec128<int8_t, N> v) {
2457  return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
2458 }
2459 template <size_t N>
2460 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2461  const Vec128<int8_t, N> v) {
2462  int16x8_t a = vmovl_s8(v.raw);
2463  int32x4_t b = vmovl_s16(vget_low_s16(a));
2464  return Vec128<int32_t, N>(vget_low_s32(b));
2465 }
2466 template <size_t N>
2467 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2468  const Vec128<int16_t, N> v) {
2469  return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
2470 }
2471 template <size_t N>
2472 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N> /* tag */,
2473  const Vec128<int32_t, N> v) {
2474  return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
2475 }
2476 
2477 #if __ARM_FP & 2
2478 
2479 HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
2480  const Vec128<float16_t, 4> v) {
2481  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
2482  return Vec128<float>(f32);
2483 }
2484 template <size_t N>
2485 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
2486  const Vec128<float16_t, N> v) {
2487  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
2488  return Vec128<float, N>(vget_low_f32(f32));
2489 }
2490 
2491 #else
2492 
2493 template <size_t N>
2494 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
2495  const Vec128<float16_t, N> v) {
2496  const Simd<int32_t, N> di32;
2497  const Simd<uint32_t, N> du32;
2498  const Simd<float, N> df32;
2499  // Expand to u32 so we can shift.
2500  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
2501  const auto sign = ShiftRight<15>(bits16);
2502  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2503  const auto mantissa = bits16 & Set(du32, 0x3FF);
2504  const auto subnormal =
2505  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2506  Set(df32, 1.0f / 16384 / 1024));
2507 
2508  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2509  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2510  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2511  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2512  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2513 }
2514 
2515 #endif
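// Worked example for the conversion above: f16 bits 0x3555 split into sign=0,
// biased_exp=0x0D and mantissa=0x155; re-biasing gives 0x0D + (127 - 15) = 0x7D
// and mantissa32 = 0x155 << 13, i.e. f32 bits 0x3EAAA000 (~0.3333), as expected.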
2516 
2517 #if HWY_ARCH_ARM_A64
2518 
2519 HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
2520  const Vec128<float, 2> v) {
2521  return Vec128<double>(vcvt_f64_f32(v.raw));
2522 }
2523 
2524 HWY_API Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */,
2525  const Vec128<float, 1> v) {
2526  return Vec128<double, 1>(vget_low_f64(vcvt_f64_f32(v.raw)));
2527 }
2528 
2529 HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
2530  const Vec128<int32_t, 2> v) {
2531  const int64x2_t i64 = vmovl_s32(v.raw);
2532  return Vec128<double>(vcvtq_f64_s64(i64));
2533 }
2534 
2535 HWY_API Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */,
2536  const Vec128<int32_t, 1> v) {
2537  const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
2538  return Vec128<double, 1>(vcvt_f64_s64(i64));
2539 }
2540 
2541 #endif
2542 
2543 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2544 
2545 // From full vector to half or quarter
2546 HWY_API Vec128<uint16_t, 4> DemoteTo(Simd<uint16_t, 4> /* tag */,
2547  const Vec128<int32_t> v) {
2548  return Vec128<uint16_t, 4>(vqmovun_s32(v.raw));
2549 }
2550 HWY_API Vec128<int16_t, 4> DemoteTo(Simd<int16_t, 4> /* tag */,
2551  const Vec128<int32_t> v) {
2552  return Vec128<int16_t, 4>(vqmovn_s32(v.raw));
2553 }
2554 HWY_API Vec128<uint8_t, 4> DemoteTo(Simd<uint8_t, 4> /* tag */,
2555  const Vec128<int32_t> v) {
2556  const uint16x4_t a = vqmovun_s32(v.raw);
2557  return Vec128<uint8_t, 4>(vqmovn_u16(vcombine_u16(a, a)));
2558 }
2559 HWY_API Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */,
2560  const Vec128<int16_t> v) {
2561  return Vec128<uint8_t, 8>(vqmovun_s16(v.raw));
2562 }
2563 HWY_API Vec128<int8_t, 4> DemoteTo(Simd<int8_t, 4> /* tag */,
2564  const Vec128<int32_t> v) {
2565  const int16x4_t a = vqmovn_s32(v.raw);
2566  return Vec128<int8_t, 4>(vqmovn_s16(vcombine_s16(a, a)));
2567 }
2568 HWY_API Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */,
2569  const Vec128<int16_t> v) {
2570  return Vec128<int8_t, 8>(vqmovn_s16(v.raw));
2571 }
2572 
2573 // From half vector to partial half
2574 template <size_t N, HWY_IF_LE64(int32_t, N)>
2575 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
2576  const Vec128<int32_t, N> v) {
2577  return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
2578 }
2579 template <size_t N, HWY_IF_LE64(int32_t, N)>
2580 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
2581  const Vec128<int32_t, N> v) {
2582  return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
2583 }
2584 template <size_t N, HWY_IF_LE64(int32_t, N)>
2585 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
2586  const Vec128<int32_t, N> v) {
2587  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
2588  return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
2589 }
2590 template <size_t N, HWY_IF_LE64(int16_t, N)>
2591 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
2592  const Vec128<int16_t, N> v) {
2593  return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
2594 }
2595 template <size_t N, HWY_IF_LE64(int32_t, N)>
2596 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
2597  const Vec128<int32_t, N> v) {
2598  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
2599  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
2600 }
2601 template <size_t N, HWY_IF_LE64(int16_t, N)>
2602 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
2603  const Vec128<int16_t, N> v) {
2604  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
2605 }
2606 
2607 #if __ARM_FP & 2
2608 
2609 HWY_API Vec128<float16_t, 4> DemoteTo(Simd<float16_t, 4> /* tag */,
2610  const Vec128<float> v) {
2611  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
2612 }
2613 template <size_t N>
2614 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
2615  const Vec128<float, N> v) {
2616  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
2617  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
2618 }
2619 
2620 #else
2621 
2622 template <size_t N>
2623 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
2624  const Vec128<float, N> v) {
2625  const Simd<int32_t, N> di;
2626  const Simd<uint32_t, N> du;
2627  const Simd<uint16_t, N> du16;
2628  const auto bits32 = BitCast(du, v);
2629  const auto sign = ShiftRight<31>(bits32);
2630  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2631  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2632 
2633  const auto k15 = Set(di, 15);
2634  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2635  const auto is_tiny = exp < Set(di, -24);
2636 
2637  const auto is_subnormal = exp < Set(di, -14);
2638  const auto biased_exp16 =
2639  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2640  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2641  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2642  (mantissa32 >> (Set(du, 13) + sub_exp));
2643  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2644  ShiftRight<13>(mantissa32)); // <1024
2645 
2646  const auto sign16 = ShiftLeft<15>(sign);
2647  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2648  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2649  return Vec128<float16_t, N>(DemoteTo(du16, bits16).raw);
2650 }
2651 
2652 #endif
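// Round-trip sketch for the f16 conversions above (illustrative only):
#if 0
const Full128<float> df;
const Simd<float16_t, 4> df16;
const Vec128<float16_t, 4> h = DemoteTo(df16, Set(df, 0.5f));
const Vec128<float> f = PromoteTo(df, h);  // 0.5f is exactly representable
#endif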
2653 
2654 template <size_t N>
2655 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N> dbf16,
2656  const Vec128<float, N> v) {
2657  const Rebind<int32_t, decltype(dbf16)> di32;
2658  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2659  const Rebind<uint16_t, decltype(dbf16)> du16;
2660  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2661  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2662 }
2663 
2664 template <size_t N>
2665 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
2666  Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
2667  const RebindToUnsigned<decltype(dbf16)> du16;
2668  const Repartition<uint32_t, decltype(dbf16)> du32;
2669  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
2670  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2671 }
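// BFloat16 is the upper 16 bits of binary32 (e.g. 1.0f = 0x3F800000 demotes to
// 0x3F80), so DemoteTo is a logical shift right by 16; ReorderDemote2To packs
// bf16(b) into the even u16 lanes and bf16(a) into the odd lanes, hence the
// "Reorder" in its name.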
2672 
2673 #if HWY_ARCH_ARM_A64
2674 
2675 HWY_API Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */,
2676  const Vec128<double> v) {
2677  return Vec128<float, 2>(vcvt_f32_f64(v.raw));
2678 }
2679 HWY_API Vec128<float, 1> DemoteTo(Simd<float, 1> /* tag */,
2680  const Vec128<double, 1> v) {
2681  return Vec128<float, 1>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
2682 }
2683 
2684 HWY_API Vec128<int32_t, 2> DemoteTo(Simd<int32_t, 2> /* tag */,
2685  const Vec128<double> v) {
2686  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
2687  return Vec128<int32_t, 2>(vqmovn_s64(i64));
2688 }
2689 HWY_API Vec128<int32_t, 1> DemoteTo(Simd<int32_t, 1> /* tag */,
2690  const Vec128<double, 1> v) {
2691  const int64x1_t i64 = vcvt_s64_f64(v.raw);
2692  // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
2693  const int64x2_t i64x2 = vcombine_s64(i64, i64);
2694  return Vec128<int32_t, 1>(vqmovn_s64(i64x2));
2695 }
2696 
2697 #endif
2698 
2699 HWY_API Vec128<uint8_t, 4> U8FromU32(const Vec128<uint32_t> v) {
2700  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
2701  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
2702  return Vec128<uint8_t, 4>(vget_low_u8(vuzp1q_u8(w, w)));
2703 }
2704 template <size_t N, HWY_IF_LE64(uint32_t, N)>
2705 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
2706  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
2707  const uint8x8_t w = vuzp1_u8(org_v, org_v);
2708  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
2709 }
2710 
2711 // In the following DemoteTo functions, |b| is purposely undefined.
2712 // The value a needs to be extended to 128 bits so that vqmovn can be
2713 // used and |b| is undefined so that no extra overhead is introduced.
2714 HWY_DIAGNOSTICS(push)
2715 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
2716 
2717 template <size_t N>
2718 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
2719  const Vec128<int32_t> v) {
2720  Vec128<uint16_t, N> a = DemoteTo(Simd<uint16_t, N>(), v);
2721  Vec128<uint16_t, N> b;
2722  uint16x8_t c = vcombine_u16(a.raw, b.raw);
2723  return Vec128<uint8_t, N>(vqmovn_u16(c));
2724 }
2725 
2726 template <size_t N>
2727 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
2728  const Vec128<int32_t> v) {
2729  Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N>(), v);
2730  Vec128<int16_t, N> b;
2731  int16x8_t c = vcombine_s16(a.raw, b.raw);
2732  return Vec128<int8_t, N>(vqmovn_s16(c));
2733 }
2734 
2735 HWY_DIAGNOSTICS(pop)
2736 
2737 // ------------------------------ Convert integer <=> floating-point
2738 
2739 HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
2740  const Vec128<int32_t> v) {
2741  return Vec128<float>(vcvtq_f32_s32(v.raw));
2742 }
2743 template <size_t N, HWY_IF_LE64(int32_t, N)>
2744 HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
2745  const Vec128<int32_t, N> v) {
2746  return Vec128<float, N>(vcvt_f32_s32(v.raw));
2747 }
2748 
2749 // Truncates (rounds toward zero).
2750 HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
2751  const Vec128<float> v) {
2752  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
2753 }
2754 template <size_t N, HWY_IF_LE64(float, N)>
2755 HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
2756  const Vec128<float, N> v) {
2757  return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
2758 }
2759 
2760 #if HWY_ARCH_ARM_A64
2761 
2762 HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
2763  const Vec128<int64_t> v) {
2764  return Vec128<double>(vcvtq_f64_s64(v.raw));
2765 }
2766 HWY_API Vec128<double, 1> ConvertTo(Simd<double, 1> /* tag */,
2767  const Vec128<int64_t, 1> v) {
2768  return Vec128<double, 1>(vcvt_f64_s64(v.raw));
2769 }
2770 
2771 // Truncates (rounds toward zero).
2772 HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
2773  const Vec128<double> v) {
2774  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
2775 }
2776 HWY_API Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> /* tag */,
2777  const Vec128<double, 1> v) {
2778  return Vec128<int64_t, 1>(vcvt_s64_f64(v.raw));
2779 }
2780 
2781 #endif
2782 
2783 // ------------------------------ Round (IfThenElse, mask, logical)
2784 
2785 #if HWY_ARCH_ARM_A64
2786 // Toward nearest integer
2787 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
2788 
2789 // Toward zero, aka truncate
2790 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
2791 
2792 // Toward +infinity, aka ceiling
2793 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
2794 
2795 // Toward -infinity, aka floor
2796 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
2797 #else
2798 
2799 // ------------------------------ Trunc
2800 
2801 // ARMv7 only supports truncation to integer. We can either convert back to
2802 // float (3 floating-point and 2 logic operations) or manipulate the binary32
2803 // representation, clearing the lowest 23-exp mantissa bits. This requires 9
2804 // integer operations and 3 constants, which is likely more expensive.
2805 
2806 namespace detail {
2807 
2808 // The original value is already the desired result if NaN or the magnitude is
2809 // large (i.e. the value is already an integer).
2810 template <size_t N>
2811 HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
2812  return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
2813 }
2814 
2815 } // namespace detail
2816 
2817 template <size_t N>
2818 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
2819  const Simd<float, N> df;
2820  const RebindToSigned<decltype(df)> di;
2821 
2822  const auto integer = ConvertTo(di, v); // round toward 0
2823  const auto int_f = ConvertTo(df, integer);
2824 
2825  return IfThenElse(detail::UseInt(v), int_f, v);
2826 }
2827 
2828 template <size_t N>
2829 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
2830  const Simd<float, N> df;
2831 
2832  // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding
2833  // (we assume the current mode is nearest-even) after addition with a large
2834  // value such that no mantissa bits remain. We may need a compiler flag for
2835  // precise floating-point to prevent this from being "optimized" out.
2836  const auto max = Set(df, MantissaEnd<float>());
2837  const auto large = CopySignToAbs(max, v);
2838  const auto added = large + v;
2839  const auto rounded = added - large;
2840 
2841  // Keep original if NaN or the magnitude is large (already an int).
2842  return IfThenElse(Abs(v) < max, rounded, v);
2843 }
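// Worked example of the trick above: for v = 2.5f, large = 2^23 = 8388608.0f;
// 8388608 + 2.5 rounds to the nearest-even 8388610, and subtracting large
// leaves 2.0f -- round-half-to-even, matching vrndn on v8.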
2844 
2845 template <size_t N>
2846 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
2847  const Simd<float, N> df;
2848  const RebindToSigned<decltype(df)> di;
2849 
2850  const auto integer = ConvertTo(di, v); // round toward 0
2851  const auto int_f = ConvertTo(df, integer);
2852 
2853  // Truncating a positive non-integer ends up smaller; if so, add 1.
2854  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
2855 
2856  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
2857 }
2858 
2859 template <size_t N>
2860 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
2861  const Simd<float, N> df;
2862  const Simd<int32_t, N> di;
2863 
2864  const auto integer = ConvertTo(di, v); // round toward 0
2865  const auto int_f = ConvertTo(df, integer);
2866 
2867  // Truncating a negative non-integer ends up larger; if so, subtract 1.
2868  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
2869 
2870  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
2871 }
2872 
2873 #endif
2874 
2875 // ------------------------------ NearestInt (Round)
2876 
2877 #if HWY_ARCH_ARM_A64
2878 
2879 HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
2880  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
2881 }
2882 template <size_t N, HWY_IF_LE64(float, N)>
2883 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
2884  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
2885 }
2886 
2887 #else
2888 
2889 template <size_t N>
2890 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
2891  const Simd<int32_t, N> di;
2892  return ConvertTo(di, Round(v));
2893 }
2894 
2895 #endif
2896 
2897 // ================================================== SWIZZLE
2898 
2899 // ------------------------------ LowerHalf
2900 
2901 // <= 64 bit: just return different type
2902 template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
2903 HWY_API Vec128<T, N / 2> LowerHalf(const Vec128<T, N> v) {
2904  return Vec128<T, N / 2>(v.raw);
2905 }
2906 
2907 HWY_API Vec128<uint8_t, 8> LowerHalf(const Vec128<uint8_t> v) {
2908  return Vec128<uint8_t, 8>(vget_low_u8(v.raw));
2909 }
2910 HWY_API Vec128<uint16_t, 4> LowerHalf(const Vec128<uint16_t> v) {
2911  return Vec128<uint16_t, 4>(vget_low_u16(v.raw));
2912 }
2913 HWY_API Vec128<uint32_t, 2> LowerHalf(const Vec128<uint32_t> v) {
2914  return Vec128<uint32_t, 2>(vget_low_u32(v.raw));
2915 }
2916 HWY_API Vec128<uint64_t, 1> LowerHalf(const Vec128<uint64_t> v) {
2917  return Vec128<uint64_t, 1>(vget_low_u64(v.raw));
2918 }
2919 HWY_API Vec128<int8_t, 8> LowerHalf(const Vec128<int8_t> v) {
2920  return Vec128<int8_t, 8>(vget_low_s8(v.raw));
2921 }
2922 HWY_API Vec128<int16_t, 4> LowerHalf(const Vec128<int16_t> v) {
2923  return Vec128<int16_t, 4>(vget_low_s16(v.raw));
2924 }
2925 HWY_API Vec128<int32_t, 2> LowerHalf(const Vec128<int32_t> v) {
2926  return Vec128<int32_t, 2>(vget_low_s32(v.raw));
2927 }
2928 HWY_API Vec128<int64_t, 1> LowerHalf(const Vec128<int64_t> v) {
2929  return Vec128<int64_t, 1>(vget_low_s64(v.raw));
2930 }
2931 HWY_API Vec128<float, 2> LowerHalf(const Vec128<float> v) {
2932  return Vec128<float, 2>(vget_low_f32(v.raw));
2933 }
2934 #if HWY_ARCH_ARM_A64
2935 HWY_API Vec128<double, 1> LowerHalf(const Vec128<double> v) {
2936  return Vec128<double, 1>(vget_low_f64(v.raw));
2937 }
2938 #endif
2939 
2940 template <typename T, size_t N>
2941 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2> /* tag */, Vec128<T, N> v) {
2942  return LowerHalf(v);
2943 }
2944 
2945 // ------------------------------ CombineShiftRightBytes
2946 
2947 // 128-bit
2948 template <int kBytes, typename T, class V128 = Vec128<T>>
2949 HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
2950  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
2951  const Repartition<uint8_t, decltype(d)> d8;
2952  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
2953  return BitCast(d, Vec128<uint8_t>(v8));
2954 }
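// Example: with lo = {0,1,...,15} and hi = {16,...,31}, kBytes = 4 yields bytes
// {4,5,...,19}: the concatenation hi:lo shifted right by four bytes.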
2955 
2956 // 64-bit
2957 template <int kBytes, typename T, class V64 = Vec128<T, 8 / sizeof(T)>>
2958 HWY_API V64 CombineShiftRightBytes(Simd<T, 8 / sizeof(T)> d, V64 hi, V64 lo) {
2959  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
2960  const Repartition<uint8_t, decltype(d)> d8;
2961  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
2962  return BitCast(d, VFromD<decltype(d8)>(v8));
2963 }
2964 
2965 // <= 32-bit defined after ShiftLeftBytes.
2966 
2967 // ------------------------------ Shift vector by constant #bytes
2968 
2969 namespace detail {
2970 
2971 // Partially specialize because kBytes = 0 and >= size are compile errors;
2972 // callers replace the latter with 0xFF for easier specialization.
2973 template <int kBytes>
2975  // Full
2976  template <class T>
2978  const Full128<T> d;
2979  return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
2980  }
2981 
2982  // Partial
2983  template <class T, size_t N, HWY_IF_LE64(T, N)>
2985  // Expand to 64-bit so we only use the native EXT instruction.
2986  const Simd<T, 8 / sizeof(T)> d64;
2987  const auto zero64 = Zero(d64);
2988  const decltype(zero64) v64(v.raw);
2989  return Vec128<T, N>(
2990  CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
2991  }
2992 };
2993 template <>
2994 struct ShiftLeftBytesT<0> {
2995  template <class T, size_t N>
2997  return v;
2998  }
2999 };
3000 template <>
3001 struct ShiftLeftBytesT<0xFF> {
3002  template <class T, size_t N>
3004  return Zero(Simd<T, N>());
3005  }
3006 };
3007 
3008 template <int kBytes>
3010  template <class T, size_t N>
3012  const Simd<T, N> d;
3013  // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
3014  if (N * sizeof(T) < 8) {
3015  constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
3016  const Simd<T, kReg / sizeof(T)> dreg;
3017  v = Vec128<T, N>(
3018  IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
3019  }
3020  return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
3021  }
3022 };
3023 template <>
3024 struct ShiftRightBytesT<0> {
3025  template <class T, size_t N>
3027  return v;
3028  }
3029 };
3030 template <>
3031 struct ShiftRightBytesT<0xFF> {
3032  template <class T, size_t N>
3034  return Zero(Simd<T, N>());
3035  }
3036 };
3037 
3038 } // namespace detail
3039 
3040 template <int kBytes, typename T, size_t N>
3042  return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF
3043  : kBytes > ()(v);
3044 }
3045 
3046 template <int kBytes, typename T, size_t N>
3048  return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
3049 }
3050 
3051 template <int kLanes, typename T, size_t N>
3053  const Repartition<uint8_t, decltype(d)> d8;
3054  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3055 }
3056 
3057 template <int kLanes, typename T, size_t N>
3059  return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
3060 }
3061 
3062 // 0x01..0F, kBytes = 1 => 0x0001..0E
3063 template <int kBytes, typename T, size_t N>
3065  return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF
3066  : kBytes > ()(v);
3067 }
3068 
3069 template <int kLanes, typename T, size_t N>
3071  const Repartition<uint8_t, decltype(d)> d8;
3072  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3073 }
3074 
3075 // Calls ShiftLeftBytes
3076 template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
3078  Vec128<T, N> lo) {
3079  constexpr size_t kSize = N * sizeof(T);
3080  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3081  const Repartition<uint8_t, decltype(d)> d8;
3082  const Simd<uint8_t, 8> d_full8;
3083  const Repartition<T, decltype(d_full8)> d_full;
3084  using V64 = VFromD<decltype(d_full8)>;
3085  const V64 hi64(BitCast(d8, hi).raw);
3086  // Move into most-significant bytes
3087  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
3088  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
3089  // After casting to full 64-bit vector of correct type, shrink to 32-bit
3090  return Vec128<T, N>(BitCast(d_full, r).raw);
3091 }
3092 
3093 // ------------------------------ UpperHalf (ShiftRightBytes)
3094 
3095 // Full input
3096 HWY_API Vec128<uint8_t, 8> UpperHalf(Simd<uint8_t, 8> /* tag */,
3097  const Vec128<uint8_t> v) {
3098  return Vec128<uint8_t, 8>(vget_high_u8(v.raw));
3099 }
3100 HWY_API Vec128<uint16_t, 4> UpperHalf(Simd<uint16_t, 4> /* tag */,
3101  const Vec128<uint16_t> v) {
3102  return Vec128<uint16_t, 4>(vget_high_u16(v.raw));
3103 }
3104 HWY_API Vec128<uint32_t, 2> UpperHalf(Simd<uint32_t, 2> /* tag */,
3105  const Vec128<uint32_t> v) {
3106  return Vec128<uint32_t, 2>(vget_high_u32(v.raw));
3107 }
3108 HWY_API Vec128<uint64_t, 1> UpperHalf(Simd<uint64_t, 1> /* tag */,
3109  const Vec128<uint64_t> v) {
3110  return Vec128<uint64_t, 1>(vget_high_u64(v.raw));
3111 }
3112 HWY_API Vec128<int8_t, 8> UpperHalf(Simd<int8_t, 8> /* tag */,
3113  const Vec128<int8_t> v) {
3114  return Vec128<int8_t, 8>(vget_high_s8(v.raw));
3115 }
3116 HWY_API Vec128<int16_t, 4> UpperHalf(Simd<int16_t, 4> /* tag */,
3117  const Vec128<int16_t> v) {
3118  return Vec128<int16_t, 4>(vget_high_s16(v.raw));
3119 }
3120 HWY_API Vec128<int32_t, 2> UpperHalf(Simd<int32_t, 2> /* tag */,
3121  const Vec128<int32_t> v) {
3122  return Vec128<int32_t, 2>(vget_high_s32(v.raw));
3123 }
3124 HWY_API Vec128<int64_t, 1> UpperHalf(Simd<int64_t, 1> /* tag */,
3125  const Vec128<int64_t> v) {
3126  return Vec128<int64_t, 1>(vget_high_s64(v.raw));
3127 }
3128 HWY_API Vec128<float, 2> UpperHalf(Simd<float, 2> /* tag */,
3129  const Vec128<float> v) {
3130  return Vec128<float, 2>(vget_high_f32(v.raw));
3131 }
3132 #if HWY_ARCH_ARM_A64
3133 HWY_API Vec128<double, 1> UpperHalf(Simd<double, 1> /* tag */,
3134  const Vec128<double> v) {
3135  return Vec128<double, 1>(vget_high_f64(v.raw));
3136 }
3137 #endif
3138 
3139 // Partial
3140 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3141 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N>> /* tag */,
3142  Vec128<T, N> v) {
3143  const Simd<T, N> d;
3144  const auto vu = BitCast(RebindToUnsigned<decltype(d)>(), v);
3145  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(vu));
3146  return Vec128<T, (N + 1) / 2>(upper.raw);
3147 }
3148 
3149 // ------------------------------ Broadcast/splat any lane
3150 
3151 #if HWY_ARCH_ARM_A64
3152 // Unsigned
3153 template <int kLane>
3154 HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
3155  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3156  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
3157 }
3158 template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3159 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3160  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3161  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3162 }
3163 template <int kLane>
3164 HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
3165  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3166  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
3167 }
3168 template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3169 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3170  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3171  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3172 }
3173 template <int kLane>
3174 HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
3175  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3176  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
3177 }
3178 // Vec128<uint64_t, 1> is defined below.
3179 
3180 // Signed
3181 template <int kLane>
3182 HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
3183  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3184  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
3185 }
3186 template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3187 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3188  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3189  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3190 }
3191 template <int kLane>
3192 HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
3193  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3194  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
3195 }
3196 template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3197 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3198  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3199  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3200 }
3201 template <int kLane>
3202 HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
3203  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3204  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
3205 }
3206 // Vec128<int64_t, 1> is defined below.
3207 
3208 // Float
3209 template <int kLane>
3210 HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
3211  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3212  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
3213 }
3214 template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3215 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3216  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3217  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3218 }
3219 template <int kLane>
3220 HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
3221  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3222  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
3223 }
3224 template <int kLane>
3225 HWY_API Vec128<double, 1> Broadcast(const Vec128<double, 1> v) {
3226  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3227  return v;
3228 }
3229 
3230 #else
3231 // No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
3232 
3233 // Unsigned
3234 template <int kLane>
3236  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3237  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
3238 }
3239 template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
3241  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3242  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3243 }
3244 template <int kLane>
3246  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3247  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
3248 }
3249 template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
3251  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3252  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
3253 }
3254 template <int kLane>
3256  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3257  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
3258 }
3259 // Vec128<uint64_t, 1> is defined below.
3260 
3261 // Signed
3262 template <int kLane>
3264  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
3265  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
3266 }
3267 template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
3269  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3270  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
3271 }
3272 template <int kLane>
3274  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3275  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
3276 }
3277 template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
3279  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3280  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
3281 }
3282 template <int kLane>
3284  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
3285  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
3286 }
3287 // Vec128<int64_t, 1> is defined below.
3288 
3289 // Float
3290 template <int kLane>
3292  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
3293  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
3294 }
3295 template <int kLane, size_t N, HWY_IF_LE64(float, N)>
3297  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3298  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
3299 }
3300 
3301 #endif
3302 
3303 template <int kLane>
3304 HWY_API Vec128<uint64_t, 1> Broadcast(const Vec128<uint64_t, 1> v) {
3305  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3306  return v;
3307 }
3308 template <int kLane>
3309 HWY_API Vec128<int64_t, 1> Broadcast(const Vec128<int64_t, 1> v) {
3310  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
3311  return v;
3312 }
3313 
3314 // ------------------------------ TableLookupLanes
3315 
3316 // Returned by SetTableIndices for use by TableLookupLanes.
3317 template <typename T, size_t N>
3318 struct Indices128 {
3319  typename detail::Raw128<T, N>::type raw;
3320 };
3321 
3322 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3323 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
3324 #if HWY_IS_DEBUG_BUILD
3325  for (size_t i = 0; i < N; ++i) {
3326  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
3327  }
3328 #endif
3329 
3330  const Repartition<uint8_t, decltype(d)> d8;
3331  alignas(16) uint8_t control[16] = {0};
3332  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
3333  for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
3334  control[idx_lane * sizeof(T) + idx_byte] = static_cast<uint8_t>(
3335  static_cast<size_t>(idx[idx_lane]) * sizeof(T) + idx_byte);
3336  }
3337  }
3338  return Indices128<T, N>{BitCast(d, Load(d8, control)).raw};
3339 }
3340 
3341 template <size_t N>
3342 HWY_API Vec128<uint32_t, N> TableLookupLanes(
3343  const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
3344  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
3345 }
3346 template <size_t N>
3347 HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
3348  const Indices128<int32_t, N> idx) {
3349  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
3350 }
3351 template <size_t N>
3352 HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
3353  const Indices128<float, N> idx) {
3354  const Simd<int32_t, N> di;
3355  const auto idx_i = BitCast(di, Vec128<float, N>{idx.raw});
3356  return BitCast(Simd<float, N>(), TableLookupBytes(BitCast(di, v), idx_i));
3357 }
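// Usage sketch for SetTableIndices/TableLookupLanes above (reverses the four
// u32 lanes; illustrative only):
#if 0
const Full128<uint32_t> d;
alignas(16) const int32_t idx[4] = {3, 2, 1, 0};
const auto reversed = TableLookupLanes(Iota(d, 0), SetTableIndices(d, idx));
// reversed holds {3, 2, 1, 0}.
#endif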
3358 
3359 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
3360 
3361 template <typename T>
3362 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3363  return Shuffle0123(v);
3364 }
3365 
3366 template <typename T>
3367 HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
3368  return Vec128<T, 2>(Shuffle2301(v));
3369 }
3370 
3371 template <typename T>
3372 HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
3373  return v;
3374 }
3375 
3376 // ------------------------------ Other shuffles (TableLookupBytes)
3377 
3378 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
3379 // Shuffle0321 rotates one lane to the right (the previous least-significant
3380 // lane is now most-significant). These could also be implemented via
3381 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
3382 
3383 // Swap 64-bit halves
3384 template <typename T>
3385 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
3386  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
3387 }
3388 template <typename T>
3389 HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
3390  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
3391 }
3392 
3393 // Rotate right 32 bits
3394 template <typename T>
3395 HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
3396  return CombineShiftRightBytes<4>(Full128<T>(), v, v);
3397 }
3398 
3399 // Rotate left 32 bits
3400 template <typename T>
3401 HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
3402  return CombineShiftRightBytes<12>(Full128<T>(), v, v);
3403 }
3404 
3405 // Reverse
3406 template <typename T>
3407 HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
3408  return Shuffle2301(Shuffle1032(v));
3409 }
3410 
3411 // ------------------------------ InterleaveLower
3412 
3413 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3414 // the least-significant lane) and "b". To concatenate two half-width integers
3415 // into one, use ZipLower/Upper instead (also works with scalar).
3416 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
3417 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)
3418 
3419 #if HWY_ARCH_ARM_A64
3420 // N=1 makes no sense (in that case, there would be no upper/lower).
3421 HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
3422  const Vec128<uint64_t> b) {
3423  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
3424 }
3425 HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
3426  const Vec128<int64_t> b) {
3427  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
3428 }
3429 HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
3430  const Vec128<double> b) {
3431  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
3432 }
3433 #else
3434 // ARMv7 emulation.
3435 HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
3436  const Vec128<uint64_t> b) {
3437  return CombineShiftRightBytes<8>(Full128<uint64_t>(), b, Shuffle01(a));
3438 }
3439 HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
3440  const Vec128<int64_t> b) {
3441  return CombineShiftRightBytes<8>(Full128<int64_t>(), b, Shuffle01(a));
3442 }
3443 #endif
3444 
3445 // Floats
3446 HWY_API Vec128<float> InterleaveLower(const Vec128<float> a,
3447  const Vec128<float> b) {
3448  return Vec128<float>(vzip1q_f32(a.raw, b.raw));
3449 }
3450 template <size_t N, HWY_IF_LE64(float, N)>
3451 HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
3452  const Vec128<float, N> b) {
3453  return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
3454 }
3455 
3456 // < 64 bit parts
3457 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3458 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3459  using V64 = Vec128<T, 8 / sizeof(T)>;
3460  return Vec128<T, N>(InterleaveLower(V64(a.raw), V64(b.raw)).raw);
3461 }
3462 
3463 // Additional overload for the optional Simd<> tag.
3464 template <typename T, size_t N, class V = Vec128<T, N>>
3465 HWY_API V InterleaveLower(Simd<T, N> /* tag */, V a, V b) {
3466  return InterleaveLower(a, b);
3467 }
3468 
3469 // ------------------------------ InterleaveUpper (UpperHalf)
3470 
3471 // All functions inside detail lack the required D parameter.
3472 namespace detail {
3473 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
3474 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
3475 
3476 #if HWY_ARCH_ARM_A64
3477 // N=1 makes no sense (in that case, there would be no upper/lower).
3478 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
3479  const Vec128<uint64_t> b) {
3480  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
3481 }
3482 HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
3483  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
3484 }
3485 HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
3486  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
3487 }
3488 #else
3489 // ARMv7 emulation.
3491  const Vec128<uint64_t> b) {
3492  return CombineShiftRightBytes<8>(Full128<uint64_t>(), Shuffle01(b), a);
3493 }
3495  return CombineShiftRightBytes<8>(Full128<int64_t>(), Shuffle01(b), a);
3496 }
3497 #endif
3498 
3500  return Vec128<float>(vzip2q_f32(a.raw, b.raw));
3501 }
3503  const Vec128<float, 2> b) {
3504  return Vec128<float, 2>(vzip2_f32(a.raw, b.raw));
3505 }
3506 
3507 } // namespace detail
3508 
3509 // Full register
3510 template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>
3511 HWY_API V InterleaveUpper(Simd<T, N> /* tag */, V a, V b) {
3512  return detail::InterleaveUpper(a, b);
3513 }
3514 
3515 // Partial
3516 template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
3517 HWY_API V InterleaveUpper(Simd<T, N> d, V a, V b) {
3518  const Half<decltype(d)> d2;
3519  return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
3520 }
3521 
3522 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3523 
3524 // Same as Interleave*, except that the return lanes are double-width integers;
3525 // this is necessary because the single-lane scalar cannot return two values.
3526 template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>
3527 HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b) {
3528  return BitCast(DW(), InterleaveLower(a, b));
3529 }
3530 template <typename T, size_t N, class D = Simd<T, N>,
3531  class DW = RepartitionToWide<D>>
3532 HWY_API VFromD<DW> ZipLower(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
3533  return BitCast(dw, InterleaveLower(D(), a, b));
3534 }
3535 
3536 template <typename T, size_t N, class D = Simd<T, N>,
3537  class DW = RepartitionToWide<D>>
3538 HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
3539  return BitCast(dw, InterleaveUpper(D(), a, b));
3540 }
3541 
3542 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3543 
3544 template <size_t N>
3545 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32,
3546  Vec128<bfloat16_t, 2 * N> a,
3547  Vec128<bfloat16_t, 2 * N> b,
3548  const Vec128<float, N> sum0,
3549  Vec128<float, N>& sum1) {
3550  const Repartition<uint16_t, decltype(df32)> du16;
3551  const RebindToUnsigned<decltype(df32)> du32;
3552  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
3553  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
3554  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3555  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
3556  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3557  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3558  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3559 }
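// Sketch of a bf16 dot product using the op above (illustrative only; pa and
// pb are example inputs, not part of this header):
#if 0
const Simd<float, 4> df32;
const Simd<bfloat16_t, 8> dbf16;
alignas(16) bfloat16_t pa[8], pb[8];  // filled elsewhere
Vec128<float, 4> sum1 = Zero(df32);
const Vec128<float, 4> sum0 = ReorderWidenMulAccumulate(
    df32, LoadU(dbf16, pa), LoadU(dbf16, pb), Zero(df32), sum1);
// Lane order is "reordered", but the total of sum0 + sum1 is the dot product.
#endif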
3560 
3561 // ================================================== COMBINE
3562 
3563 // ------------------------------ Combine (InterleaveLower)
3564 
3565 // Full result
3568  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
3569 }
3572  Vec128<uint16_t, 4> lo) {
3573  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
3574 }
3577  Vec128<uint32_t, 2> lo) {
3578  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
3579 }
3582  Vec128<uint64_t, 1> lo) {
3583  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
3584 }
3585 
3587  Vec128<int8_t, 8> lo) {
3588  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
3589 }
3592  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
3593 }
3596  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
3597 }
3600  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
3601 }
3602 
3604  Vec128<float, 2> lo) {
3605  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
3606 }
3607 #if HWY_ARCH_ARM_A64
3608 HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec128<double, 1> hi,
3609  Vec128<double, 1> lo) {
3610  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
3611 }
3612 #endif
3613 
3614 // < 64bit input, <= 64 bit result
3615 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3617  Vec128<T, N / 2> lo) {
3618  // First double N (only lower halves will be used).
3619  const Vec128<T, N> hi2(hi.raw);
3620  const Vec128<T, N> lo2(lo.raw);
3621  // Repartition to two unsigned lanes (each the size of the valid input).
3622  const Simd<UnsignedFromSize<N * sizeof(T) / 2>, 2> du;
3623  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
3624 }
3625 
3626 // ------------------------------ ZeroExtendVector (Combine)
3627 
3628 template <typename T, size_t N>
3630  return Combine(d, Zero(Half<decltype(d)>()), lo);
3631 }
3632 
3633 // ------------------------------ ConcatLowerLower
3634 
3635 // 64 or 128-bit input: just interleave
3636 template <typename T, size_t N, HWY_IF_GE64(T, N)>
3638  Vec128<T, N> lo) {
3639  // Treat half-width input as a single lane and interleave them.
3640  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3641  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
3642 }
3643 
3644 #if HWY_ARCH_ARM_A64
3645 namespace detail {
3646 
3647 HWY_INLINE Vec128<uint8_t, 2> ConcatEven(Vec128<uint8_t, 2> hi,
3648  Vec128<uint8_t, 2> lo) {
3649  return Vec128<uint8_t, 2>(vtrn1_u8(lo.raw, hi.raw));
3650 }
3651 HWY_INLINE Vec128<uint16_t, 2> ConcatEven(Vec128<uint16_t, 2> hi,
3652  Vec128<uint16_t, 2> lo) {
3653  return Vec128<uint16_t, 2>(vtrn1_u16(lo.raw, hi.raw));
3654 }
3655 
3656 } // namespace detail
3657 
3658 // <= 32-bit input/output
3659 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3660 HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N> d, Vec128<T, N> hi,
3661  Vec128<T, N> lo) {
3662  // Treat half-width input as two lanes and take every second one.
3663  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3664  return BitCast(d, detail::ConcatEven(BitCast(du, hi), BitCast(du, lo)));
3665 }
3666 
3667 #else
3668 
3669 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3670 HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N> d, Vec128<T, N> hi,
3671  Vec128<T, N> lo) {
3672  const Half<decltype(d)> d2;
3673  return Combine(LowerHalf(d2, hi), LowerHalf(d2, lo));
3674 }
3675 #endif // HWY_ARCH_ARM_A64
3676 
3677 // ------------------------------ ConcatUpperUpper
3678 
3679 // 64 or 128-bit input: just interleave
3680 template <typename T, size_t N, HWY_IF_GE64(T, N)>
3682  Vec128<T, N> lo) {
3683  // Treat half-width input as a single lane and interleave them.
3684  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3685  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
3686 }
3687 
3688 #if HWY_ARCH_ARM_A64
3689 namespace detail {
3690 
3691 HWY_INLINE Vec128<uint8_t, 2> ConcatOdd(Vec128<uint8_t, 2> hi,
3692  Vec128<uint8_t, 2> lo) {
3693  return Vec128<uint8_t, 2>(vtrn2_u8(lo.raw, hi.raw));
3694 }
3695 HWY_INLINE Vec128<uint16_t, 2> ConcatOdd(Vec128<uint16_t, 2> hi,
3696  Vec128<uint16_t, 2> lo) {
3697  return Vec128<uint16_t, 2>(vtrn2_u16(lo.raw, hi.raw));
3698 }
3699 
3700 } // namespace detail
3701 
3702 // <= 32-bit input/output
3703 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3704 HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N> d, Vec128<T, N> hi,
3705  Vec128<T, N> lo) {
3706  // Treat half-width input as two lanes and take every second one.
3707  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
3708  return BitCast(d, detail::ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
3709 }
3710 
3711 #else
3712 
3713 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3714 HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N> d, Vec128<T, N> hi,
3715  Vec128<T, N> lo) {
3716  const Half<decltype(d)> d2;
3717  return Combine(UpperHalf(d2, hi), UpperHalf(d2, lo));
3718 }
3719 
3720 #endif // HWY_ARCH_ARM_A64
3721 
3722 // ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
3723 
3724 // 64 or 128-bit input: extract from concatenated
3725 template <typename T, size_t N, HWY_IF_GE64(T, N)>
3727  Vec128<T, N> lo) {
3728  return CombineShiftRightBytes<N * sizeof(T) / 2>(d, hi, lo);
3729 }
3730 
3731 // <= 32-bit input/output
3732 template <typename T, size_t N, HWY_IF_LE32(T, N)>
3733 HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N> d, Vec128<T, N> hi,
3734  Vec128<T, N> lo) {
3735  constexpr size_t kSize = N * sizeof(T);
3736  const Repartition<uint8_t, decltype(d)> d8;
3737  const Simd<uint8_t, 8> d8x8;
3738  const Simd<T, 8 / sizeof(T)> d64;
3739  using V8x8 = VFromD<decltype(d8x8)>;
3740  const V8x8 hi8x8(BitCast(d8, hi).raw);
3741  // Move into most-significant bytes
3742  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
3743  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
3744  // Back to original lane type, then shrink N.
3745  return Vec128<T, N>(BitCast(d64, r).raw);
3746 }
3747 
3748 // ------------------------------ ConcatUpperLower
3749 
3750 // Works for all N.
3751 template <typename T, size_t N>
3753  Vec128<T, N> lo) {
3754  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
3755 }
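// i.e. the result takes its lower half from |lo| and its upper half from |hi|;
// FirstN(d, Lanes(d) / 2) is true exactly for the lower half of the lanes.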
3756 
3757 // ------------------------------ ConcatOdd (InterleaveUpper)
3758 
3759 // 32-bit full
3762  return Vec128<uint32_t>(vuzp2q_u32(lo.raw, hi.raw));
3763 }
3766  return Vec128<int32_t>(vuzp2q_s32(lo.raw, hi.raw));
3767 }
3769  Vec128<float> lo) {
3770  return Vec128<float>(vuzp2q_f32(lo.raw, hi.raw));
3771 }
3772 
3773 // 32-bit partial
3774 template <size_t N, HWY_IF_LE64(uint32_t, N)>
3777  Vec128<uint32_t, N> lo) {
3778  return Vec128<uint32_t, N>(vuzp2_u32(lo.raw, hi.raw));
3779 }
3780 template <size_t N, HWY_IF_LE64(int32_t, N)>
3782  Vec128<int32_t, N> hi,
3783  Vec128<int32_t, N> lo) {
3784  return Vec128<int32_t, N>(vuzp2_s32(lo.raw, hi.raw));
3785 }
3786 template <size_t N, HWY_IF_LE64(float, N)>
3789  return Vec128<float, N>(vuzp2_f32(lo.raw, hi.raw));
3790 }
3791 
3792 // 64-bit full - no partial because we need at least two inputs to have
3793 // even/odd. ARMv7 lacks vuzpq_u64, and it's anyway the same as InterleaveUpper.
3794 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3796  return InterleaveUpper(d, lo, hi);
3797 }
3798 
3799 // ------------------------------ ConcatEven (InterleaveLower)
3800 
3801 // 32-bit full
3804  return Vec128<uint32_t>(vuzp1q_u32(lo.raw, hi.raw));
3805 }
3808  return Vec128<int32_t>(vuzp1q_s32(lo.raw, hi.raw));
3809 }
3811  Vec128<float> lo) {
3812  return Vec128<float>(vuzp1q_f32(lo.raw, hi.raw));
3813 }
3814 
3815 // 32-bit partial
3816 template <size_t N, HWY_IF_LE64(uint32_t, N)>
3817 HWY_API Vec128<uint32_t, N> ConcatEven(Simd<uint32_t, N> /* tag */,
3818  Vec128<uint32_t, N> hi,
3819  Vec128<uint32_t, N> lo) {
3820  return Vec128<uint32_t, N>(vuzp1_u32(lo.raw, hi.raw));
3821 }
3822 template <size_t N, HWY_IF_LE64(int32_t, N)>
3823 HWY_API Vec128<int32_t, N> ConcatEven(Simd<int32_t, N> /* tag */,
3824  Vec128<int32_t, N> hi,
3825  Vec128<int32_t, N> lo) {
3826  return Vec128<int32_t, N>(vuzp1_s32(lo.raw, hi.raw));
3827 }
3828 template <size_t N, HWY_IF_LE64(float, N)>
3829 HWY_API Vec128<float, N> ConcatEven(Simd<float, N> /* tag */,
3830  Vec128<float, N> hi, Vec128<float, N> lo) {
3831  return Vec128<float, N>(vuzp1_f32(lo.raw, hi.raw));
3832 }
3833 
3834 // 64-bit full - no partial because we need at least two inputs to have
3835 // even/odd. ARMv7 lacks vuzpq_u64, and it's anyway the same as InterleaveLower.
3836 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3837 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3838  return InterleaveLower(d, lo, hi);
3839 }
3840 
3841 // ------------------------------ OddEven (IfThenElse)
3842 
3843 template <typename T, size_t N>
3844 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
3845  const Simd<T, N> d;
3846  const Repartition<uint8_t, decltype(d)> d8;
3847  alignas(16) constexpr uint8_t kBytes[16] = {
3848  ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
3849  ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
3850  ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
3851  ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
3852  ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
3853  ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
3854  ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
3855  ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
3856  };
3857  const auto vec = BitCast(d, Load(d8, kBytes));
3858  return IfThenElse(MaskFromVec(vec), b, a);
3859 }
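// Usage sketch (hypothetical helper, not from the original file): OddEven
// keeps odd lanes from `a` and even lanes from `b`, matching the byte mask
// built above.
inline Vec128<int32_t> OddEvenExample() {
  const Full128<int32_t> d;
  const Vec128<int32_t> a = Iota(d, 1);   // {1, 2, 3, 4}
  const Vec128<int32_t> b = Zero(d) - a;  // {-1, -2, -3, -4}
  return OddEven(a, b);                   // {-1, 2, -3, 4}
}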
3860 
3861 // ================================================== CRYPTO
3862 
3863 #if defined(__ARM_FEATURE_AES)
3864 
3865 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3866 #ifdef HWY_NATIVE_AES
3867 #undef HWY_NATIVE_AES
3868 #else
3869 #define HWY_NATIVE_AES
3870 #endif
3871 
3872 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
3873  Vec128<uint8_t> round_key) {
3874  // NOTE: it is important that AESE and AESMC be consecutive instructions so
3875  // they can be fused. AESE includes AddRoundKey, which is a different ordering
3876  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
3877  // round key (the compiler will hopefully optimize this for multiple rounds).
3878  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
3879  round_key;
3880 }
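// Usage sketch (hypothetical helper, not from the original file): chaining
// AESRound with per-round keys. Each call emits fused AESE+AESMC with a zero
// key and XORs the actual round key afterwards, as described above.
inline Vec128<uint8_t> TwoAESRounds(Vec128<uint8_t> state,
                                    Vec128<uint8_t> round_key0,
                                    Vec128<uint8_t> round_key1) {
  state = AESRound(state, round_key0);
  state = AESRound(state, round_key1);
  return state;
}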
3881 
3882 HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
3883  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
3884 }
3885 
3886 HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
3887  return Vec128<uint64_t>(
3888  (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
3889 }
3890 
3891 #endif // __ARM_FEATURE_AES
3892 
3893 // ================================================== MISC
3894 
3895 template <size_t N>
3896 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> df32,
3897  const Vec128<bfloat16_t, N> v) {
3898  const Rebind<uint16_t, decltype(df32)> du16;
3899  const RebindToSigned<decltype(df32)> di32;
3900  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3901 }
3902 
3903 // ------------------------------ MulEven (ConcatEven)
3904 
3905 // Multiplies even lanes (0, 2, ...) and places the lower half of the
3906 // double-wide result in the even lane and the upper half in its odd neighbor.
3907 HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
3908  const Full128<int32_t> d;
3909  int32x4_t a_packed = ConcatEven(d, a, a).raw;
3910  int32x4_t b_packed = ConcatEven(d, b, b).raw;
3911  return Vec128<int64_t>(
3912  vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
3913 }
3914 HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
3915  const Full128<uint32_t> d;
3916  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
3917  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
3918  return Vec128<uint64_t>(
3919  vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
3920 }
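// Worked example (hypothetical helper, not from the original file): MulEven
// multiplies only lanes 0 and 2. For a = {2, 3, 4, 5} and b = {10, 11, 12, 13}
// (uint32 lanes), the result is the two uint64 lanes {2 * 10, 4 * 12}.
inline Vec128<uint64_t> MulEvenExample() {
  const Full128<uint32_t> d;
  const Vec128<uint32_t> a = Iota(d, 2u);   // {2, 3, 4, 5}
  const Vec128<uint32_t> b = Iota(d, 10u);  // {10, 11, 12, 13}
  return MulEven(a, b);                     // {20, 48}
}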
3921 
3922 template <size_t N>
3923 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
3924  const Vec128<int32_t, N> b) {
3925  const Simd<int32_t, N> d;
3926  int32x2_t a_packed = ConcatEven(d, a, a).raw;
3927  int32x2_t b_packed = ConcatEven(d, b, b).raw;
3928  return Vec128<int64_t, (N + 1) / 2>(
3929  vget_low_s64(vmull_s32(a_packed, b_packed)));
3930 }
3931 template <size_t N>
3932 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
3933  const Vec128<uint32_t, N> b) {
3934  const Simd<uint32_t, N> d;
3935  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
3936  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
3937  return Vec128<uint64_t, (N + 1) / 2>(
3938  vget_low_u64(vmull_u32(a_packed, b_packed)));
3939 }
3940 
3941 HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
3942  uint64_t hi;
3943  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
3944  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
3945 }
3946 
3947 HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
3948  uint64_t hi;
3949  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
3950  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
3951 }
3952 
3953 // ------------------------------ TableLookupBytes (Combine, LowerHalf)
3954 
3955 // Both full
3956 template <typename T, typename TI>
3957 HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes,
3958  const Vec128<TI> from) {
3959  const Full128<TI> d;
3960  const Repartition<uint8_t, decltype(d)> d8;
3961 #if HWY_ARCH_ARM_A64
3962  return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
3963  BitCast(d8, from).raw)));
3964 #else
3965  uint8x16_t table0 = BitCast(d8, bytes).raw;
3966  uint8x8x2_t table;
3967  table.val[0] = vget_low_u8(table0);
3968  table.val[1] = vget_high_u8(table0);
3969  uint8x16_t idx = BitCast(d8, from).raw;
3970  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
3971  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
3972  return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
3973 #endif
3974 }
3975 
3976 // Partial index vector
3977 template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
3978 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T> bytes,
3979  const Vec128<TI, NI> from) {
3980  const Full128<TI> d_full;
3981  const Vec128<TI, 8 / sizeof(T)> from64(from.raw);
3982  const auto idx_full = Combine(d_full, from64, from64);
3983  const auto out_full = TableLookupBytes(bytes, idx_full);
3984  return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
3985 }
3986 
3987 // Partial table vector
3988 template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>
3989 HWY_API Vec128<TI> TableLookupBytes(const Vec128<T, N> bytes,
3990  const Vec128<TI> from) {
3991  const Full128<T> d_full;
3992  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
3993 }
3994 
3995 // Partial both
3996 template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
3997  HWY_IF_LE64(TI, NI)>
3998 HWY_API Vec128<TI, NI> TableLookupBytes(
3999  Vec128<T, N> bytes, Vec128<TI, NI> from) {
4000  const Simd<T, N> d;
4001  const Simd<TI, NI> d_idx;
4002  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
4003  // uint8x8
4004  const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
4005  const auto from8 = BitCast(d_idx8, from);
4006  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4007  return BitCast(d_idx, v8);
4008 }
4009 
4010 // For all vector widths; ARM already zeroes lanes whose byte index is >= 0x10.
4011 template <class V, class VI>
4012 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
4013  return TableLookupBytes(bytes, from);
4014 }
4015 
4016 // ------------------------------ Scatter (Store)
4017 
4018 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
4019 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
4020  const Vec128<Offset, N> offset) {
4021  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4022 
4023  alignas(16) T lanes[N];
4024  Store(v, d, lanes);
4025 
4026  alignas(16) Offset offset_lanes[N];
4027  Store(offset, Simd<Offset, N>(), offset_lanes);
4028 
4029  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
4030  for (size_t i = 0; i < N; ++i) {
4031  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
4032  }
4033 }
4034 
4035 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
4036 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
4037  const Vec128<Index, N> index) {
4038  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4039 
4040  alignas(16) T lanes[N];
4041  Store(v, d, lanes);
4042 
4043  alignas(16) Index index_lanes[N];
4044  Store(index, Simd<Index, N>(), index_lanes);
4045 
4046  for (size_t i = 0; i < N; ++i) {
4047  base[index_lanes[i]] = lanes[i];
4048  }
4049 }
4050 
4051 // ------------------------------ Gather (Load/Store)
4052 
4053 template <typename T, size_t N, typename Offset>
4054 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
4055  const T* HWY_RESTRICT base,
4056  const Vec128<Offset, N> offset) {
4057  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
4058 
4059  alignas(16) Offset offset_lanes[N];
4060  Store(offset, Simd<Offset, N>(), offset_lanes);
4061 
4062  alignas(16) T lanes[N];
4063  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
4064  for (size_t i = 0; i < N; ++i) {
4065  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
4066  }
4067  return Load(d, lanes);
4068 }
4069 
4070 template <typename T, size_t N, typename Index>
4071 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
4072  const Vec128<Index, N> index) {
4073  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
4074 
4075  alignas(16) Index index_lanes[N];
4076  Store(index, Simd<Index, N>(), index_lanes);
4077 
4078  alignas(16) T lanes[N];
4079  for (size_t i = 0; i < N; ++i) {
4080  lanes[i] = base[index_lanes[i]];
4081  }
4082  return Load(d, lanes);
4083 }
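// Usage sketch (hypothetical helper, not from the original file): GatherIndex
// takes lane indices, whereas GatherOffset takes byte offsets, i.e. index i
// corresponds to offset i * sizeof(T). `base` must have at least 7 readable
// elements here because the largest index is 6.
inline Vec128<int32_t> GatherEveryOther(const int32_t* HWY_RESTRICT base) {
  const Full128<int32_t> d;
  const Vec128<int32_t> index = Iota(d, 0) + Iota(d, 0);  // {0, 2, 4, 6}
  return GatherIndex(d, base, index);  // {base[0], base[2], base[4], base[6]}
}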
4084 
4085 // ------------------------------ Reductions
4086 
4087 namespace detail {
4088 
4089 // N=1 for any T: no-op
4090 template <typename T>
4091 HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
4092  return v;
4093 }
4094 template <typename T>
4095 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4096  const Vec128<T, 1> v) {
4097  return v;
4098 }
4099 template <typename T>
4100 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4101  const Vec128<T, 1> v) {
4102  return v;
4103 }
4104 
4105 // u32/i32/f32: N=2
4106 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4107 HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
4108  return v10 + Shuffle2301(v10);
4109 }
4110 template <typename T>
4111 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4112  const Vec128<T, 2> v10) {
4113  return Min(v10, Shuffle2301(v10));
4114 }
4115 template <typename T>
4116 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
4117  const Vec128<T, 2> v10) {
4118  return Max(v10, Shuffle2301(v10));
4119 }
4120 
4121 // full vectors
4122 #if HWY_ARCH_ARM_A64
4123 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4124  return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
4125 }
4126 HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4127  return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
4128 }
4129 HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4130  return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
4131 }
4132 HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4133  return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
4134 }
4135 HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4136  return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
4137 }
4138 HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
4139  return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
4140 }
4141 #else
4142 // ARMv7 version for everything except doubles.
4143 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
4144  uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
4145  uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
4146  uint32x4x2_t v1 = vuzpq_u32(c0, c0);
4147  return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
4148 }
4149 HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
4150  int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
4151  int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
4152  int32x4x2_t v1 = vuzpq_s32(c0, c0);
4153  return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
4154 }
4155 HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
4156  float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
4157  float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
4158  float32x4x2_t v1 = vuzpq_f32(c0, c0);
4159  return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
4160 }
4161 HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
4162  return v + Shuffle01(v);
4163 }
4164 HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
4165  return v + Shuffle01(v);
4166 }
4167 #endif
4168 
4169 template <typename T>
4170 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4171  const Vec128<T> v3210) {
4172  const Vec128<T> v1032 = Shuffle1032(v3210);
4173  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
4174  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4175  return Min(v20_31_20_31, v31_20_31_20);
4176 }
4177 template <typename T>
4178 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4179  const Vec128<T> v3210) {
4180  const Vec128<T> v1032 = Shuffle1032(v3210);
4181  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
4182  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
4183  return Max(v20_31_20_31, v31_20_31_20);
4184 }
4185 
4186 // For u64/i64[/f64].
4187 template <typename T>
4188 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4189  const Vec128<T> v10) {
4190  const Vec128<T> v01 = Shuffle01(v10);
4191  return Min(v10, v01);
4192 }
4193 template <typename T>
4194 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4195  const Vec128<T> v10) {
4196  const Vec128<T> v01 = Shuffle01(v10);
4197  return Max(v10, v01);
4198 }
4199 
4200 } // namespace detail
4201 
4202 template <typename T, size_t N>
4203 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
4204  return detail::SumOfLanes(v);
4205 }
4206 template <typename T, size_t N>
4207 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
4208  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4209 }
4210 template <typename T, size_t N>
4211 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
4212  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
4213 }
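// Note on semantics (hypothetical helper, not from the original file): the
// reductions return a vector with the result broadcast to every lane, so a
// scalar value is obtained via GetLane.
inline float SumOfLanesExample() {
  const Full128<float> d;
  const Vec128<float> v = Iota(d, 1.0f);  // {1, 2, 3, 4}
  return GetLane(SumOfLanes(d, v));       // 10.0f (the sum, in every lane)
}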
4214 
4215 // ------------------------------ LoadMaskBits (TestBit)
4216 
4217 namespace detail {
4218 
4219 // Helper function to set 64 bits and potentially return a smaller vector. The
4220 // overload is required to call the q vs non-q intrinsics. Note that 8-bit
4221 // LoadMaskBits only requires 16 bits, but 64 avoids casting.
4222 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4223 HWY_INLINE Vec128<T, N> Set64(Simd<T, N> /* tag */, uint64_t mask_bits) {
4224  const auto v64 = Vec128<uint64_t, 1>(vdup_n_u64(mask_bits));
4225  return Vec128<T, N>(BitCast(Simd<T, 8 / sizeof(T)>(), v64).raw);
4226 }
4227 template <typename T>
4228 HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
4229  return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
4230 }
4231 
4232 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4233 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4234  const RebindToUnsigned<decltype(d)> du;
4235  // Easier than Set(), which would require an >8-bit type, which would not
4236  // compile for T=uint8_t, N=1.
4237  const auto vmask_bits = Set64(du, mask_bits);
4238 
4239  // Replicate bytes 8x such that each byte contains the bit that governs it.
4240  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4241  1, 1, 1, 1, 1, 1, 1, 1};
4242  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
4243 
4244  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4245  1, 2, 4, 8, 16, 32, 64, 128};
4246  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4247 }
4248 
4249 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4250 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4251  const RebindToUnsigned<decltype(d)> du;
4252  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4253  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4254  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4255 }
4256 
4257 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4258 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4259  const RebindToUnsigned<decltype(d)> du;
4260  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
4261  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4262  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4263 }
4264 
4265 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4266 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4267  const RebindToUnsigned<decltype(d)> du;
4268  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
4269  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4270 }
4271 
4272 } // namespace detail
4273 
4274 // `bits` points to at least 8 readable bytes, not all of which need be valid.
4275 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4276 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d,
4277  const uint8_t* HWY_RESTRICT bits) {
4278  uint64_t mask_bits = 0;
4279  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
4280  return detail::LoadMaskBits(d, mask_bits);
4281 }
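// Usage sketch (hypothetical helper, not from the original file): bit i of the
// byte array controls lane i, so 0x05 activates lanes 0 and 2 of a 4-lane
// vector. Per the comment above, at least 8 readable bytes must be provided.
inline Vec128<int32_t> LoadMaskBitsExample() {
  const Full128<int32_t> d;
  alignas(8) const uint8_t bits[8] = {0x05};  // remaining bytes are zero
  const auto mask = LoadMaskBits(d, bits);
  return IfThenElseZero(mask, Set(d, 1));     // {1, 0, 1, 0}
}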
4282 
4283 // ------------------------------ Mask
4284 
4285 namespace detail {
4286 
4287 template <typename T>
4288 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4289  const Mask128<T> mask) {
4290  alignas(16) constexpr uint8_t kSliceLanes[16] = {
4291  1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
4292  };
4293  const Full128<uint8_t> du;
4294  const Vec128<uint8_t> values =
4295  BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
4296 
4297 #if HWY_ARCH_ARM_A64
4298  // Can't vaddv - we need two separate bytes (16 bits).
4299  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
4300  const uint8x8_t x4 = vpadd_u8(x2, x2);
4301  const uint8x8_t x8 = vpadd_u8(x4, x4);
4302  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
4303 #else
4304  // Don't have vpaddq, so keep doubling lane size.
4305  const uint16x8_t x2 = vpaddlq_u8(values.raw);
4306  const uint32x4_t x4 = vpaddlq_u16(x2);
4307  const uint64x2_t x8 = vpaddlq_u32(x4);
4308  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
4309 #endif
4310 }
4311 
4312 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4313 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4314  const Mask128<T, N> mask) {
4315  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
4316  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
4317  alignas(8) constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
4318  0x10, 0x20, 0x40, 0x80};
4319  const Simd<T, N> d;
4320  const Simd<uint8_t, N> du;
4321  const Vec128<uint8_t, N> slice(Load(Simd<uint8_t, 8>(), kSliceLanes).raw);
4322  const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
4323 
4324 #if HWY_ARCH_ARM_A64
4325  return vaddv_u8(values.raw);
4326 #else
4327  const uint16x4_t x2 = vpaddl_u8(values.raw);
4328  const uint32x2_t x4 = vpaddl_u16(x2);
4329  const uint64x1_t x8 = vpaddl_u32(x4);
4330  return vget_lane_u64(x8, 0);
4331 #endif
4332 }
4333 
4334 template <typename T>
4335 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4336  const Mask128<T> mask) {
4337  alignas(16) constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
4338  0x10, 0x20, 0x40, 0x80};
4339  const Full128<T> d;
4340  const Full128<uint16_t> du;
4341  const Vec128<uint16_t> values =
4342  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
4343 #if HWY_ARCH_ARM_A64
4344  return vaddvq_u16(values.raw);
4345 #else
4346  const uint32x4_t x2 = vpaddlq_u16(values.raw);
4347  const uint64x2_t x4 = vpaddlq_u32(x2);
4348  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
4349 #endif
4350 }
4351 
4352 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4353 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4354  const Mask128<T, N> mask) {
4355  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
4356  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
4357  alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
4358  const Simd<T, N> d;
4359  const Simd<uint16_t, N> du;
4360  const Vec128<uint16_t, N> slice(Load(Simd<uint16_t, 4>(), kSliceLanes).raw);
4361  const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
4362 #if HWY_ARCH_ARM_A64
4363  return vaddv_u16(values.raw);
4364 #else
4365  const uint32x2_t x2 = vpaddl_u16(values.raw);
4366  const uint64x1_t x4 = vpaddl_u32(x2);
4367  return vget_lane_u64(x4, 0);
4368 #endif
4369 }
4370 
4371 template <typename T>
4372 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4373  const Mask128<T> mask) {
4374  alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
4375  const Full128<T> d;
4376  const Full128<uint32_t> du;
4377  const Vec128<uint32_t> values =
4378  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
4379 #if HWY_ARCH_ARM_A64
4380  return vaddvq_u32(values.raw);
4381 #else
4382  const uint64x2_t x2 = vpaddlq_u32(values.raw);
4383  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
4384 #endif
4385 }
4386 
4387 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4388 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4389  const Mask128<T, N> mask) {
4390  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
4391  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
4392  alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
4393  const Simd<T, N> d;
4394  const Simd<uint32_t, N> du;
4395  const Vec128<uint32_t, N> slice(Load(Simd<uint32_t, 2>(), kSliceLanes).raw);
4396  const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
4397 #if HWY_ARCH_ARM_A64
4398  return vaddv_u32(values.raw);
4399 #else
4400  const uint64x1_t x2 = vpaddl_u32(values.raw);
4401  return vget_lane_u64(x2, 0);
4402 #endif
4403 }
4404 
4405 template <typename T>
4406 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
4407  alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
4408  const Full128<T> d;
4409  const Full128<uint64_t> du;
4410  const Vec128<uint64_t> values =
4411  BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
4412 #if HWY_ARCH_ARM_A64
4413  return vaddvq_u64(values.raw);
4414 #else
4415  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
4416 #endif
4417 }
4418 
4419 template <typename T>
4420 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
4421  const Mask128<T, 1> m) {
4422  const Simd<T, 1> d;
4423  const Simd<uint64_t, 1> du;
4424  const Vec128<uint64_t, 1> values =
4425  BitCast(du, VecFromMask(d, m)) & Set(du, 1);
4426  return vget_lane_u64(values.raw, 0);
4427 }
4428 
4429 // Returns only the lowest N bits of the BitsFromMask result.
4430 template <typename T, size_t N>
4431 constexpr uint64_t OnlyActive(uint64_t bits) {
4432  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
4433 }
4434 
4435 template <typename T, size_t N>
4436 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
4437  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
4438 }
4439 
4440 // Returns number of lanes whose mask is set.
4441 //
4442 // Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
4443 // ("vsubv"). ANDing with 1 would work but requires a constant. Negating instead
4444 // converts each lane to 1 (if the mask is set) or 0, so summing yields the count.
4445 
4446 template <typename T>
4447 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
4448  const Full128<int8_t> di;
4449  const int8x16_t ones =
4450  vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4451 
4452 #if HWY_ARCH_ARM_A64
4453  return static_cast<size_t>(vaddvq_s8(ones));
4454 #else
4455  const int16x8_t x2 = vpaddlq_s8(ones);
4456  const int32x4_t x4 = vpaddlq_s16(x2);
4457  const int64x2_t x8 = vpaddlq_s32(x4);
4458  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
4459 #endif
4460 }
4461 template <typename T>
4462 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
4463  const Full128<int16_t> di;
4464  const int16x8_t ones =
4465  vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4466 
4467 #if HWY_ARCH_ARM_A64
4468  return static_cast<size_t>(vaddvq_s16(ones));
4469 #else
4470  const int32x4_t x2 = vpaddlq_s16(ones);
4471  const int64x2_t x4 = vpaddlq_s32(x2);
4472  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
4473 #endif
4474 }
4475 
4476 template <typename T>
4477 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
4478  const Full128<int32_t> di;
4479  const int32x4_t ones =
4480  vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4481 
4482 #if HWY_ARCH_ARM_A64
4483  return static_cast<size_t>(vaddvq_s32(ones));
4484 #else
4485  const int64x2_t x2 = vpaddlq_s32(ones);
4486  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
4487 #endif
4488 }
4489 
4490 template <typename T>
4491 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
4492 #if HWY_ARCH_ARM_A64
4493  const Full128<int64_t> di;
4494  const int64x2_t ones =
4495  vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
4496  return static_cast<size_t>(vaddvq_s64(ones));
4497 #else
4498  const Full128<uint64_t> du;
4499  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
4500  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
4501  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
4502 #endif
4503 }
4504 
4505 } // namespace detail
4506 
4507 // Full
4508 template <typename T>
4509 HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
4510  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
4511 }
4512 
4513 // Partial
4514 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4515 HWY_API size_t CountTrue(Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4516  return PopCount(detail::BitsFromMask(mask));
4517 }
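// Usage sketch (hypothetical helper, not from the original file), tying into
// the negation trick above: each active lane (0xFF == -1) is negated to +1 and
// the lanes are summed, so three active lanes yield a count of 3.
inline size_t CountTrueExample() {
  const Full128<uint8_t> d;
  const auto mask = FirstN(d, 3);  // lanes 0..2 active
  return CountTrue(d, mask);       // 3
}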
4518 
4519 template <typename T, size_t N>
4520 HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
4521  const Mask128<T, N> mask) {
4522  const uint64_t bits = detail::BitsFromMask(mask);
4523  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
4524 }
4525 
4526 // `bits` points to at least 8 writable bytes.
4527 template <typename T, size_t N>
4528 HWY_API size_t StoreMaskBits(Simd<T, N> /* tag */, const Mask128<T, N> mask,
4529  uint8_t* bits) {
4530  const uint64_t mask_bits = detail::BitsFromMask(mask);
4531  const size_t kNumBytes = (N + 7) / 8;
4532  CopyBytes<kNumBytes>(&mask_bits, bits);
4533  return kNumBytes;
4534 }
4535 
4536 // Full
4537 template <typename T>
4538 HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
4539 #if HWY_ARCH_ARM_A64
4540  const Full128<uint32_t> d32;
4541  const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(d, m)));
4542  return (vmaxvq_u32(m32.raw) == 0);
4543 #else
4544  const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(d, m));
4545  uint32x2_t a = vqmovn_u64(v64.raw);
4546  return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
4547 #endif
4548 }
4549 
4550 // Partial
4551 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4552 HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> m) {
4553  return detail::BitsFromMask(m) == 0;
4554 }
4555 
4556 template <typename T, size_t N>
4557 HWY_API bool AllTrue(const Simd<T, N> d, const Mask128<T, N> m) {
4558  return AllFalse(VecFromMask(d, m) == Zero(d));
4559 }
4560 
4561 // ------------------------------ Compress
4562 
4563 namespace detail {
4564 
4565 // Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
4566 HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /* tag */,
4567  const uint8_t* bytes) {
4568  return Vec128<uint8_t>(vreinterpretq_u8_u64(
4569  vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
4570 }
4571 
4572 // Load 8 bytes and return half-reg with N <= 8 bytes.
4573 template <size_t N, HWY_IF_LE64(uint8_t, N)>
4574 HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N> d,
4575  const uint8_t* bytes) {
4576  return Load(d, bytes);
4577 }
4578 
4579 template <typename T, size_t N>
4580 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
4581  const uint64_t mask_bits) {
4582  HWY_DASSERT(mask_bits < 256);
4583  const Simd<T, N> d;
4584  const Repartition<uint8_t, decltype(d)> d8;
4585  const Simd<uint16_t, N> du;
4586 
4587  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
4588  // indices for VTBL (one vector's worth for each of 256 combinations of
4589  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
4590  // store lane indices and convert to byte indices (2*lane + 0..1), with the
4591  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
4592  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
4593  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
4594  // is likely more costly than the higher cache footprint from storing bytes.
4595  alignas(16) constexpr uint8_t table[256 * 8] = {
4596  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
4597  0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
4598  0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4599  0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
4600  0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
4601  6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
4602  0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
4603  0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
4604  2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
4605  0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
4606  0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
4607  0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
4608  0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
4609  6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
4610  8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
4611  0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
4612  4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
4613  10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
4614  0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
4615  0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
4616  0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
4617  4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
4618  0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
4619  0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
4620  2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
4621  10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
4622  0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
4623  0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
4624  0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
4625  0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
4626  0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
4627  0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
4628  6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
4629  12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
4630  0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
4631  0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
4632  0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
4633  8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
4634  0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
4635  0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
4636  2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
4637  8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
4638  12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
4639  0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
4640  0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
4641  10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
4642  12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
4643  0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
4644  4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
4645  6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
4646  0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
4647  0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
4648  0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
4649  4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
4650  12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
4651  0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
4652  2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
4653  0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
4654  0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
4655  0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
4656  0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
4657  14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
4658  0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
4659  0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
4660  8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
4661  14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
4662  0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
4663  0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
4664  0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
4665  6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
4666  14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
4667  0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
4668  2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
4669  14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
4670  0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
4671  0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
4672  0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
4673  6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
4674  10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
4675  0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
4676  4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
4677  8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
4678  0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
4679  0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
4680  0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
4681  4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
4682  0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
4683  0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
4684  2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
4685  14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
4686  0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
4687  0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
4688  0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
4689  12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
4690  14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
4691  0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
4692  6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
4693  8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
4694  14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
4695  0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
4696  0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
4697  10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
4698  14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
4699  0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
4700  2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
4701  10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
4702  12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
4703  0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
4704  0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
4705  8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
4706  10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
4707  0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
4708  4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
4709  6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
4710 
4711  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
4712  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
4713  return BitCast(d, pairs + Set(du, 0x0100));
4714 }
4715 
4716 template <typename T, size_t N>
4717 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
4718  const uint64_t mask_bits) {
4719  HWY_DASSERT(mask_bits < 16);
4720 
4721  // There are only 4 lanes, so we can afford to load the index vector directly.
4722  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
4723  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4724  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4725  4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4726  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
4727  8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4728  0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
4729  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
4730  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
4731  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4732  0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
4733  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
4734  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
4735  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
4736  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
4737  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
4738  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4739 
4740  const Simd<T, N> d;
4741  const Repartition<uint8_t, decltype(d)> d8;
4742  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
4743 }
4744 
4745 #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64
4746 
4747 template <typename T, size_t N>
4748 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
4749  const uint64_t mask_bits) {
4750  HWY_DASSERT(mask_bits < 4);
4751 
4752  // There are only 2 lanes, so we can afford to load the index vector directly.
4753  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
4754  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
4755  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
4756  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
4757  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4758 
4759  const Simd<T, N> d;
4760  const Repartition<uint8_t, decltype(d)> d8;
4761  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
4762 }
4763 
4764 #endif
4765 
4766 // Helper function called by both Compress and CompressStore - avoids a
4767 // redundant BitsFromMask in the latter.
4768 template <typename T, size_t N>
4769 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
4770  const auto idx =
4771  detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
4772  using D = Simd<T, N>;
4773  const RebindToSigned<D> di;
4774  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
4775 }
4776 
4777 } // namespace detail
4778 
4779 template <typename T, size_t N>
4780 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
4781  return detail::Compress(v, detail::BitsFromMask(mask));
4782 }
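// Usage sketch (hypothetical helper, not from the original file): Compress
// packs the lanes selected by the mask toward lane 0. With v = {1, 2, 3, 4}
// and only the even-valued lanes active, the low two lanes become {2, 4}; the
// contents of the remaining lanes are unspecified here.
inline Vec128<int32_t> CompressExample() {
  const Full128<int32_t> d;
  const Vec128<int32_t> v = Iota(d, 1);             // {1, 2, 3, 4}
  const auto is_even = (v & Set(d, 1)) == Zero(d);  // lanes 1 and 3
  return Compress(v, is_even);
}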
4783 
4784 // ------------------------------ CompressBits
4785 
4786 template <typename T, size_t N>
4787 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
4788  const uint8_t* HWY_RESTRICT bits) {
4789  uint64_t mask_bits = 0;
4790  constexpr size_t kNumBytes = (N + 7) / 8;
4791  CopyBytes<kNumBytes>(bits, &mask_bits);
4792  if (N < 8) {
4793  mask_bits &= (1ull << N) - 1;
4794  }
4795 
4796  return detail::Compress(v, mask_bits);
4797 }
4798 
4799 // ------------------------------ CompressStore
4800 
4801 template <typename T, size_t N>
4802 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
4803  Simd<T, N> d, T* HWY_RESTRICT unaligned) {
4804  const uint64_t mask_bits = detail::BitsFromMask(mask);
4805  StoreU(detail::Compress(v, mask_bits), d, unaligned);
4806  return PopCount(mask_bits);
4807 }
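// Usage sketch (hypothetical helper, not from the original file): CompressStore
// stores the compressed vector and returns how many lanes were selected. Note
// that StoreU writes all N lanes, so `out` must have room for a full vector
// even if fewer values are valid.
inline size_t StoreNonZero(Full128<int32_t> d, Vec128<int32_t> v,
                           int32_t* HWY_RESTRICT out) {
  const auto nonzero = v != Zero(d);
  return CompressStore(v, nonzero, d, out);  // number of valid values in out
}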
4808 
4809 // ------------------------------ CompressBitsStore
4810 
4811 template <typename T, size_t N>
4812 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4813  const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
4814  T* HWY_RESTRICT unaligned) {
4815  uint64_t mask_bits = 0;
4816  constexpr size_t kNumBytes = (N + 7) / 8;
4817  CopyBytes<kNumBytes>(bits, &mask_bits);
4818  if (N < 8) {
4819  mask_bits &= (1ull << N) - 1;
4820  }
4821 
4822  StoreU(detail::Compress(v, mask_bits), d, unaligned);
4823  return PopCount(mask_bits);
4824 }
4825 
4826 // ------------------------------ StoreInterleaved3
4827 
4828 // 128 bits
4829 HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
4830  const Vec128<uint8_t> v1,
4831  const Vec128<uint8_t> v2,
4832  Full128<uint8_t> /*tag*/,
4833  uint8_t* HWY_RESTRICT unaligned) {
4834  const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw};
4835  vst3q_u8(unaligned, triple);
4836 }
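// Usage sketch (hypothetical helper, not from the original file): interleaving
// three planar channels (e.g. R, G, B) into packed 3-byte pixels; 16 pixels
// (48 bytes) are written per call.
inline void InterleaveRGB16(const uint8_t* HWY_RESTRICT r,
                            const uint8_t* HWY_RESTRICT g,
                            const uint8_t* HWY_RESTRICT b,
                            uint8_t* HWY_RESTRICT rgb) {
  const Full128<uint8_t> d;
  StoreInterleaved3(LoadU(d, r), LoadU(d, g), LoadU(d, b), d, rgb);
}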
4837 
4838 // 64 bits
4839 HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
4840  const Vec128<uint8_t, 8> v1,
4841  const Vec128<uint8_t, 8> v2,
4842  Simd<uint8_t, 8> /*tag*/,
4843  uint8_t* HWY_RESTRICT unaligned) {
4844  const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
4845  vst3_u8(unaligned, triple);
4846 }
4847 
4848 // <= 32 bits: avoid writing more than N bytes by copying to buffer
4849 template <size_t N, HWY_IF_LE32(uint8_t, N)>
4850 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
4851  const Vec128<uint8_t, N> v1,
4852  const Vec128<uint8_t, N> v2,
4853  Simd<uint8_t, N> /*tag*/,
4854  uint8_t* HWY_RESTRICT unaligned) {
4855  alignas(16) uint8_t buf[24];
4856  const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
4857  vst3_u8(buf, triple);
4858  CopyBytes<N * 3>(buf, unaligned);
4859 }
4860 
4861 // ------------------------------ StoreInterleaved4
4862 
4863 // 128 bits
4864 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
4865  const Vec128<uint8_t> v1,
4866  const Vec128<uint8_t> v2,
4867  const Vec128<uint8_t> v3,
4868  Full128<uint8_t> /*tag*/,
4869  uint8_t* HWY_RESTRICT unaligned) {
4870  const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
4871  vst4q_u8(unaligned, quad);
4872 }
4873 
4874 // 64 bits
4875 HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> v0,
4876  const Vec128<uint8_t, 8> v1,
4877  const Vec128<uint8_t, 8> v2,
4878  const Vec128<uint8_t, 8> v3,
4879  Simd<uint8_t, 8> /*tag*/,
4880  uint8_t* HWY_RESTRICT unaligned) {
4881  const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
4882  vst4_u8(unaligned, quad);
4883 }
4884 
4885 // <= 32 bits: avoid writing more than N bytes by copying to buffer
4886 template <size_t N, HWY_IF_LE32(uint8_t, N)>
4887 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> v0,
4888  const Vec128<uint8_t, N> v1,
4889  const Vec128<uint8_t, N> v2,
4890  const Vec128<uint8_t, N> v3,
4891  Simd<uint8_t, N> /*tag*/,
4892  uint8_t* HWY_RESTRICT unaligned) {
4893  alignas(16) uint8_t buf[32];
4894  const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
4895  vst4_u8(buf, quad);
4896  CopyBytes<N * 4>(buf, unaligned);
4897 }
4898 
4899 // ================================================== DEPRECATED
4900 
4901 template <typename T, size_t N>
4902 HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
4903  return StoreMaskBits(Simd<T, N>(), mask, bits);
4904 }
4905 
4906 template <typename T, size_t N>
4907 HWY_API bool AllTrue(const Mask128<T, N> mask) {
4908  return AllTrue(Simd<T, N>(), mask);
4909 }
4910 
4911 template <typename T, size_t N>
4912 HWY_API bool AllFalse(const Mask128<T, N> mask) {
4913  return AllFalse(Simd<T, N>(), mask);
4914 }
4915 
4916 template <typename T, size_t N>
4917 HWY_API size_t CountTrue(const Mask128<T, N> mask) {
4918  return CountTrue(Simd<T, N>(), mask);
4919 }
4920 
4921 template <typename T, size_t N>
4922 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
4923  return SumOfLanes(Simd<T, N>(), v);
4924 }
4925 template <typename T, size_t N>
4926 HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
4927  return MinOfLanes(Simd<T, N>(), v);
4928 }
4929 template <typename T, size_t N>
4930 HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
4931  return MaxOfLanes(Simd<T, N>(), v);
4932 }
4933 
4934 template <typename T, size_t N>
4935 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Vec128<T, N> v) {
4936  return UpperHalf(Half<Simd<T, N>>(), v);
4937 }
4938 
4939 template <int kBytes, typename T, size_t N>
4940 HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
4941  return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
4942 }
4943 
4944 template <int kLanes, typename T, size_t N>
4945 HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
4946  return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
4947 }
4948 
4949 template <size_t kBytes, typename T, size_t N>
4950 HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
4951  return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
4952 }
4953 
4954 template <typename T, size_t N>
4955 HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
4956  return InterleaveUpper(Simd<T, N>(), a, b);
4957 }
4958 
4959 template <typename T, size_t N, class D = Simd<T, N>>
4960 HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
4961  return InterleaveUpper(RepartitionToWide<D>(), a, b);
4962 }
4963 
4964 template <typename T, size_t N2>
4965 HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
4966  return Combine(Simd<T, N2 * 2>(), hi2, lo2);
4967 }
4968 
4969 template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
4970 HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
4971  return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
4972 }
4973 
4974 template <typename T, size_t N>
4975 HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
4976  return ConcatLowerLower(Simd<T, N>(), hi, lo);
4977 }
4978 
4979 template <typename T, size_t N>
4980 HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
4981  return ConcatUpperUpper(Simd<T, N>(), hi, lo);
4982 }
4983 
4984 template <typename T, size_t N>
4985 HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
4986  const Vec128<T, N> lo) {
4987  return ConcatLowerUpper(Simd<T, N>(), hi, lo);
4988 }
4989 
4990 template <typename T, size_t N>
4991 HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
4992  return ConcatUpperLower(Simd<T, N>(), hi, lo);
4993 }
4994 
4995 // ================================================== Operator wrapper
4996 
4997 // These wrappers apply to all per-target *-inl.h headers because there are no restrictions on V.
4998 
4999 template <class V>
5000 HWY_API V Add(V a, V b) {
5001  return a + b;
5002 }
5003 template <class V>
5004 HWY_API V Sub(V a, V b) {
5005  return a - b;
5006 }
5007 
5008 template <class V>
5009 HWY_API V Mul(V a, V b) {
5010  return a * b;
5011 }
5012 template <class V>
5013 HWY_API V Div(V a, V b) {
5014  return a / b;
5015 }
5016 
5017 template <class V>
5018 V Shl(V a, V b) {
5019  return a << b;
5020 }
5021 template <class V>
5022 V Shr(V a, V b) {
5023  return a >> b;
5024 }
5025 
5026 template <class V>
5027 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
5028  return a == b;
5029 }
5030 template <class V>
5031 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
5032  return a != b;
5033 }
5034 template <class V>
5035 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
5036  return a < b;
5037 }
5038 
5039 template <class V>
5040 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
5041  return a > b;
5042 }
5043 template <class V>
5044 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
5045  return a >= b;
5046 }
5047 
5048 template <class V>
5049 HWY_API auto Le(V a, V b) -> decltype(a == b) {
5050  return a <= b;
5051 }
5052 
5053 namespace detail { // for code folding
5054 #if HWY_ARCH_ARM_V7
5055 #undef vuzp1_s8
5056 #undef vuzp1_u8
5057 #undef vuzp1_s16
5058 #undef vuzp1_u16
5059 #undef vuzp1_s32
5060 #undef vuzp1_u32
5061 #undef vuzp1_f32
5062 #undef vuzp1q_s8
5063 #undef vuzp1q_u8
5064 #undef vuzp1q_s16
5065 #undef vuzp1q_u16
5066 #undef vuzp1q_s32
5067 #undef vuzp1q_u32
5068 #undef vuzp1q_f32
5069 #undef vuzp2_s8
5070 #undef vuzp2_u8
5071 #undef vuzp2_s16
5072 #undef vuzp2_u16
5073 #undef vuzp2_s32
5074 #undef vuzp2_u32
5075 #undef vuzp2_f32
5076 #undef vuzp2q_s8
5077 #undef vuzp2q_u8
5078 #undef vuzp2q_s16
5079 #undef vuzp2q_u16
5080 #undef vuzp2q_s32
5081 #undef vuzp2q_u32
5082 #undef vuzp2q_f32
5083 #undef vzip1_s8
5084 #undef vzip1_u8
5085 #undef vzip1_s16
5086 #undef vzip1_u16
5087 #undef vzip1_s32
5088 #undef vzip1_u32
5089 #undef vzip1_f32
5090 #undef vzip1q_s8
5091 #undef vzip1q_u8
5092 #undef vzip1q_s16
5093 #undef vzip1q_u16
5094 #undef vzip1q_s32
5095 #undef vzip1q_u32
5096 #undef vzip1q_f32
5097 #undef vzip2_s8
5098 #undef vzip2_u8
5099 #undef vzip2_s16
5100 #undef vzip2_u16
5101 #undef vzip2_s32
5102 #undef vzip2_u32
5103 #undef vzip2_f32
5104 #undef vzip2q_s8
5105 #undef vzip2q_u8
5106 #undef vzip2q_s16
5107 #undef vzip2q_u16
5108 #undef vzip2q_s32
5109 #undef vzip2q_u32
5110 #undef vzip2q_f32
5111 #endif
5112 
5113 #undef HWY_NEON_BUILD_ARG_1
5114 #undef HWY_NEON_BUILD_ARG_2
5115 #undef HWY_NEON_BUILD_ARG_3
5116 #undef HWY_NEON_BUILD_PARAM_1
5117 #undef HWY_NEON_BUILD_PARAM_2
5118 #undef HWY_NEON_BUILD_PARAM_3
5119 #undef HWY_NEON_BUILD_RET_1
5120 #undef HWY_NEON_BUILD_RET_2
5121 #undef HWY_NEON_BUILD_RET_3
5122 #undef HWY_NEON_BUILD_TPL_1
5123 #undef HWY_NEON_BUILD_TPL_2
5124 #undef HWY_NEON_BUILD_TPL_3
5125 #undef HWY_NEON_DEF_FUNCTION
5126 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
5127 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
5128 #undef HWY_NEON_DEF_FUNCTION_INT_8
5129 #undef HWY_NEON_DEF_FUNCTION_INT_16
5130 #undef HWY_NEON_DEF_FUNCTION_INT_32
5131 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
5132 #undef HWY_NEON_DEF_FUNCTION_INTS
5133 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
5134 #undef HWY_NEON_DEF_FUNCTION_TPL
5135 #undef HWY_NEON_DEF_FUNCTION_UINT_8
5136 #undef HWY_NEON_DEF_FUNCTION_UINT_16
5137 #undef HWY_NEON_DEF_FUNCTION_UINT_32
5138 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
5139 #undef HWY_NEON_DEF_FUNCTION_UINTS
5140 #undef HWY_NEON_EVAL
5141 } // namespace detail
5142 
5143 // NOLINTNEXTLINE(google-readability-namespace-comments)
5144 } // namespace HWY_NAMESPACE
5145 } // namespace hwy
HWY_AFTER_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition: arm_neon-inl.h:149
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:174
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition: arm_neon-inl.h:184
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:136
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition: arm_neon-inl.h:131
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:89
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:179
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:119
HWY_BEFORE_NAMESPACE()
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:157
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:105
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition: arm_neon-inl.h:112
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition: arm_neon-inl.h:169
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition: arm_neon-inl.h:97
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition: arm_neon-inl.h:125
#define HWY_IF_FLOAT(T)
Definition: base.h:280
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:66
#define HWY_IF_LE64(T, N)
Definition: base.h:271
#define HWY_API
Definition: base.h:117
#define HWY_INLINE
Definition: base.h:59
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:67
#define HWY_DASSERT(condition)
Definition: base.h:163
Definition: arm_neon-inl.h:506
HWY_INLINE Mask128()
Definition: arm_neon-inl.h:511
Mask128(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition: arm_neon-inl.h:514
Raw raw
Definition: arm_neon-inl.h:516
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:508
Mask128 & operator=(const Mask128 &)=default
Definition: arm_neon-inl.h:468
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: arm_neon-inl.h:491
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: arm_neon-inl.h:494
HWY_INLINE Vec128()
Definition: arm_neon-inl.h:472
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: arm_neon-inl.h:482
HWY_INLINE Vec128(const Raw raw)
Definition: arm_neon-inl.h:475
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: arm_neon-inl.h:497
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: arm_neon-inl.h:479
Vec128(const Vec128 &)=default
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:469
Raw raw
Definition: arm_neon-inl.h:501
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: arm_neon-inl.h:485
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: arm_neon-inl.h:488
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:1463
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2811
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4233
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4288
HWY_INLINE Vec128< T, N > Set64(Simd< T, N >, uint64_t mask_bits)
Definition: arm_neon-inl.h:4223
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition: arm_neon-inl.h:1334
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1136
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4095
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition: arm_neon-inl.h:4566
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) template< size_t N > HWY_INLINE Vec128< uint8_t
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition: arm_neon-inl.h:1181
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4447
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition: arm_neon-inl.h:4769
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:578
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4100
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:4431
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:551
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3490
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition: arm_neon-inl.h:4580
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4091
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:803
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1573
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_API Vec128< uint32_t, N > TableLookupLanes(const Vec128< uint32_t, N > v, const Indices128< uint32_t, N > idx)
Definition: arm_neon-inl.h:3342
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3064
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1167
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1520
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5035
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3435
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2829
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5027
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1073
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3629
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2949
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3052
HWY_API size_t StoreMaskBits(Simd< T, N >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4528
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4207
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5040
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_API Vec128< T, N > LoadDup128(Simd< T, N > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2164
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4054
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: shared-inl.h:158
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2352
V Shl(V a, V b)
Definition: arm_neon-inl.h:5018
HWY_API Vec128< T, N > VecFromMask(const Mask128< T, N > v)
Definition: arm_neon-inl.h:1607
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5044
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1206
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3760
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3495
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N > d, const int32_t *idx)
Definition: arm_neon-inl.h:3323
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1600
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: shared-inl.h:149
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:904
constexpr HWY_API size_t Lanes(Simd< T, N >)
Definition: arm_sve-inl.h:226
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3947
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3726
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3389
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1795
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:422
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3070
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3802
Vec128< T, N > Iota(const Simd< T, N > d, const T2 first)
Definition: arm_neon-inl.h:734
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3483
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1438
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1443
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:3907
HWY_API Vec128< T > ConcatOdd(Full128< T > d, Vec128< T > hi, Vec128< T > lo)
Definition: arm_neon-inl.h:3795
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2903
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: shared-inl.h:147
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:778
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1619
HWY_API Vec128< T, N > Undefined(Simd< T, N >)
Definition: arm_neon-inl.h:724
HWY_API intptr_t FindFirstTrue(const Simd< T, N >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4520
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1448
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4509
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
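An illustrative sketch: IfThenElseZero plus a comparison clears the negative lanes of a vector; ZeroIfNegative (also listed in this index) is the dedicated shorthand for the same pattern. The function name ClampNegativesToZero is hypothetical:
// Illustrative only: returns v with every negative lane replaced by zero.
inline Vec128<float> ClampNegativesToZero(Vec128<float> v) {
  const Full128<float> d;
  return IfThenElseZero(v >= Zero(d), v);  // equivalent to ZeroIfNegative(v)
}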
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5000
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3395
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4538
HWY_API bool AllTrue(const Simd< T, N > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4557
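A small mask sketch tying FirstN, CountTrue and AllTrue together (the lane count of 4 follows from Full128<uint32_t>; the function name is illustrative):
inline void MaskQueries() {
  const Full128<uint32_t> d;
  const auto m = FirstN(d, 3);       // mask with the first 3 of 4 lanes set
  const size_t n = CountTrue(d, m);  // n == 3
  const bool all = AllTrue(d, m);    // false: the fourth lane is clear
  (void)n; (void)all;
}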
HWY_API void StoreInterleaved3(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4829
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_API VFromD< DW > ZipUpper(DW dw, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3538
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1581
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1655
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2170
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1735
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2846
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:794
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:475
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:325
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3506
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) HWY_API Vec128< int64_t, 1 > Neg(const Vec128< int64_t, 1 > v)
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1827
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3041
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1126
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:449
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec128< uint8_t, 8 > hi, Vec128< uint8_t, 8 > lo)
Definition: arm_neon-inl.h:3566
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1194
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
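A widening sketch with LoadU, LowerHalf and PromoteTo (the function name and the src8 parameter are illustrative): the lower eight uint8_t lanes are zero-extended to uint16_t:
// Illustrative only: widen the first 8 bytes of an unaligned buffer.
inline Vec128<uint16_t> WidenLowerBytes(const uint8_t* HWY_RESTRICT src8) {
  const Full128<uint8_t> d8;
  const Full128<uint16_t> d16;
  const auto bytes = LoadU(d8, src8);       // 16 x uint8_t
  return PromoteTo(d16, LowerHalf(bytes));  // 8 x uint16_t, zero-extended
}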
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4071
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2157
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
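An illustrative sketch of BitCast with RebindToUnsigned, viewing float lanes as their uint32_t bit patterns without changing any bits (the function name is hypothetical):
inline Vec128<uint32_t> FloatBits(Vec128<float> vf) {
  const Full128<float> df;
  const RebindToUnsigned<decltype(df)> du;  // Simd<uint32_t, 4>
  return BitCast(du, vf);                   // same 128 bits, reinterpreted per lane
}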
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4036
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5004
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3637
typename D::template Rebind< T > Rebind
Definition: shared-inl.h:144
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1799
HWY_API V InterleaveUpper(Simd< T, N >, V a, V b)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1084
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< uint32_t, 2 > Shuffle2301(const Vec128< uint32_t, 2 > v)
Definition: arm_neon-inl.h:1698
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:319
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5049
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1077
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:532
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2890
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1366
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4276
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
V Shr(V a, V b)
Definition: arm_neon-inl.h:5022
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:720
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4812
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:3844
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4211
HWY_API Vec128< int64_t, 1 > Neg(const Vec128< int64_t, 1 > v)
Definition: arm_neon-inl.h:866
HWY_API Vec128< uint8_t, 4 > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2699
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3235
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3752
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4787
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2818
HWY_API VFromD< DW > ZipLower(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3527
typename D::Half Half
Definition: shared-inl.h:164
typename D::template Repartition< T > Repartition
Definition: shared-inl.h:155
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
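A reduction sketch (values and function name illustrative): SumOfLanes and MinOfLanes return vectors in which every lane holds the horizontal result:
inline void Reductions() {
  const Full128<float> d;
  const auto v = Iota(d, 1.0f);       // {1, 2, 3, 4}
  const auto sum = SumOfLanes(d, v);  // every lane holds 10
  const auto mn = MinOfLanes(d, v);   // every lane holds 1
  (void)sum; (void)mn;
}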
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:2665
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4802
HWY_API Vec128< T > ConcatEven(Full128< T > d, Vec128< T > hi, Vec128< T > lo)
Definition: arm_neon-inl.h:3837
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3401
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:3957
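A byte-shuffle sketch for TableLookupBytes, reversing the sixteen bytes of a vector via an index table (the table kRev and the function name are illustrative):
inline Vec128<uint8_t> ReverseBytes(Vec128<uint8_t> data) {
  const Full128<uint8_t> d8;
  HWY_ALIGN static constexpr uint8_t kRev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                                 7,  6,  5,  4,  3,  2,  1, 0};
  const auto idx = Load(d8, kRev);     // per-lane byte indices into 'data'
  return TableLookupBytes(data, idx);  // result byte i = data[idx[i]]
}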
HWY_API void StoreInterleaved4(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, const Vec128< uint8_t > v3, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4864
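An interleaving sketch for StoreInterleaved4 (the plane pointers, rgba_out and the function name are hypothetical): four planar byte vectors are written as packed RGBA:
inline void InterleaveRGBA(const uint8_t* r_plane, const uint8_t* g_plane,
                           const uint8_t* b_plane, const uint8_t* a_plane,
                           uint8_t* HWY_RESTRICT rgba_out) {
  const Full128<uint8_t> d8;
  StoreInterleaved4(LoadU(d8, r_plane), LoadU(d8, g_plane), LoadU(d8, b_plane),
                    LoadU(d8, a_plane), d8, rgba_out);  // 64 bytes: r0 g0 b0 a0 r1 ...
}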
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5013
HWY_API Vec128< uint8_t, 8 > UpperHalf(Simd< uint8_t, 8 >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3096
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3407
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:984
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5009
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1590
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:535
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3385
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1320
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4780
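A sketch of Compress packing the selected lanes to the front (Set, the splat constructor defined earlier in this header, is assumed; the contents of the unselected upper lanes are not relied upon here):
inline void CompressExample() {
  const Full128<int32_t> d;
  const auto v = Iota(d, 0);           // {0, 1, 2, 3}
  const auto m = v > Set(d, 1);        // keep lanes with value > 1
  const auto packed = Compress(v, m);  // front lanes are {2, 3}
  (void)packed;
}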
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
typename D::T TFromD
Definition: shared-inl.h:140
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4012
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1296
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
Definition: aligned_allocator.h:23
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:634
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:565
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:613
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:535
constexpr float MantissaEnd< float >()
Definition: base.h:391
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:589
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:521
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:523
#define HWY_ALIGN
Definition: set_macros-inl.h:78
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
Definition: arm_neon-inl.h:3318
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3319
Definition: shared-inl.h:35
Definition: arm_neon-inl.h:522
Simd< T, N > operator()(Vec128< T, N >) const
Definition: arm_neon-inl.h:524
uint16x4_t type
Definition: arm_neon-inl.h:451
uint16x4_t type
Definition: arm_neon-inl.h:415
uint16x4_t type
Definition: arm_neon-inl.h:362
uint16x8_t type
Definition: arm_neon-inl.h:299
uint16x4_t type
Definition: arm_neon-inl.h:446
uint16x4_t type
Definition: arm_neon-inl.h:410
uint16x4_t type
Definition: arm_neon-inl.h:357
uint16x8_t type
Definition: arm_neon-inl.h:294
float32x2_t type
Definition: arm_neon-inl.h:420
float32x2_t type
Definition: arm_neon-inl.h:367
float32x4_t type
Definition: arm_neon-inl.h:304
int16x4_t type
Definition: arm_neon-inl.h:441
int16x4_t type
Definition: arm_neon-inl.h:400
int16x4_t type
Definition: arm_neon-inl.h:342
int16x8_t type
Definition: arm_neon-inl.h:279
int32x2_t type
Definition: arm_neon-inl.h:405
int32x2_t type
Definition: arm_neon-inl.h:347
int32x4_t type
Definition: arm_neon-inl.h:284
int64x1_t type
Definition: arm_neon-inl.h:352
int64x2_t type
Definition: arm_neon-inl.h:289
int8x16_t type
Definition: arm_neon-inl.h:274
int8x8_t type
Definition: arm_neon-inl.h:462
int8x8_t type
Definition: arm_neon-inl.h:436
int8x8_t type
Definition: arm_neon-inl.h:395
int8x8_t type
Definition: arm_neon-inl.h:337
uint16x4_t type
Definition: arm_neon-inl.h:431
uint16x4_t type
Definition: arm_neon-inl.h:385
uint16x4_t type
Definition: arm_neon-inl.h:322
uint16x8_t type
Definition: arm_neon-inl.h:259
uint32x2_t type
Definition: arm_neon-inl.h:390
uint32x2_t type
Definition: arm_neon-inl.h:327
uint32x4_t type
Definition: arm_neon-inl.h:264
uint64x1_t type
Definition: arm_neon-inl.h:332
uint64x2_t type
Definition: arm_neon-inl.h:269
uint8x16_t type
Definition: arm_neon-inl.h:254
uint8x8_t type
Definition: arm_neon-inl.h:457
uint8x8_t type
Definition: arm_neon-inl.h:426
uint8x8_t type
Definition: arm_neon-inl.h:380
uint8x8_t type
Definition: arm_neon-inl.h:317
Definition: x86_128-inl.h:51
__v128_u type
Definition: wasm_128-inl.h:58
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2996
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3003
Definition: arm_neon-inl.h:2974
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition: arm_neon-inl.h:2977
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2984
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3026
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition: arm_neon-inl.h:3033
Definition: arm_neon-inl.h:3009
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition: arm_neon-inl.h:3011
Definition: base.h:290
Definition: base.h:227
Definition: base.h:222