Grok  9.5.0
arm_sve-inl.h
Go to the documentation of this file.
1 // Copyright 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // ARM SVE[2] vectors (length not known at compile time).
16 // External include guard in highway.h - see comment there.
17 
18 #include <stddef.h>
19 #include <stdint.h>
20 
21 #if defined(HWY_EMULATE_SVE)
22 #include "third_party/farm_sve/farm_sve.h"
23 #else
24 #include <arm_sve.h>
25 #endif
26 
27 #include "hwy/base.h"
28 #include "hwy/ops/shared-inl.h"
29 
31 namespace hwy {
32 namespace HWY_NAMESPACE {
33 
// SVE only supports fractions, not LMUL > 1.
// kShift == 0 is a full vector; kShift < 0 selects a 2^kShift fraction of the
// full lane count. kShift > 0 (LMUL > 1) is unsupported and yields N = 0.
template <typename T, int kShift = 0>
using Full = Simd<T, (kShift <= 0) ? (HWY_LANES(T) >> (-kShift)) : 0>;
37 
// Deduces the descriptor/tag type for a vector V. The primary template is
// empty; each SVE vector type gets a specialization via HWY_SPECIALIZE below.
template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

// Lane (element) type of vector V.
template <class V>
using TFromV = TFromD<DFromV<V>>;
45 
46 #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
47 #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
48 #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
49 #define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
50 
51 // ================================================== MACROS
52 
53 // Generate specializations and function definitions using X macros. Although
54 // harder to read and debug, writing everything manually is too bulky.
55 
56 namespace detail { // for code folding
57 
58 // Unsigned:
59 #define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, NAME, OP)
60 #define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, NAME, OP)
61 #define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) X_MACRO(uint, u, 32, NAME, OP)
62 #define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) X_MACRO(uint, u, 64, NAME, OP)
63 
64 // Signed:
65 #define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, NAME, OP)
66 #define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, NAME, OP)
67 #define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, NAME, OP)
68 #define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, NAME, OP)
69 
70 // Float:
71 #define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) X_MACRO(float, f, 16, NAME, OP)
72 #define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) X_MACRO(float, f, 32, NAME, OP)
73 #define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) X_MACRO(float, f, 64, NAME, OP)
74 
75 // For all element sizes:
76 #define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
77  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
78  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
79  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
80  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
81 
82 #define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
83  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) \
84  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) \
85  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
86  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
87 
88 #define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
89  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
90  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
91  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
92 
93 // Commonly used type categories for a given element size:
94 #define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
95  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
96  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
97 
98 #define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
99  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
100  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)
101 
102 #define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
103  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
104  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)
105 
106 #define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
107  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
108  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
109 
110 #define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
111  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
112  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
113  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
114  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
115 
116 // Commonly used type categories:
117 #define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
118  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
119  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
120 
121 #define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
122  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
123  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
124 
125 #define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
126  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
127  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
128  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
129 
130 // Assemble types for use in x-macros
131 #define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
132 #define HWY_SVE_D(BASE, BITS, N) Simd<HWY_SVE_T(BASE, BITS), N>
133 #define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
134 
135 } // namespace detail
136 
137 #define HWY_SPECIALIZE(BASE, CHAR, BITS, NAME, OP) \
138  template <> \
139  struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
140  using type = HWY_SVE_D(BASE, BITS, HWY_LANES(HWY_SVE_T(BASE, BITS))); \
141  };
142 
144 #undef HWY_SPECIALIZE
145 
146 // vector = f(d), e.g. Undefined
147 #define HWY_SVE_RETV_ARGD(BASE, CHAR, BITS, NAME, OP) \
148  template <size_t N> \
149  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N) d) { \
150  return sv##OP##_##CHAR##BITS(); \
151  }
152 
153 // Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
154 // instructions, and we anyway only use it when the predicate is ptrue.
155 
156 // vector = f(vector), e.g. Not
157 #define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, NAME, OP) \
158  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
159  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
160  }
161 #define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, NAME, OP) \
162  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
163  return sv##OP##_##CHAR##BITS(v); \
164  }
165 
166 // vector = f(vector, scalar), e.g. detail::AddK
167 #define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, NAME, OP) \
168  HWY_API HWY_SVE_V(BASE, BITS) \
169  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
170  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
171  }
172 #define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, NAME, OP) \
173  HWY_API HWY_SVE_V(BASE, BITS) \
174  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
175  return sv##OP##_##CHAR##BITS(a, b); \
176  }
177 
178 // vector = f(vector, vector), e.g. Add
179 #define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, NAME, OP) \
180  HWY_API HWY_SVE_V(BASE, BITS) \
181  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
182  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
183  }
184 #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, NAME, OP) \
185  HWY_API HWY_SVE_V(BASE, BITS) \
186  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
187  return sv##OP##_##CHAR##BITS(a, b); \
188  }
189 
190 // ------------------------------ Lanes
191 
192 namespace detail {
193 
194 // Returns actual lanes of a hardware vector without rounding to a power of two.
196  return svcntb_pat(SV_ALL);
197 }
199  return svcnth_pat(SV_ALL);
200 }
202  return svcntw_pat(SV_ALL);
203 }
205  return svcntd_pat(SV_ALL);
206 }
207 
208 // Returns actual lanes of a hardware vector, rounded down to a power of two.
210  return svcntb_pat(SV_POW2);
211 }
213  return svcnth_pat(SV_POW2);
214 }
216  return svcntw_pat(SV_POW2);
217 }
219  return svcntd_pat(SV_POW2);
220 }
221 
222 } // namespace detail
223 
// Capped to <= 128-bit: SVE is at least that large, so no need to query actual.
// Returns the compile-time lane count N directly; constexpr so it can be used
// in constant expressions.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API constexpr size_t Lanes(Simd<T, N> /* tag */) {
  return N;
}
229 
// Returns actual number of lanes after dividing by div={1,2,4,8}.
// May return 0 if div > 16/sizeof(T): there is no "1/8th" of a u32x4, but it
// would be valid for u32x8 (i.e. hardware vectors >= 256 bits).
template <typename T, size_t N, HWY_IF_GT128(T, N)>
HWY_API size_t Lanes(Simd<T, N> /* tag */) {
  static_assert(N <= HWY_LANES(T), "N cannot exceed a full vector");

  // Runtime hardware lane count for this element size.
  const size_t actual = detail::HardwareLanes(hwy::SizeTag<sizeof(T)>());
  // Fraction of a full vector requested by the tag (compile-time constant).
  const size_t div = HWY_LANES(T) / N;
  static_assert(div <= 8, "Invalid N - must be <=128 bit, or >=1/8th");
  return actual / div;
}
242 
243 // ================================================== MASK INIT
244 
245 // One mask bit per byte; only the one belonging to the lowest byte is valid.
246 
247 // ------------------------------ FirstN
248 #define HWY_SVE_FIRSTN(BASE, CHAR, BITS, NAME, OP) \
249  template <size_t KN> \
250  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, KN) /* d */, uint32_t N) { \
251  return sv##OP##_b##BITS##_u32(uint32_t(0), N); \
252  }
254 #undef HWY_SVE_FIRSTN
255 
256 namespace detail {
257 
258 // All-true mask from a macro
259 #define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
260 
261 #define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, NAME, OP) \
262  template <size_t N> \
263  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N) d) { \
264  return HWY_SVE_PTRUE(BITS); \
265  }
266 
267 HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true
268 #undef HWY_SVE_WRAP_PTRUE
269 
270 HWY_API svbool_t PFalse() { return svpfalse_b(); }
271 
// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic on partial vectors) can ignore d and use PTrue instead.
template <typename T, size_t N>
svbool_t Mask(Simd<T, N> d) {
  // Full tag: all-true predicate. Fractional tag: only the first Lanes(d)
  // lanes are active, so loads/stores do not touch memory beyond them.
  return N == HWY_LANES(T) ? PTrue(d) : FirstN(d, Lanes(d));
}
280 
281 } // namespace detail
282 
283 // ================================================== INIT
284 
285 // ------------------------------ Set
286 // vector = f(d, scalar), e.g. Set
287 #define HWY_SVE_SET(BASE, CHAR, BITS, NAME, OP) \
288  template <size_t N> \
289  HWY_API HWY_SVE_V(BASE, BITS) \
290  NAME(HWY_SVE_D(BASE, BITS, N) d, HWY_SVE_T(BASE, BITS) arg) { \
291  return sv##OP##_##CHAR##BITS(arg); \
292  }
293 
295 #undef HWY_SVE_SET
296 
297 // Required for Zero and VFromD
298 template <size_t N>
299 svuint16_t Set(Simd<bfloat16_t, N> d, bfloat16_t arg) {
300  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
301 }
302 
303 template <class D>
304 using VFromD = decltype(Set(D(), TFromD<D>()));
305 
306 // ------------------------------ Zero
307 
308 template <class D>
310  return Set(d, 0);
311 }
312 
313 // ------------------------------ Undefined
314 
315 #if defined(HWY_EMULATE_SVE)
316 template <class D>
317 VFromD<D> Undefined(D d) {
318  return Zero(d);
319 }
320 #else
322 #endif
323 
324 // ------------------------------ BitCast
325 
326 namespace detail {
327 
328 // u8: no change
329 #define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, NAME, OP) \
330  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
331  return v; \
332  } \
333  template <size_t N> \
334  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
335  HWY_SVE_D(BASE, BITS, N) /* d */, HWY_SVE_V(BASE, BITS) v) { \
336  return v; \
337  }
338 
339 // All other types
340 #define HWY_SVE_CAST(BASE, CHAR, BITS, NAME, OP) \
341  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
342  return sv##OP##_u8_##CHAR##BITS(v); \
343  } \
344  template <size_t N> \
345  HWY_INLINE HWY_SVE_V(BASE, BITS) \
346  BitCastFromByte(HWY_SVE_D(BASE, BITS, N) /* d */, svuint8_t v) { \
347  return sv##OP##_##CHAR##BITS##_u8(v); \
348  }
349 
351 HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
352 HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
353 HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
354 HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
355 HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
356 
357 #undef HWY_SVE_CAST_NOP
358 #undef HWY_SVE_CAST
359 
360 template <size_t N>
362  svuint8_t v) {
363  return BitCastFromByte(Simd<uint16_t, N>(), v);
364 }
365 
366 } // namespace detail
367 
368 template <class D, class FromV>
369 HWY_API VFromD<D> BitCast(D d, FromV v) {
371 }
372 
373 // ================================================== LOGICAL
374 
375 // detail::*N() functions accept a scalar argument to avoid extra Set().
376 
377 // ------------------------------ Not
378 
380 
381 // ------------------------------ And
382 
383 namespace detail {
385 } // namespace detail
386 
388 
389 template <class V, HWY_IF_FLOAT_V(V)>
390 HWY_API V And(const V a, const V b) {
391  const DFromV<V> df;
392  const RebindToUnsigned<decltype(df)> du;
393  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
394 }
395 
396 // ------------------------------ Or
397 
399 
400 template <class V, HWY_IF_FLOAT_V(V)>
401 HWY_API V Or(const V a, const V b) {
402  const DFromV<V> df;
403  const RebindToUnsigned<decltype(df)> du;
404  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
405 }
406 
407 // ------------------------------ Xor
408 
409 namespace detail {
411 } // namespace detail
412 
414 
415 template <class V, HWY_IF_FLOAT_V(V)>
416 HWY_API V Xor(const V a, const V b) {
417  const DFromV<V> df;
418  const RebindToUnsigned<decltype(df)> du;
419  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
420 }
421 
422 // ------------------------------ AndNot
423 
424 namespace detail {
425 #define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, NAME, OP) \
426  HWY_API HWY_SVE_V(BASE, BITS) \
427  NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
428  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
429  }
430 
432 #undef HWY_SVE_RETV_ARGPVN_SWAP
433 } // namespace detail
434 
435 #define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, NAME, OP) \
436  HWY_API HWY_SVE_V(BASE, BITS) \
437  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
438  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
439  }
441 #undef HWY_SVE_RETV_ARGPVV_SWAP
442 
443 template <class V, HWY_IF_FLOAT_V(V)>
444 HWY_API V AndNot(const V a, const V b) {
445  const DFromV<V> df;
446  const RebindToUnsigned<decltype(df)> du;
447  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
448 }
449 
450 // ------------------------------ PopulationCount
451 
452 #ifdef HWY_NATIVE_POPCNT
453 #undef HWY_NATIVE_POPCNT
454 #else
455 #define HWY_NATIVE_POPCNT
456 #endif
457 
458 // Need to return original type instead of unsigned.
459 #define HWY_SVE_POPCNT(BASE, CHAR, BITS, NAME, OP) \
460  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
461  return BitCast(DFromV<decltype(v)>(), \
462  sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
463  }
465 #undef HWY_SVE_POPCNT
466 
467 // ================================================== SIGN
468 
469 // ------------------------------ Neg
471 
472 // ------------------------------ Abs
474 
475 // ------------------------------ CopySign[ToAbs]
476 
477 template <class V>
478 HWY_API V CopySign(const V magn, const V sign) {
479  const auto msb = SignBit(DFromV<V>());
480  return Or(AndNot(msb, magn), And(msb, sign));
481 }
482 
483 template <class V>
484 HWY_API V CopySignToAbs(const V abs, const V sign) {
485  const auto msb = SignBit(DFromV<V>());
486  return Or(abs, And(msb, sign));
487 }
488 
489 // ================================================== ARITHMETIC
490 
491 // ------------------------------ Add
492 
493 namespace detail {
495 } // namespace detail
496 
498 
499 // ------------------------------ Sub
500 
501 namespace detail {
502 // Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
503 #define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, NAME, OP) \
504  HWY_API HWY_SVE_V(BASE, BITS) \
505  NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
506  return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
507  }
508 
510 #undef HWY_SVE_RETV_ARGPVN_MASK
511 } // namespace detail
512 
514 
515 // ------------------------------ SaturatedAdd
516 
519 
520 // ------------------------------ SaturatedSub
521 
524 
525 // ------------------------------ AbsDiff
527 
528 // ------------------------------ ShiftLeft[Same]
529 
530 #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, NAME, OP) \
531  template <int kBits> \
532  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
533  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
534  } \
535  HWY_API HWY_SVE_V(BASE, BITS) \
536  NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
537  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
538  }
539 
541 
542 // ------------------------------ ShiftRight[Same]
543 
546 
547 #undef HWY_SVE_SHIFT_N
548 
549 // ------------------------------ Shl/r
550 
551 #define HWY_SVE_SHIFT(BASE, CHAR, BITS, NAME, OP) \
552  HWY_API HWY_SVE_V(BASE, BITS) \
553  NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
554  using TU = HWY_SVE_T(uint, BITS); \
555  return sv##OP##_##CHAR##BITS##_x( \
556  HWY_SVE_PTRUE(BITS), v, BitCast(Simd<TU, HWY_LANES(TU)>(), bits)); \
557  }
558 
560 
563 
564 #undef HWY_SVE_SHIFT
565 
566 // ------------------------------ Min/Max
567 
572 
573 namespace detail {
576 } // namespace detail
577 
578 // ------------------------------ Mul
581 
582 // ------------------------------ MulHigh
584 namespace detail {
587 } // namespace detail
588 
589 // ------------------------------ Div
591 
592 // ------------------------------ ApproximateReciprocal
594 
595 // ------------------------------ Sqrt
597 
598 // ------------------------------ ApproximateReciprocalSqrt
600 
601 // ------------------------------ MulAdd
602 #define HWY_SVE_FMA(BASE, CHAR, BITS, NAME, OP) \
603  HWY_API HWY_SVE_V(BASE, BITS) \
604  NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
605  HWY_SVE_V(BASE, BITS) add) { \
606  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
607  }
608 
610 
611 // ------------------------------ NegMulAdd
613 
614 // ------------------------------ MulSub
616 
617 // ------------------------------ NegMulSub
619 
620 #undef HWY_SVE_FMA
621 
622 // ------------------------------ Round etc.
623 
628 
629 // ================================================== MASK
630 
631 // ------------------------------ RebindMask
632 template <class D, typename MFrom>
633 HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
634  return mask;
635 }
636 
637 // ------------------------------ Mask logical
638 
// Logical NOT of a mask.
HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Per ARM, such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
// Mask AND: active where both a and b are active.
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
// Mask AndNot: (~a) & b, matching the vector AndNot argument order.
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
// Mask OR, expressed as a select.
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
// Mask XOR, expressed as a select on NAND.
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}
657 
658 // ------------------------------ CountTrue
659 
660 #define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, NAME, OP) \
661  template <size_t N> \
662  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N) d, svbool_t m) { \
663  return sv##OP##_b##BITS(detail::Mask(d), m); \
664  }
665 
667 #undef HWY_SVE_COUNT_TRUE
668 
669 // For 16-bit Compress: full vector, not limited to SV_POW2.
670 namespace detail {
671 
672 #define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, NAME, OP) \
673  template <size_t N> \
674  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N) d, svbool_t m) { \
675  return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
676  }
677 
678 HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
679 #undef HWY_SVE_COUNT_TRUE_FULL
680 
681 } // namespace detail
682 
683 // ------------------------------ AllFalse
// Returns true iff no lane of m is active within the lanes selected by d
// (svptest_any reports whether any active predicate element is set).
template <typename T, size_t N>
HWY_API bool AllFalse(Simd<T, N> d, svbool_t m) {
  return !svptest_any(detail::Mask(d), m);
}
688 
689 // ------------------------------ AllTrue
// Returns true iff every lane of m (within d's lanes) is active, by comparing
// the popcount of the mask against the lane count.
template <typename T, size_t N>
HWY_API bool AllTrue(Simd<T, N> d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}
694 
695 // ------------------------------ FindFirstTrue
// Returns the index of the first active lane of m, or -1 if none.
// svbrkb keeps lanes *before* the first active one, so counting those yields
// the index of the first true lane.
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(Simd<T, N> d, svbool_t m) {
  return AllFalse(d, m) ? -1 : CountTrue(d, svbrkb_b_z(detail::Mask(d), m));
}
700 
701 // ------------------------------ IfThenElse
702 #define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, NAME, OP) \
703  HWY_API HWY_SVE_V(BASE, BITS) \
704  NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
705  return sv##OP##_##CHAR##BITS(m, yes, no); \
706  }
707 
709 #undef HWY_SVE_IF_THEN_ELSE
710 
711 // ------------------------------ IfThenElseZero
712 template <class M, class V>
713 HWY_API V IfThenElseZero(const M mask, const V yes) {
714  return IfThenElse(mask, yes, Zero(DFromV<V>()));
715 }
716 
717 // ------------------------------ IfThenZeroElse
718 template <class M, class V>
719 HWY_API V IfThenZeroElse(const M mask, const V no) {
720  return IfThenElse(mask, Zero(DFromV<V>()), no);
721 }
722 
723 // ================================================== COMPARE
724 
725 // mask = f(vector, vector)
726 #define HWY_SVE_COMPARE(BASE, CHAR, BITS, NAME, OP) \
727  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
728  return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
729  }
730 #define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, NAME, OP) \
731  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
732  return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
733  }
734 
735 // ------------------------------ Eq
737 
738 // ------------------------------ Ne
740 
741 // ------------------------------ Lt
743 namespace detail {
745 } // namespace detail
746 
747 // ------------------------------ Le
749 
750 #undef HWY_SVE_COMPARE
751 #undef HWY_SVE_COMPARE_N
752 
753 // ------------------------------ Gt/Ge (swapped order)
754 
// Gt/Ge are defined via Lt/Le with swapped arguments: a > b <=> b < a.
template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}
763 
764 // ------------------------------ TestBit
// Returns a mask that is active where (a & bit) != 0. 'bit' is typically a
// single-bit-per-lane pattern.
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return Ne(And(a, bit), Zero(DFromV<V>()));
}
769 
770 // ------------------------------ MaskFromVec (Ne)
// Converts a vector to a mask: active where the lane is non-zero.
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  return Ne(v, Zero(DFromV<V>()));
}
775 
776 // ------------------------------ VecFromMask
777 
// Converts a mask to a vector: all-ones bits in active lanes, zero elsewhere.
template <class D, HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const auto v0 = Zero(RebindToSigned<decltype(d)>());
  // 0 - 1 = all-ones in active lanes; SubN uses zeroing predication (_z), so
  // inactive lanes remain zero.
  return BitCast(d, detail::SubN(mask, v0, 1));
}

// Float overload: compute on the unsigned representation, then cast back.
template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}
788 
789 // ================================================== MEMORY
790 
791 // ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream
792 
793 #define HWY_SVE_LOAD(BASE, CHAR, BITS, NAME, OP) \
794  template <size_t N> \
795  HWY_API HWY_SVE_V(BASE, BITS) \
796  NAME(HWY_SVE_D(BASE, BITS, N) d, \
797  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
798  return sv##OP##_##CHAR##BITS(detail::Mask(d), p); \
799  }
800 
801 #define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, NAME, OP) \
802  template <size_t N> \
803  HWY_API HWY_SVE_V(BASE, BITS) \
804  NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N) d, \
805  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
806  return sv##OP##_##CHAR##BITS(m, p); \
807  }
808 
809 #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, NAME, OP) \
810  template <size_t N> \
811  HWY_API HWY_SVE_V(BASE, BITS) \
812  NAME(HWY_SVE_D(BASE, BITS, N) d, \
813  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
814  /* All-true predicate to load all 128 bits. */ \
815  return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
816  }
817 
818 #define HWY_SVE_STORE(BASE, CHAR, BITS, NAME, OP) \
819  template <size_t N> \
820  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N) d, \
821  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
822  sv##OP##_##CHAR##BITS(detail::Mask(d), p, v); \
823  }
824 
830 
831 #undef HWY_SVE_LOAD
832 #undef HWY_SVE_MASKED_LOAD
833 #undef HWY_SVE_LOAD_DUP128
834 #undef HWY_SVE_STORE
835 
836 // BF16 is the same as svuint16_t because BF16 is optional before v8.6.
837 template <size_t N>
839  const bfloat16_t* HWY_RESTRICT p) {
840  return Load(RebindToUnsigned<decltype(d)>(),
841  reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
842 }
843 
844 template <size_t N>
845 HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N> d,
847  Store(v, RebindToUnsigned<decltype(d)>(),
848  reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
849 }
850 
851 // ------------------------------ Load/StoreU
852 
853 // SVE only requires lane alignment, not natural alignment of the entire
854 // vector.
855 template <class D>
857  return Load(d, p);
858 }
859 
// Unaligned store: SVE only requires lane alignment, not natural alignment of
// the entire vector, so this simply forwards to Store.
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}
864 
865 // ------------------------------ ScatterOffset/Index
866 
867 #define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, NAME, OP) \
868  template <size_t N> \
869  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N) d, \
870  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
871  HWY_SVE_V(int, BITS) offset) { \
872  sv##OP##_s##BITS##offset_##CHAR##BITS(detail::Mask(d), base, offset, v); \
873  }
874 
875 #define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, NAME, OP) \
876  template <size_t N> \
877  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N) d, \
878  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
879  HWY_SVE_V(int, BITS) index) { \
880  sv##OP##_s##BITS##index_##CHAR##BITS(detail::Mask(d), base, index, v); \
881  }
882 
885 #undef HWY_SVE_SCATTER_OFFSET
886 #undef HWY_SVE_SCATTER_INDEX
887 
888 // ------------------------------ GatherOffset/Index
889 
890 #define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, NAME, OP) \
891  template <size_t N> \
892  HWY_API HWY_SVE_V(BASE, BITS) \
893  NAME(HWY_SVE_D(BASE, BITS, N) d, \
894  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
895  HWY_SVE_V(int, BITS) offset) { \
896  return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::Mask(d), base, \
897  offset); \
898  }
899 #define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, NAME, OP) \
900  template <size_t N> \
901  HWY_API HWY_SVE_V(BASE, BITS) \
902  NAME(HWY_SVE_D(BASE, BITS, N) d, \
903  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
904  HWY_SVE_V(int, BITS) index) { \
905  return sv##OP##_s##BITS##index_##CHAR##BITS(detail::Mask(d), base, index); \
906  }
907 
910 #undef HWY_SVE_GATHER_OFFSET
911 #undef HWY_SVE_GATHER_INDEX
912 
913 // ------------------------------ StoreInterleaved3
914 
915 #define HWY_SVE_STORE3(BASE, CHAR, BITS, NAME, OP) \
916  template <size_t N> \
917  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
918  HWY_SVE_V(BASE, BITS) v2, HWY_SVE_D(BASE, BITS, N) d, \
919  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
920  const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
921  sv##OP##_##CHAR##BITS(detail::Mask(d), unaligned, triple); \
922  }
924 
925 #undef HWY_SVE_STORE3
926 
927 // ------------------------------ StoreInterleaved4
928 
929 #define HWY_SVE_STORE4(BASE, CHAR, BITS, NAME, OP) \
930  template <size_t N> \
931  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
932  HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
933  HWY_SVE_D(BASE, BITS, N) d, \
934  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
935  const sv##BASE##BITS##x4_t quad = \
936  svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \
937  sv##OP##_##CHAR##BITS(detail::Mask(d), unaligned, quad); \
938  }
940 
941 #undef HWY_SVE_STORE4
942 
943 // ================================================== CONVERT
944 
945 // ------------------------------ PromoteTo
946 
947 // Same sign
948 #define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, NAME, OP) \
949  template <size_t N> \
950  HWY_API HWY_SVE_V(BASE, BITS) \
951  NAME(HWY_SVE_D(BASE, BITS, N) /* tag */, \
952  VFromD<Simd<MakeNarrow<HWY_SVE_T(BASE, BITS)>, \
953  HWY_LANES(HWY_SVE_T(BASE, BITS)) * 2>> \
954  v) { \
955  return sv##OP##_##CHAR##BITS(v); \
956  }
957 
961 
962 // 2x
// 2x promotion (8-bit -> 32-bit): widen to 16-bit first, then to 32-bit.
template <size_t N>
HWY_API svuint32_t PromoteTo(Simd<uint32_t, N> dto, svuint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N>
HWY_API svint32_t PromoteTo(Simd<int32_t, N> dto, svint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
// Convenience wrapper for u8 -> u32 promotion; N must be given explicitly
// because it cannot be deduced from the argument.
template <size_t N>
HWY_API svuint32_t U32FromU8(svuint8_t v) {
  return PromoteTo(Simd<uint32_t, N>(), v);
}
977 
978 // Sign change
// Sign-changing promotions (unsigned source, signed destination): perform the
// widening as unsigned (zero-extension), then BitCast to the signed type.
template <size_t N>
HWY_API svint16_t PromoteTo(Simd<int16_t, N> dto, svuint8_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N>
HWY_API svint32_t PromoteTo(Simd<int32_t, N> dto, svuint16_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
// u8 -> i32: zero-extend to u16, reinterpret as i16, then widen to i32.
template <size_t N>
HWY_API svint32_t PromoteTo(Simd<int32_t, N> dto, svuint8_t vfrom) {
  const Repartition<uint16_t, DFromV<decltype(vfrom)>> du16;
  const Repartition<int16_t, decltype(du16)> di16;
  return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom)));
}
995 
996 // ------------------------------ PromoteTo F
997 
// Float promotions use the svcvt conversion intrinsics with an all-true
// predicate for the *source* type (the narrower/denser one).
template <size_t N>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N> /* d */, const svfloat16_t v) {
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N>()), v);
}

template <size_t N>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N> /* d */, const svfloat32_t v) {
  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N>()), v);
}

// i32 -> f64 conversion (also a widening, hence grouped here).
template <size_t N>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N> /* d */, const svint32_t v) {
  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N>()), v);
}
1012 
1013 // For 16-bit Compress
1014 namespace detail {
1016 #undef HWY_SVE_PROMOTE_TO
1017 
1018 template <size_t N>
1019 HWY_API svfloat32_t PromoteUpperTo(Simd<float, N> df, const svfloat16_t v) {
1020  const RebindToUnsigned<decltype(df)> du;
1021  const RepartitionToNarrow<decltype(du)> dn;
1022  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
1023 }
1024 
1025 } // namespace detail
1026 
1027 // ------------------------------ DemoteTo U
1028 
1029 namespace detail {
1030 
// Saturates unsigned vectors to half/quarter-width TN.
template <typename TN, class VU>
VU SaturateU(VU v) {
  // Clamp to TN's maximum; the cast keeps MinN's scalar argument in the
  // (wider) lane type of v.
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
}
1036 
1037 // Saturates unsigned vectors to half/quarter-width TN.
1038 template <typename TN, class VI>
1039 VI SaturateI(VI v) {
1040  const DFromV<VI> di;
1041  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
1042 }
1043 
1044 } // namespace detail
1045 
// i16 -> u8 demotion with unsigned saturation (negative -> 0, >255 -> 255).
// The narrowed result occupies the lower half; the upper half repeats it.
template <size_t N>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N> dn, const svint16_t v) {
 const DFromV<decltype(v)> di;
 const RebindToUnsigned<decltype(di)> du;
 using TN = TFromD<decltype(dn)>;
 // First clamp negative numbers to zero and cast to unsigned.
 const svuint16_t clamped = BitCast(du, Max(Zero(di), v));
 // Saturate to unsigned-max and halve the width.
 const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
 return svuzp1_u8(vn, vn);
}

// i32 -> u16 demotion with unsigned saturation.
template <size_t N>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N> dn, const svint32_t v) {
 const DFromV<decltype(v)> di;
 const RebindToUnsigned<decltype(di)> du;
 using TN = TFromD<decltype(dn)>;
 // First clamp negative numbers to zero and cast to unsigned.
 const svuint32_t clamped = BitCast(du, Max(Zero(di), v));
 // Saturate to unsigned-max and halve the width.
 const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
 return svuzp1_u16(vn, vn);
}

// i32 -> u8 demotion with unsigned saturation: saturate once to [0, 255],
// then pack twice (32 -> 16 -> 8) by taking even-indexed lanes.
template <size_t N>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N> dn, const svint32_t v) {
 const DFromV<decltype(v)> di;
 const RebindToUnsigned<decltype(di)> du;
 const RepartitionToNarrow<decltype(du)> d2;
 using TN = TFromD<decltype(dn)>;
 // First clamp negative numbers to zero and cast to unsigned.
 const svuint32_t clamped = BitCast(du, Max(Zero(di), v));
 // Saturate to unsigned-max and quarter the width.
 const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
 const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
 return svuzp1_u8(x2, x2);
}
1083 
// Truncates (does NOT saturate) each u32 lane to u8 by twice taking the
// even-indexed narrower lanes (uzp1), i.e. the low bytes on little-endian.
HWY_API svuint8_t U8FromU32(const svuint32_t v) {
 const DFromV<svuint32_t> du32;
 const RepartitionToNarrow<decltype(du32)> du16;
 const RepartitionToNarrow<decltype(du16)> du8;

 const svuint16_t cast16 = BitCast(du16, v);
 const svuint16_t x2 = svuzp1_u16(cast16, cast16);
 const svuint8_t cast8 = BitCast(du8, x2);
 return svuzp1_u8(cast8, cast8);
}
1094 
1095 // ------------------------------ DemoteTo I
1096 
1097 template <size_t N>
1098 HWY_API svint8_t DemoteTo(Simd<int8_t, N> dn, const svint16_t v) {
1099  const DFromV<decltype(v)> di;
1100  using TN = TFromD<decltype(dn)>;
1101 #if HWY_TARGET == HWY_SVE2
1102  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
1103 #else
1104  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
1105 #endif
1106  return svuzp1_s8(vn, vn);
1107 }
1108 
1109 template <size_t N>
1110 HWY_API svint16_t DemoteTo(Simd<int16_t, N> dn, const svint32_t v) {
1111  const DFromV<decltype(v)> di;
1112  using TN = TFromD<decltype(dn)>;
1113 #if HWY_TARGET == HWY_SVE2
1114  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
1115 #else
1116  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
1117 #endif
1118  return svuzp1_s16(vn, vn);
1119 }
1120 
1121 template <size_t N>
1122 HWY_API svint8_t DemoteTo(Simd<int8_t, N> dn, const svint32_t v) {
1123  const DFromV<decltype(v)> di;
1124  using TN = TFromD<decltype(dn)>;
1125  const RepartitionToWide<decltype(dn)> d2;
1126 #if HWY_TARGET == HWY_SVE2
1127  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
1128 #else
1129  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
1130 #endif
1131  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
1132  return BitCast(dn, svuzp1_s8(v2, v2));
1133 }
1134 
1135 // ------------------------------ ConcatEven/ConcatOdd
1136 
1137 // WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
1138 // full vector length, not rounded down to a power of two as we require).
1139 namespace detail {
1140 
// Generates functions returning every second lane of the (lo, hi) pair;
// OP at the (not shown) instantiation selects even or odd lanes —
// presumably uzp1/uzp2, producing detail::ConcatEven/ConcatOdd.
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, NAME, OP) \
 HWY_INLINE HWY_SVE_V(BASE, BITS) \
 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
 return sv##OP##_##CHAR##BITS(lo, hi); \
 }
1148 #undef HWY_SVE_CONCAT_EVERY_SECOND
1149 
1150 // Used to slide up / shift whole register left; mask indicates which range
1151 // to take from lo, and the rest is filled from hi starting at its lowest.
// Generates detail::Splice(hi, lo, mask): the active (mask) range of lo is
// followed by lanes taken from the start of hi (svsplice semantics).
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, NAME, OP) \
 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
 HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
 return sv##OP##_##CHAR##BITS(mask, lo, hi); \
 }
HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
#undef HWY_SVE_SPLICE
1159 
1160 } // namespace detail
1161 
1162 template <class D>
1164 #if 0 // if we could assume VL is a power of two
1165  return detail::ConcatOdd(hi, lo);
1166 #else
1167  const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
1168  const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
1169  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
1170 #endif
1171 }
1172 
1173 template <class D>
1175 #if 0 // if we could assume VL is a power of two
1176  return detail::ConcatEven(hi, lo);
1177 #else
1178  const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
1179  const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
1180  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
1181 #endif
1182 }
1183 
1184 // ------------------------------ DemoteTo F
1185 
// f32 -> f16 narrowing conversion.
template <size_t N>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N> d, const svfloat32_t v) {
 return svcvt_f16_f32_x(detail::PTrue(d), v);
}

// f32 -> bf16 by truncation: a bf16 is the upper 16 bits of an f32 (no
// rounding), gathered via ConcatOdd on little-endian lane halves.
template <size_t N>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N> d, const svfloat32_t v) {
 const svuint16_t halves = BitCast(Full<uint16_t>(), v);
 return detail::ConcatOdd(halves, halves); // can ignore upper half of vec
}

// f64 -> f32 narrowing conversion.
template <size_t N>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N> d, const svfloat64_t v) {
 return svcvt_f32_f64_x(detail::PTrue(d), v);
}

// f64 -> i32 conversion. NOTE(review): svcvt float->int presumably rounds
// toward zero — confirm this matches the rounding callers expect.
template <size_t N>
HWY_API svint32_t DemoteTo(Simd<int32_t, N> d, const svfloat64_t v) {
 return svcvt_s32_f64_x(detail::PTrue(d), v);
}
1206 
1207 // ------------------------------ ConvertTo F
1208 
// Generates two ConvertTo overloads per type: int -> float and (truncating)
// float -> int. The predicate is all-true; _x leaves inactive lanes
// undefined, which is irrelevant here.
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_SVE_V(BASE, BITS) \
 NAME(HWY_SVE_D(BASE, BITS, N) /* d */, HWY_SVE_V(int, BITS) v) { \
 return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
 } \
 /* Truncates (rounds toward zero). */ \
 template <size_t N> \
 HWY_API HWY_SVE_V(int, BITS) \
 NAME(HWY_SVE_D(int, BITS, N) /* d */, HWY_SVE_V(BASE, BITS) v) { \
 return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
 }
1221 
1222 // API only requires f32 but we provide f64 for use by Iota.
1224 #undef HWY_SVE_CONVERT
1225 
1226 // ------------------------------ NearestInt (Round, ConvertTo)
1227 
1228 template <class VF, class DI = RebindToSigned<DFromV<VF>>>
1230  // No single instruction, round then truncate.
1231  return ConvertTo(DI(), Round(v));
1232 }
1233 
1234 // ------------------------------ Iota (Add, ConvertTo)
1235 
// Generates Iota(d, first) = first, first+1, ... via the hardware index
// instruction with step 1; the tag d only selects the overload.
#define HWY_SVE_IOTA(BASE, CHAR, BITS, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_SVE_V(BASE, BITS) \
 NAME(HWY_SVE_D(BASE, BITS, N) d, HWY_SVE_T(BASE, BITS) first) { \
 return sv##OP##_##CHAR##BITS(first, 1); \
 }
1242 
1244 #undef HWY_SVE_IOTA
1245 
1246 template <class D, HWY_IF_FLOAT_D(D)>
1247 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
1248  const RebindToSigned<D> di;
1249  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
1250 }
1251 
1252 // ================================================== COMBINE
1253 
1254 namespace detail {
1255 
1256 template <typename T, size_t N>
1258  return FirstN(d, Lanes(d) / 2);
1259 }
1260 template <typename T, size_t N>
1262  // For Splice to work as intended, make sure bits above Lanes(d) are zero.
1263  return AndNot(MaskLowerHalf(d), detail::Mask(d));
1264 }
1265 
1266 // Right-shift vector pair by constexpr; can be used to slide down (=N) or up
1267 // (=Lanes()-N).
// Generates detail::Ext<kIndex>(hi, lo): treats (hi:lo) as a double-length
// vector and extracts lanes starting at kIndex — i.e. shifts lo right by
// kIndex lanes, shifting in lanes from the start of hi.
#define HWY_SVE_EXT(BASE, CHAR, BITS, NAME, OP) \
 template <size_t kIndex> \
 HWY_API HWY_SVE_V(BASE, BITS) \
 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
 return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
 }
HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
#undef HWY_SVE_EXT
1276 
1277 } // namespace detail
1278 
1279 // ------------------------------ ConcatUpperLower
1280 template <class D, class V>
1281 HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
1282  return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
1283 }
1284 
1285 // ------------------------------ ConcatLowerLower
// Result = [lower half of lo, lower half of hi]: the mask selects lo's
// leading lanes, Splice fills the rest from the start of hi.
template <class D, class V>
HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
 return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
}

// ------------------------------ ConcatLowerUpper
// Result = [upper half of lo, lower half of hi] (lo's active upper range is
// compacted to the front, then hi's leading lanes follow).
template <class D, class V>
HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
 return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
}

// ------------------------------ ConcatUpperUpper
// Result = [upper half of lo, upper half of hi]: first slide lo's upper
// half down, then overwrite upper-half positions with hi's lanes.
template <class D, class V>
HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
 const svbool_t mask_upper = detail::MaskUpperHalf(d);
 const V lo_upper = detail::Splice(lo, lo, mask_upper);
 return IfThenElse(mask_upper, hi, lo_upper);
}
1304 
1305 // ------------------------------ Combine
// Concatenates the lower halves of hi and lo into one full vector.
template <class D, class V2>
HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
 return ConcatLowerLower(d, hi, lo);
}

// ------------------------------ ZeroExtendVector

// Returns lo's lower half with the upper half zeroed.
template <class D, class V>
HWY_API V ZeroExtendVector(const D d, const V lo) {
 return Combine(d, Zero(Half<D>()), lo);
}
1317 
1318 // ------------------------------ Lower/UpperHalf
1319 
// LowerHalf is a no-op: SVE vectors have no compile-time length, so the
// lower half is the same register; the tag tells callers how many lanes
// are meaningful.
template <class D2, class V>
HWY_API V LowerHalf(D2 /* tag */, const V v) {
 return v;
}

template <class V>
HWY_API V LowerHalf(const V v) {
 return v;
}

// Slides the upper half (of the twice-size vector) down to the lowest lanes.
template <class D2, class V>
HWY_API V UpperHalf(const D2 d2, const V v) {
 return detail::Splice(v, v, detail::MaskUpperHalf(Twice<D2>()));
}
1334 
1335 // ================================================== SWIZZLE
1336 
1337 // ------------------------------ GetLane
1338 
// Generates GetLane(v): extracts lane 0 as a scalar. With an all-false
// predicate, the OP (instantiated below, not shown — presumably svlasta)
// wraps around and yields the first element.
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, NAME, OP) \
 HWY_API HWY_SVE_T(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
 return sv##OP##_##CHAR##BITS(detail::PFalse(), v); \
 }
1343 
1345 #undef HWY_SVE_GET_LANE
1346 
1347 // ------------------------------ OddEven
1348 
namespace detail {
// Insert(v, t): shifts v up one lane, inserting scalar t at lane 0 (insr).
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVN, Insert, insr_n)
// trn1/trn2 interleave the even/odd-indexed lanes of their two inputs.
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
} // namespace detail

// Returns a vector whose odd-indexed lanes come from `odd` and even-indexed
// lanes from `even`. Shifting `even` up by one puts its even-indexed lanes
// at odd positions, so trn2 picks even[2i] for result[2i] and odd[2i+1] for
// result[2i+1].
template <class V>
HWY_API V OddEven(const V odd, const V even) {
 const auto even_in_odd = detail::Insert(even, 0);
 return detail::InterleaveOdd(even_in_odd, odd);
}
1360 
1361 // ------------------------------ TableLookupLanes
1362 
1363 template <class D, class DI = RebindToSigned<D>>
1365 #if HWY_IS_DEBUG_BUILD
1366  const size_t N = Lanes(d);
1367  for (size_t i = 0; i < N; ++i) {
1368  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<TFromD<DI>>(N));
1369  }
1370 #else
1371  (void)d;
1372 #endif
1373  return Load(DI(), idx);
1374 }
1375 
// <32bit are not part of Highway API, but used in Broadcast.
// Generates TableLookupLanes(v, idx): permutes lanes of v by the indices in
// idx, which are reinterpreted as unsigned for the tbl instruction.
#define HWY_SVE_TABLE(BASE, CHAR, BITS, NAME, OP) \
 HWY_API HWY_SVE_V(BASE, BITS) \
 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(int, BITS) idx) { \
 const auto idx_u = BitCast(RebindToUnsigned<DFromV<decltype(v)>>(), idx); \
 return sv##OP##_##CHAR##BITS(v, idx_u); \
 }
1383 
1385 #undef HWY_SVE_TABLE
1386 
1387 // ------------------------------ Reverse
1388 
1389 #if 0 // if we could assume VL is a power of two
1390 #error "Update macro"
1391 #endif
// Generates Reverse(d, v). The hardware reverses all hardware lanes, which
// may exceed Lanes(d) (capped or non-power-of-two vectors), so slide the
// reversed vector down by the number of surplus lanes.
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_SVE_V(BASE, BITS) \
 NAME(Simd<HWY_SVE_T(BASE, BITS), N> d, HWY_SVE_V(BASE, BITS) v) { \
 const auto reversed = sv##OP##_##CHAR##BITS(v); \
 /* Shift right to remove extra (non-pow2 and remainder) lanes. */ \
 const size_t all_lanes = \
 detail::AllHardwareLanes(hwy::SizeTag<BITS / 8>()); \
 /* TODO(janwas): on SVE2, use whilege. */ \
 const svbool_t mask = Not(FirstN(d, all_lanes - Lanes(d))); \
 return detail::Splice(reversed, reversed, mask); \
 }
1404 
1406 #undef HWY_SVE_REVERSE
1407 
1408 // ------------------------------ Compress (PromoteTo)
1409 
// Generates Compress(v, mask): moves the mask-selected lanes to the front
// of the vector. NOTE(review): values of the remaining lanes follow the
// instantiated OP's semantics (presumably svcompact, which zeroes them) —
// confirm callers do not rely on the tail.
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, NAME, OP) \
 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
 return sv##OP##_##CHAR##BITS(mask, v); \
 }
1414 
1416 #undef HWY_SVE_COMPRESS
1417 
// 16-bit Compress: implemented via two 32-bit Compress calls on the widened
// halves, then demoting and splicing the results back together.
template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
HWY_API V Compress(V v, svbool_t mask16) {
 static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
 const DFromV<V> d16;

 // Promote vector and mask to 32-bit
 const RepartitionToWide<decltype(d16)> dw;
 const auto v32L = PromoteTo(dw, v);
 const auto v32H = detail::PromoteUpperTo(dw, v);
 // svunpklo/hi widen the predicate's lower/upper half to 32-bit elements.
 const svbool_t mask32L = svunpklo_b(mask16);
 const svbool_t mask32H = svunpkhi_b(mask16);

 const auto compressedL = Compress(v32L, mask32L);
 const auto compressedH = Compress(v32H, mask32H);

 // Demote to 16-bit (already in range) - separately so we can splice
 const V evenL = BitCast(d16, compressedL);
 const V evenH = BitCast(d16, compressedH);
 const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
 const V v16H = detail::ConcatEven(evenH, evenH);

 // We need to combine two vectors of non-constexpr length, so the only option
 // is Splice, which requires us to synthesize a mask. NOTE: this function uses
 // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
 const size_t countL = detail::CountTrueFull(dw, mask32L);
 const auto compressed_maskL = FirstN(d16, countL);
 return detail::Splice(v16H, v16L, compressed_maskL);
}
1446 
1447 // Must treat float16_t as integers so we can ConcatEven.
1448 HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
1449  const DFromV<decltype(v)> df;
1450  const RebindToSigned<decltype(df)> di;
1451  return BitCast(df, Compress(BitCast(di, v), mask16));
1452 }
1453 
1454 // ------------------------------ CompressStore
1455 
// Stores the compressed (mask-selected) lanes to unaligned memory and
// returns how many were selected. Note: a full vector is written, so up to
// Lanes(d) elements of `unaligned` are clobbered, not just the count.
template <class V, class M, class D>
HWY_API size_t CompressStore(const V v, const M mask, const D d,
 TFromD<D>* HWY_RESTRICT unaligned) {
 StoreU(Compress(v, mask), d, unaligned);
 return CountTrue(d, mask);
}
1462 
1463 // ================================================== BLOCKWISE
1464 
1465 // ------------------------------ CombineShiftRightBytes
1466 
1467 namespace detail {
1468 
1469 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
1470 // offsets are implicitly relative to the start of their 128-bit block.
// Number of lanes per 128-bit block for type T, capped at N.
template <typename T, size_t N>
constexpr size_t LanesPerBlock(Simd<T, N> /* tag */) {
 // We might have a capped vector smaller than a block, so honor that.
 return HWY_MIN(16 / sizeof(T), N);
}

// Given iota0 = 0, 1, 2, ..., returns each lane's block-start index by
// clearing the low bits (LanesPerBlock is a power of two here).
template <class D, class V>
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
 using T = MakeUnsigned<TFromD<D>>;
 return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
}

// Mask that is true for the first kLanes lanes of every 128-bit block.
template <size_t kLanes, class D>
svbool_t FirstNPerBlock(D d) {
 const RebindToSigned<D> di;
 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
 const auto idx_mod = detail::AndN(Iota(di, 0), kLanesPerBlock - 1);
 return detail::LtN(BitCast(di, idx_mod), kLanes);
}
1490 
1491 } // namespace detail
1492 
// Per 128-bit block: result bytes = (hi:lo) >> (8 * kBytes), i.e. the lower
// 16 - kBytes bytes come from lo shifted down, the top kBytes from hi.
template <size_t kBytes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
 const Repartition<uint8_t, decltype(d)> d8;
 const auto hi8 = BitCast(d8, hi);
 const auto lo8 = BitCast(d8, lo);
 // Slide hi up so its first bytes land at block offset 16 - kBytes.
 const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
 // Slide lo down by kBytes.
 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
 const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
}
1503 
1504 // ------------------------------ Shuffle2301
1505 
// Generates Shuffle2301 for 32-bit lanes: swaps adjacent lanes within each
// 64-bit pair by applying the u64 OP (instantiated elsewhere, not shown) to
// the raw bits.
#define HWY_SVE_SHUFFLE_2301(BASE, CHAR, BITS, NAME, OP) \
 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
 const DFromV<decltype(v)> d; \
 const svuint64_t vu64 = BitCast(Repartition<uint64_t, decltype(d)>(), v); \
 return BitCast(d, sv##OP##_u64_x(HWY_SVE_PTRUE(64), vu64)); \
 }
1512 
1514 #undef HWY_SVE_SHUFFLE_2301
1515 
1516 template <class V, HWY_IF_FLOAT_V(V)>
1517 HWY_API V Shuffle2301(const V v) {
1518  const DFromV<V> df;
1519  const RebindToUnsigned<decltype(df)> du;
1520  return BitCast(df, Shuffle2301(BitCast(du, v)));
1521 }
1522 
1523 // ------------------------------ Shuffle2103
// Rotates the four 32-bit lanes of each block: result lanes (2,1,0,3).
template <class V>
HWY_API V Shuffle2103(const V v) {
 const DFromV<V> d;
 const Repartition<uint8_t, decltype(d)> d8;
 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
 const svuint8_t v8 = BitCast(d8, v);
 return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
}

// ------------------------------ Shuffle0321
// Rotates the four 32-bit lanes of each block: result lanes (0,3,2,1).
template <class V>
HWY_API V Shuffle0321(const V v) {
 const DFromV<V> d;
 const Repartition<uint8_t, decltype(d)> d8;
 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
 const svuint8_t v8 = BitCast(d8, v);
 return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
}

// ------------------------------ Shuffle1032
// Swaps the 64-bit halves of each 128-bit block (32-bit lanes: 1,0,3,2).
template <class V>
HWY_API V Shuffle1032(const V v) {
 const DFromV<V> d;
 const Repartition<uint8_t, decltype(d)> d8;
 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
 const svuint8_t v8 = BitCast(d8, v);
 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
}

// ------------------------------ Shuffle01
// Swaps the two 64-bit lanes of each 128-bit block.
template <class V>
HWY_API V Shuffle01(const V v) {
 const DFromV<V> d;
 const Repartition<uint8_t, decltype(d)> d8;
 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
 const svuint8_t v8 = BitCast(d8, v);
 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
}
1562 
1563 // ------------------------------ Shuffle0123
1564 template <class V>
1565 HWY_API V Shuffle0123(const V v) {
1566  return Shuffle2301(Shuffle1032(v));
1567 }
1568 
1569 // ------------------------------ TableLookupBytes
1570 
// Byte-table lookup with x86-compatible semantics: indices are relative to
// each 128-bit block, so add each block's byte offset before the full-vector
// lane lookup.
template <class V, class VI>
HWY_API VI TableLookupBytes(const V v, const VI idx) {
 const DFromV<VI> d;
 const Repartition<uint8_t, decltype(d)> du8;
 const Repartition<int8_t, decltype(d)> di8;
 const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
 const auto idx8 = BitCast(di8, Add(BitCast(du8, idx), offsets128));
 return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
}
1580 
// Like TableLookupBytes, but an index with its MSB set yields 0 in that
// byte (x86 pshufb semantics).
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
 const DFromV<VI> d;
 // Mask size must match vector type, so cast everything to this type.
 const Repartition<int8_t, decltype(d)> di8;

 auto idx8 = BitCast(di8, idx);
 const auto msb = Lt(idx8, Zero(di8));
// Prevent overflow in table lookups (unnecessary if native)
#if defined(HWY_EMULATE_SVE)
 idx8 = IfThenZeroElse(msb, idx8);
#endif

 const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
 // Zero out the lanes whose index had the MSB set.
 return BitCast(d, IfThenZeroElse(msb, lookup));
}
1597 
1598 // ------------------------------ Broadcast
1599 
// Broadcasts lane kLane of each 128-bit block to all lanes of that block,
// by looking up index (block_start + kLane) for every lane.
template <int kLane, class V>
HWY_API V Broadcast(const V v) {
 const DFromV<V> d;
 const RebindToSigned<decltype(d)> di;
 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
 static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
 auto idx = detail::OffsetsOf128BitBlocks(di, Iota(di, 0));
 if (kLane != 0) {
  idx = detail::AddN(idx, kLane);
 }
 return TableLookupLanes(v, idx);
}
1612 
1613 // ------------------------------ ShiftLeftLanes
1614 
1615 template <size_t kLanes, class D, class V = VFromD<D>>
1616 HWY_API V ShiftLeftLanes(D d, const V v) {
1617  const RebindToSigned<decltype(d)> di;
1618  const auto zero = Zero(d);
1619  const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
1620  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
1621  return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
1622 }
1623 
// Convenience overload that deduces the descriptor from V.
template <size_t kLanes, class V>
HWY_API V ShiftLeftLanes(const V v) {
 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
}
1628 
1629 // ------------------------------ ShiftRightLanes
1630 template <size_t kLanes, typename T, size_t N, class V = VFromD<Simd<T, N>>>
1632  const RebindToSigned<decltype(d)> di;
1633  // For partial vectors, clear upper lanes so we shift in zeros.
1634  if (N != HWY_LANES(T)) {
1635  v = IfThenElseZero(detail::Mask(d), v);
1636  }
1637 
1638  const auto shifted = detail::Ext<kLanes>(v, v);
1639  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
1640  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
1641  const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
1642  return IfThenElseZero(mask, shifted);
1643 }
1644 
1645 // ------------------------------ ShiftLeftBytes
1646 
// Byte shifts are lane shifts on the u8 reinterpretation of the vector.
template <int kBytes, class D, class V = VFromD<D>>
HWY_API V ShiftLeftBytes(const D d, const V v) {
 const Repartition<uint8_t, decltype(d)> d8;
 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
}

template <int kBytes, class V>
HWY_API V ShiftLeftBytes(const V v) {
 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
}

// ------------------------------ ShiftRightBytes
template <int kBytes, class D, class V = VFromD<D>>
HWY_API V ShiftRightBytes(const D d, const V v) {
 const Repartition<uint8_t, decltype(d)> d8;
 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
}
1664 
1665 // ------------------------------ InterleaveLower
1666 
1667 namespace detail {
1669 // Do not use zip2 to implement PromoteUpperTo or similar because vectors may be
1670 // non-powers of two, so getting the actual "upper half" requires MaskUpperHalf.
1671 } // namespace detail
1672 
// Interleaves the lower halves of each 128-bit block of a and b:
// a0 b0 a1 b1 ... First gather each input's block lower halves (even u64
// lanes) into the bottom of the vector, then zip.
template <class D, class V>
HWY_API V InterleaveLower(D d, const V a, const V b) {
 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
 // Move lower halves of blocks to lower half of vector.
 const Repartition<uint64_t, decltype(d)> d64;
 const auto a64 = BitCast(d64, a);
 const auto b64 = BitCast(d64, b);
 const auto a_blocks = detail::ConcatEven(a64, a64); // only lower half needed
 const auto b_blocks = detail::ConcatEven(b64, b64);

 return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
}

template <class V>
HWY_API V InterleaveLower(const V a, const V b) {
 return InterleaveLower(DFromV<V>(), a, b);
}
1690 
1691 // ------------------------------ InterleaveUpper
1692 
1693 // Full vector: guaranteed to have at least one block
// Full vector (at least one block): interleave the upper halves of each
// 128-bit block of a and b.
template <typename T, class V = VFromD<Full<T>>>
HWY_API V InterleaveUpper(Simd<T, HWY_LANES(T)> d, const V a, const V b) {
 // Move upper halves of blocks to lower half of vector.
 const Repartition<uint64_t, decltype(d)> d64;
 const auto a64 = BitCast(d64, a);
 const auto b64 = BitCast(d64, b);
 const auto a_blocks = detail::ConcatOdd(a64, a64); // only lower half needed
 const auto b_blocks = detail::ConcatOdd(b64, b64);
 return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
}

// Capped: less than one block — the "upper half" is the upper half of the
// capped vector, so take UpperHalf of each input and interleave those.
template <typename T, size_t N, HWY_IF_LE64(T, N), class V = VFromD<Simd<T, N>>>
HWY_API V InterleaveUpper(Simd<T, N> d, const V a, const V b) {
 static_assert(IsSame<T, TFromV<V>>(), "D/V mismatch");
 const Half<decltype(d)> d2;
 return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
}

// Partial: whether we have a full block is only known at runtime.
template <typename T, size_t N,
 hwy::EnableIf<(N < HWY_LANES(T) && N * sizeof(T) >= 16)>* = nullptr,
 class V = VFromD<Simd<T, N>>>
HWY_API V InterleaveUpper(Simd<T, N> d, const V a, const V b) {
 static_assert(IsSame<T, TFromV<V>>(), "D/V mismatch");
 // Less than one block: treat as capped
 if (Lanes(d) * sizeof(T) < 16) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
 }
 return InterleaveUpper(Full<T>(), a, b);
}
1726 
1727 // ------------------------------ ZipLower
1728 
// Interleaves lower halves and reinterprets each adjacent pair of narrow
// lanes as one lane of twice the width.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
 const RepartitionToNarrow<DW> dn;
 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
 return BitCast(dw, InterleaveLower(dn, a, b));
}
// Convenience overload deducing the wide descriptor from V.
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(const V a, const V b) {
 return BitCast(DW(), InterleaveLower(D(), a, b));
}

// ------------------------------ ZipUpper
// Same as ZipLower but for the upper halves of each block.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
 const RepartitionToNarrow<DW> dn;
 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
 return BitCast(dw, InterleaveUpper(dn, a, b));
}
1747 
1748 // ================================================== REDUCE
1749 
// Generates reductions (e.g. sum/min/max of lanes): reduces the active
// lanes to a scalar and broadcasts it to all lanes of the result vector.
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_SVE_V(BASE, BITS) \
 NAME(HWY_SVE_D(BASE, BITS, N) d, HWY_SVE_V(BASE, BITS) v) { \
 return Set(d, sv##OP##_##CHAR##BITS(detail::Mask(d), v)); \
 }
1756 
1760 // NaN if all are
1763 
1764 #undef HWY_SVE_REDUCE
1765 
1766 // ================================================== Ops with dependencies
1767 
1768 // ------------------------------ PromoteTo bfloat16 (ZipLower)
1769 
// bf16 -> f32: a bf16 is the upper 16 bits of an f32 (little-endian), so
// interleave a zero lower half beneath each bf16 lane.
template <size_t N>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N> df32, const svuint16_t v) {
 return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
}
1774 
1775 // ------------------------------ ReorderDemote2To (OddEven)
1776 
// Demotes two f32 vectors to bf16 (truncation: keep the upper 16 bits of
// each f32) and interleaves them: a's bits land in odd u16 lanes, b's
// (shifted down into the even halves) in even lanes.
template <size_t N>
HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N> dbf16, svfloat32_t a,
 svfloat32_t b) {
 const RebindToUnsigned<decltype(dbf16)> du16;
 const Repartition<uint32_t, decltype(dbf16)> du32;
 const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
1785 
1786 // ------------------------------ ZeroIfNegative (Lt, IfThenElse)
1787 template <class V>
1788 HWY_API V ZeroIfNegative(const V v) {
1789  const auto v0 = Zero(DFromV<V>());
1790  // We already have a zero constant, so avoid IfThenZeroElse.
1791  return IfThenElse(Lt(v, v0), v0, v);
1792 }
1793 
1794 // ------------------------------ BroadcastSignBit (ShiftRight)
1795 template <class V>
1797  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
1798 }
1799 
1800 // ------------------------------ AverageRound (ShiftRight)
1801 
1802 #if HWY_TARGET == HWY_SVE2
1805 #else
// (a + b + 1) >> 1, rounding halves upward. NOTE(review): the intermediate
// a + b can wrap for lanes near the type's maximum — confirm callers'
// ranges (the SVE2 branch above is handled separately).
template <class V>
V AverageRound(const V a, const V b) {
 return ShiftRight<1>(Add(Add(a, b), Set(DFromV<V>(), 1)));
}
1810 #endif // HWY_TARGET == HWY_SVE2
1811 
1812 // ------------------------------ LoadMaskBits (TestBit)
1813 
1814 // `p` points to at least 8 readable bytes, not all of which need be valid.
// 1-byte lanes: expand packed mask bits so each byte lane tests its own bit.
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
 const RebindToUnsigned<D> du;
 const svuint8_t iota = Iota(du, 0);

 // Load correct number of bytes (bits/8) with 7 zeros after each.
 const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
 // Replicate bytes 8x such that each byte contains the bit that governs it.
 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));

 // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
 const svuint8_t bit = Shl(Set(du, 1), detail::AndN(iota, 7));

 return TestBit(rep8, bit);
}

// 2-byte lanes: one mask byte governs eight u16 lanes.
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
 const uint8_t* HWY_RESTRICT bits) {
 const RebindToUnsigned<D> du;
 const Repartition<uint8_t, D> du8;

 // There may be up to 128 bits; avoid reading past the end.
 const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);

 // Replicate bytes 16x such that each lane contains the bit that governs it.
 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));

 // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
 const svuint16_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));

 return TestBit(BitCast(du, rep16), bit);
}
1848 
// 4-byte lanes: one mask byte governs eight u32 lanes.
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
 const uint8_t* HWY_RESTRICT bits) {
 const RebindToUnsigned<D> du;
 const Repartition<uint8_t, D> du8;

 // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable,
 // so we can skip computing the actual length (Lanes(du)+7)/8.
 const svuint8_t bytes = svld1(FirstN(du8, 8), bits);

 // Replicate bytes 32x such that each lane contains the bit that governs it.
 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));

 // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
 const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));

 return TestBit(BitCast(du, rep32), bit);
}

// 8-byte lanes: broadcast all mask bits and test one unique bit per lane.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
 const uint8_t* HWY_RESTRICT bits) {
 const RebindToUnsigned<D> du;

 // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
 // The "at least 8 byte" guarantee in quick_reference ensures this is safe.
 uint32_t mask_bits;
 CopyBytes<4>(bits, &mask_bits);
 const auto vbits = Set(du, mask_bits);

 // 2 ^ {0,1, .., 31}, will not have more lanes than that.
 const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));

 return TestBit(vbits, bit);
}
1884 
1885 // ------------------------------ StoreMaskBits
1886 
1887 namespace detail {
1888 
// Returns mask ? 1 : 0 in BYTE lanes. Wider-lane variants narrow the 0/1
// values down to bytes; only the lower part of the result is meaningful.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API svuint8_t BoolFromMask(Simd<T, N> d, svbool_t m) {
 return svdup_n_u8_z(m, 1);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API svuint8_t BoolFromMask(Simd<T, N> d, svbool_t m) {
 const Repartition<uint8_t, decltype(d)> d8;
 const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
 return detail::ConcatEven(b16, b16); // only lower half needed
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API svuint8_t BoolFromMask(Simd<T, N> d, svbool_t m) {
 return U8FromU32(svdup_n_u32_z(m, 1));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API svuint8_t BoolFromMask(Simd<T, N> d, svbool_t m) {
 const Repartition<uint32_t, decltype(d)> d32;
 const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
 return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
}
1910 
1911 } // namespace detail
1912 
// Stores the mask as packed bits (1 bit per lane, LSB-first) and returns the
// number of bytes written. `bits` points to at least 8 writable bytes.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(Simd<T, N> d, svbool_t m, uint8_t* bits) {
  const Repartition<uint8_t, decltype(d)> d8;
  const Repartition<uint16_t, decltype(d)> d16;
  const Repartition<uint32_t, decltype(d)> d32;
  const Repartition<uint64_t, decltype(d)> d64;
  // One 0/1 byte per mask lane.
  auto x = detail::BoolFromMask(d, m);
  // Compact bytes to bits. Could use SVE2 BDEP, but it's optional.
  // Each step ORs the high partner's bit into the low byte at the correct
  // position: shift by 7/14/28 moves bits 0..k of the upper half next to the
  // bits already gathered in the lower byte.
  x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
  x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
  x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));

  const size_t num_bits = Lanes(d);
  const size_t num_bytes = (num_bits + 8 - 1) / 8;  // Round up, see below

  // Truncate each u64 (whose low byte now holds 8 packed bits) and store.
  svst1b_u64(FirstN(d64, num_bytes), bits, BitCast(d64, x));

  // Non-full byte, need to clear the undefined upper bits. Can happen for
  // capped/partial vectors or large T and small hardware vectors.
  if (num_bits < 8) {
    const int mask = (1 << num_bits) - 1;
    bits[0] = static_cast<uint8_t>(bits[0] & mask);
  }
  // Else: we wrote full bytes because num_bits is a power of two >= 8.

  return num_bytes;
}
1942 
1943 // ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
1944 
1945 template <class V>
1946 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
1947  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
1948 }
1949 
1950 template <class D>
1951 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
1952  D d, TFromD<D>* HWY_RESTRICT unaligned) {
1953  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
1954 }
1955 
1956 // ------------------------------ MulEven (InterleaveEven)
1957 
1958 #if HWY_TARGET == HWY_SVE2
1959 namespace detail {
1961 } // namespace detail
1962 #endif
1963 
1964 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1965 HWY_API VFromD<DW> MulEven(const V a, const V b) {
1966 #if HWY_TARGET == HWY_SVE2
1967  return BitCast(DW(), detail::MulEven(a, b));
1968 #else
1969  const auto lo = Mul(a, b);
1970  const auto hi = detail::MulHigh(a, b);
1971  return BitCast(DW(), detail::InterleaveEven(lo, hi));
1972 #endif
1973 }
1974 
1975 HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
1976  const auto lo = Mul(a, b);
1977  const auto hi = detail::MulHigh(a, b);
1978  return detail::InterleaveEven(lo, hi);
1979 }
1980 
1981 HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
1982  const auto lo = Mul(a, b);
1983  const auto hi = detail::MulHigh(a, b);
1984  return detail::InterleaveOdd(lo, hi);
1985 }
1986 
1987 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1988 
1989 template <size_t N>
1990 HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N> df32, svuint16_t a,
1991  svuint16_t b,
1992  const svfloat32_t sum0,
1993  svfloat32_t& sum1) {
1994  // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16.
1995  const Repartition<uint16_t, decltype(df32)> du16;
1996  const RebindToUnsigned<decltype(df32)> du32;
1997  const svuint16_t zero = Zero(du16);
1998  const svuint32_t a0 = ZipLower(du32, zero, BitCast(du16, a));
1999  const svuint32_t a1 = ZipUpper(du32, zero, BitCast(du16, a));
2000  const svuint32_t b0 = ZipLower(du32, zero, BitCast(du16, b));
2001  const svuint32_t b1 = ZipUpper(du32, zero, BitCast(du16, b));
2002  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2003  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2004 }
2005 
2006 // ------------------------------ AESRound / CLMul
2007 
2008 #if defined(__ARM_FEATURE_SVE2_AES)
2009 
2010 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
2011 #ifdef HWY_NATIVE_AES
2012 #undef HWY_NATIVE_AES
2013 #else
2014 #define HWY_NATIVE_AES
2015 #endif
2016 
2017 HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
2018  // NOTE: it is important that AESE and AESMC be consecutive instructions so
2019  // they can be fused. AESE includes AddRoundKey, which is a different ordering
2020  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
2021  // round key (the compiler will hopefully optimize this for multiple rounds).
2022  const svuint8_t zero = svdup_n_u8(0);
2023  return Xor(vaesmcq_u8(vaeseq_u8(state, zero), round_key));
2024 }
2025 
2026 HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
2027  return svpmullb_pair(a, b);
2028 }
2029 
2030 HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
2031  return svpmullt_pair(a, b);
2032 }
2033 
2034 #endif // __ARM_FEATURE_SVE2_AES
2035 
// ================================================== END MACROS
namespace detail {  // for code folding
// Undefine the X-macro helpers now that all ops have been generated, so they
// do not leak into user code or into other per-target headers.
#undef HWY_IF_FLOAT_V
#undef HWY_IF_LANE_SIZE_V
#undef HWY_IF_SIGNED_V
#undef HWY_IF_UNSIGNED_V
#undef HWY_SVE_D
#undef HWY_SVE_FOREACH
#undef HWY_SVE_FOREACH_F
#undef HWY_SVE_FOREACH_F16
#undef HWY_SVE_FOREACH_F32
#undef HWY_SVE_FOREACH_F64
#undef HWY_SVE_FOREACH_I
#undef HWY_SVE_FOREACH_I08
#undef HWY_SVE_FOREACH_I16
#undef HWY_SVE_FOREACH_I32
#undef HWY_SVE_FOREACH_I64
#undef HWY_SVE_FOREACH_IF
#undef HWY_SVE_FOREACH_U
#undef HWY_SVE_FOREACH_U08
#undef HWY_SVE_FOREACH_U16
#undef HWY_SVE_FOREACH_U32
#undef HWY_SVE_FOREACH_U64
#undef HWY_SVE_FOREACH_UI
#undef HWY_SVE_FOREACH_UI08
#undef HWY_SVE_FOREACH_UI16
#undef HWY_SVE_FOREACH_UI32
#undef HWY_SVE_FOREACH_UI64
#undef HWY_SVE_FOREACH_UIF3264
#undef HWY_SVE_PTRUE
#undef HWY_SVE_RETV_ARGD
#undef HWY_SVE_RETV_ARGPV
#undef HWY_SVE_RETV_ARGPVN
#undef HWY_SVE_RETV_ARGPVV
#undef HWY_SVE_RETV_ARGV
#undef HWY_SVE_RETV_ARGVN
#undef HWY_SVE_RETV_ARGVV
#undef HWY_SVE_T
#undef HWY_SVE_V

}  // namespace detail
2077 // NOLINTNEXTLINE(google-readability-namespace-comments)
2078 } // namespace HWY_NAMESPACE
2079 } // namespace hwy
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:102
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:62
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:801
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:248
HWY_AFTER_NAMESPACE()
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:672
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:261
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:72
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:167
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1141
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:157
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1392
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:875
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:59
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1209
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:125
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:530
#define HWY_SVE_CAST(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:340
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1750
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:161
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:503
#define HWY_SVE_IOTA(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1236
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:117
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:110
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:65
#define HWY_SVE_PTRUE(BITS)
Definition: arm_sve-inl.h:259
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:702
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:551
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:867
#define HWY_SVE_SHUFFLE_2301(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1506
#define HWY_SVE_FMA(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:602
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:88
#define HWY_SVE_STORE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:818
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:890
#define HWY_SVE_SET(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:287
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1410
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1152
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:94
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:660
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:435
#define HWY_SVE_STORE3(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:915
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:329
#define HWY_SPECIALIZE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:137
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:179
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:730
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:82
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:184
#define HWY_SVE_TABLE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1377
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:899
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:809
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1339
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:726
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:459
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:121
#define HWY_SVE_EXT(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:1268
HWY_BEFORE_NAMESPACE()
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:60
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:76
#define HWY_SVE_RETV_ARGD(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:147
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:106
#define HWY_SVE_LOAD(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:793
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:172
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:425
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:98
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:948
#define HWY_SVE_STORE4(BASE, CHAR, BITS, NAME, OP)
Definition: arm_sve-inl.h:929
#define HWY_RESTRICT
Definition: base.h:58
#define HWY_API
Definition: base.h:117
#define HWY_MIN(a, b)
Definition: base.h:122
#define HWY_INLINE
Definition: base.h:59
#define HWY_DASSERT(condition)
Definition: base.h:163
svbool_t MaskLowerHalf(Simd< T, N > d)
Definition: arm_sve-inl.h:1257
HWY_API svuint8_t BoolFromMask(Simd< T, N > d, svbool_t m)
Definition: arm_sve-inl.h:1891
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag< 1 >)
Definition: arm_sve-inl.h:195
constexpr size_t LanesPerBlock(Simd< T, N >)
Definition: arm_sve-inl.h:1472
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition: arm_sve-inl.h:1478
svbool_t MaskUpperHalf(Simd< T, N > d)
Definition: arm_sve-inl.h:1261
VI SaturateI(VI v)
Definition: arm_sve-inl.h:1039
svbool_t FirstNPerBlock(D d)
Definition: arm_sve-inl.h:1484
VU SaturateU(VU v)
Definition: arm_sve-inl.h:1033
HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) HWY_API svbool_t PFalse()
Definition: arm_sve-inl.h:267
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:578
svbool_t Mask(Simd< T, N > d)
Definition: arm_sve-inl.h:277
HWY_INLINE size_t HardwareLanes(hwy::SizeTag< 1 >)
Definition: arm_sve-inl.h:209
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:551
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N > df, const svfloat16_t v)
Definition: arm_sve-inl.h:1019
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:805
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1573
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_API Vec128< uint32_t, N > TableLookupLanes(const Vec128< uint32_t, N > v, const Indices128< uint32_t, N > idx)
Definition: arm_neon-inl.h:3342
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3064
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1167
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:744
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1520
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5035
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3435
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2829
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5027
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3629
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2949
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3052
HWY_API size_t StoreMaskBits(Simd< T, N >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4528
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4207
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5040
HWY_API Mask128< T, N > FirstN(const Simd< T, N > d, size_t num)
Definition: arm_neon-inl.h:1806
HWY_API Vec128< T, N > Load(Simd< T, N > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2152
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_API Vec128< T, N > LoadDup128(Simd< T, N > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2164
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4054
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: shared-inl.h:158
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2352
V Shl(V a, V b)
Definition: arm_neon-inl.h:5018
HWY_API Vec128< T, N > VecFromMask(const Mask128< T, N > v)
Definition: arm_neon-inl.h:1607
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5044
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1206
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3760
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1879
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3495
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1917
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N > d, const int32_t *idx)
Definition: arm_neon-inl.h:3323
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1600
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: shared-inl.h:149
constexpr HWY_API size_t Lanes(Simd< T, N >)
Definition: arm_sve-inl.h:226
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3947
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3726
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3389
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:422
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3070
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3802
Vec128< T, N > Iota(const Simd< T, N > d, const T2 first)
Definition: arm_neon-inl.h:734
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3483
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:3907
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2903
typename D::Twice Twice
Definition: shared-inl.h:168
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: shared-inl.h:147
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1619
HWY_API Vec128< T, N > Undefined(Simd< T, N >)
Definition: arm_neon-inl.h:724
HWY_API intptr_t FindFirstTrue(const Simd< T, N >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4520
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:1975
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4509
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5000
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3395
HWY_API svuint32_t U32FromU8(svuint8_t v)
Definition: arm_sve-inl.h:974
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4538
HWY_API bool AllTrue(const Simd< T, N > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4557
HWY_API void StoreInterleaved3(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4829
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_API VFromD< DW > ZipUpper(DW dw, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3538
HWY_API VFromD< D > ConcatEven(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1174
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1581
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1655
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2170
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2846
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: shared-inl.h:160
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:475
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:325
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3506
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1827
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3041
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:1953
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1126
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:449
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec128< uint8_t, 8 > hi, Vec128< uint8_t, 8 > lo)
Definition: arm_neon-inl.h:3566
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4071
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2157
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4036
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5004
HWY_API Vec128< T > Reverse(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:3362
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3637
HWY_API V InterleaveUpper(Simd< T, N >, V a, V b)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< uint32_t, 2 > Shuffle2301(const Vec128< uint32_t, 2 > v)
Definition: arm_neon-inl.h:1698
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:319
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5049
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:532
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2890
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1366
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4276
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
V Shr(V a, V b)
Definition: arm_neon-inl.h:5022
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:720
HWY_API VFromD< DW > ZipLower(const V a, const V b)
Definition: arm_sve-inl.h:1736
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4812
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:3844
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4211
Neg(const Vec128< int64_t, 1 > v)
Definition: arm_neon-inl.h:866
HWY_API Vec128< uint8_t, 4 > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2699
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3235
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3752
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4787
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2818
HWY_API VFromD< DW > ZipLower(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:3527
typename D::Half Half
Definition: shared-inl.h:164
typename D::template Repartition< T > Repartition
Definition: shared-inl.h:155
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4203
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:2665
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4802
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3401
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:3957
HWY_API void StoreInterleaved4(const Vec128< uint8_t > v0, const Vec128< uint8_t > v1, const Vec128< uint8_t > v2, const Vec128< uint8_t > v3, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:4864
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5013
HWY_API Vec128< uint8_t, 8 > UpperHalf(Simd< uint8_t, 8 >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3096
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3407
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5009
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1590
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:535
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3385
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1320
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4780
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API VFromD< D > ConcatOdd(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1163
HWY_API void Store(Vec128< T, N > v, Simd< T, N > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2343
typename D::T TFromD
Definition: shared-inl.h:140
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3545
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4012
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1296
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
Definition: aligned_allocator.h:23
constexpr HWY_API bool IsSame()
Definition: base.h:260
typename EnableIfT< Condition, T >::type EnableIf
Definition: base.h:247
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:521
#define HWY_LANES(T)
Definition: set_macros-inl.h:80
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
Definition: arm_sve-inl.h:39
Definition: shared-inl.h:35
Definition: base.h:290
Definition: base.h:227
uint16_t bits
Definition: base.h:228