arm_sve-inl.h
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ARM SVE[2] vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

// If running on hardware whose vector length is known to be a power of two, we
// can skip fixups for non-power of two sizes.
#undef HWY_SVE_IS_POW2
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#define HWY_SVE_IS_POW2 1
#else
#define HWY_SVE_IS_POW2 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;
// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Float:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

// For all element sizes:
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories for a given element size:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t

}  // namespace detail

#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <>                                            \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
  };
HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
#undef HWY_SPECIALIZE

// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
// instructions, and we anyway only use it when the predicate is ptrue.

// vector = f(vector), e.g. Not
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
  }
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v);                            \
  }

// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
  HWY_API HWY_SVE_V(BASE, BITS)                                \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b);                        \
  }

// ------------------------------ Lanes

namespace detail {

// Returns actual lanes of a hardware vector without rounding to a power of two.
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
  return svcntb_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
  return svcnth_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
  return svcntw_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
  return svcntd_pat(SV_ALL);
}

// All-true mask from a macro
#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)

#if HWY_SVE_IS_POW2
#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
#else
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)

// Returns actual lanes of a hardware vector, rounded down to a power of two.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE size_t HardwareLanes() {
  return svcntb_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE size_t HardwareLanes() {
  return svcnth_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE size_t HardwareLanes() {
  return svcntw_pat(SV_POW2);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE size_t HardwareLanes() {
  return svcntd_pat(SV_POW2);
}

#endif  // HWY_SVE_IS_POW2

}  // namespace detail

// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
#if HWY_TARGET == HWY_SVE_256
template <typename T, size_t N, int kPow2>
HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
  return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N);
}
#elif HWY_TARGET == HWY_SVE2_128
template <typename T, size_t N, int kPow2>
HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
  return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N);
}
#else
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
  const size_t actual = detail::HardwareLanes<T>();
  // Common case of full vectors: avoid any extra instructions.
  if (detail::IsFull(d)) return actual;
  return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
}
#endif  // HWY_TARGET

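// Example (editor's sketch, not part of the original header): querying the
// runtime lane count for a full and a half-width descriptor.
//   const ScalableTag<float> d;   // full f32 vector
//   const size_t n = Lanes(d);    // runtime value, e.g. 8 when VL is 256-bit
//   const Half<decltype(d)> dh;
//   HWY_DASSERT(Lanes(dh) == n / 2);
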
// ================================================== MASK INIT

// One mask bit per byte; only the one belonging to the lowest byte is valid.

// ------------------------------ FirstN
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                               \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) {     \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit));  \
  }
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN

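// Example (editor's sketch): a mask covering only the first `count` lanes,
// e.g. for a loop remainder (`count` assumed caller-provided).
//   const ScalableTag<int32_t> d;
//   const svbool_t m = FirstN(d, count);  // lanes [0, count) are active
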
namespace detail {

#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {      \
    return HWY_SVE_PTRUE(BITS);                                         \
  }                                                                     \
  template <size_t N, int kPow2>                                        \
  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_ALL_PTRUE(BITS);                                     \
  }

HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
#undef HWY_SVE_WRAP_PTRUE

HWY_API svbool_t PFalse() { return svpfalse_b(); }

// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic) can ignore d and use PTrue instead.
template <class D>
svbool_t MakeMask(D d) {
  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
}

}  // namespace detail

// ================================================== INIT

// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) arg) {             \
    return sv##OP##_##CHAR##BITS(arg);                                        \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
#undef HWY_SVE_SET

// Required for Zero and VFromD
template <size_t N, int kPow2>
HWY_API svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(const D d) {
  return Set(d, 0);
}
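// Example (editor's sketch): broadcasting a constant and materializing zero.
//   const ScalableTag<uint16_t> d;
//   const auto one = Set(d, uint16_t{1});  // all lanes = 1
//   const auto zero = Zero(d);             // same as Set(d, 0)
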
// ------------------------------ Undefined

#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                            \
  HWY_API HWY_SVE_V(BASE, BITS)                             \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
    return sv##OP##_##CHAR##BITS();                         \
  }

HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)

// ------------------------------ BitCast

namespace detail {

// u8: no change
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) {  \
    return v;                                                             \
  }                                                                       \
  template <size_t N, int kPow2>                                          \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte(                          \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return v;                                                             \
  }

// All other types
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)                        \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) {               \
    return sv##OP##_u8_##CHAR##BITS(v);                                       \
  }                                                                           \
  template <size_t N, int kPow2>                                              \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                            \
      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v);                                     \
  }

HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)

#undef HWY_SVE_CAST_NOP
#undef HWY_SVE_CAST

template <size_t N, int kPow2>
HWY_INLINE svuint16_t BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
                                      svuint8_t v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
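// Example (editor's sketch): reinterpreting f32 lanes as their bit patterns.
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // lanes = 0x3F800000
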
// ================================================== LOGICAL

// detail::*N() functions accept a scalar argument to avoid extra Set().

// ------------------------------ Not
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not )  // NOLINT

// ------------------------------ And

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Xor

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ AndNot

namespace detail {
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n)
#undef HWY_SVE_RETV_ARGPVN_SWAP
}  // namespace detail

#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic)
#undef HWY_SVE_RETV_ARGPVV_SWAP

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V AndNot(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Need to return original type instead of unsigned.
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)               \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {        \
    return BitCast(DFromV<decltype(v)>(),                              \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt)
#undef HWY_SVE_POPCNT

// ================================================== SIGN

// ------------------------------ Neg
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)

// ------------------------------ Abs
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)

// ------------------------------ CopySign[ToAbs]

template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(abs, And(msb, sign));
}
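// Example (editor's sketch): CopySign returns the first argument's magnitude
// with the second argument's sign.
//   const ScalableTag<float> d;
//   const auto r = CopySign(Set(d, 2.0f), Set(d, -0.0f));  // lanes = -2.0f
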
// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)

// ------------------------------ Sub

namespace detail {
// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)          \
  HWY_API HWY_SVE_V(BASE, BITS)                                             \
      NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b);                             \
  }

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
#undef HWY_SVE_RETV_ARGPVN_MASK
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)

// ------------------------------ SumsOf8
HWY_API svuint64_t SumsOf8(const svuint8_t v) {
  const ScalableTag<uint32_t> du32;
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);

  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  // Compute pairwise sum of u32 and extend to u64.
  // TODO(janwas): on SVE2, we can instead use svaddp.
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
  return Add(hi, lo);
}
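// Example (editor's sketch): with all input bytes equal to 1, every u64 lane
// of the result holds 8 (the sum of its 8 consecutive u8 lanes).
//   const ScalableTag<uint8_t> d8;
//   const svuint64_t sums = SumsOf8(Set(d8, uint8_t{1}));
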
// ------------------------------ SaturatedAdd

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)

// ------------------------------ SaturatedSub

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)

// ------------------------------ AbsDiff
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)

// ------------------------------ ShiftLeft[Same]

#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <int kBits>                                                   \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {          \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);     \
  }                                                                      \
  HWY_API HWY_SVE_V(BASE, BITS)                                          \
      NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) {  \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits);      \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)

// ------------------------------ ShiftRight[Same]

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

#undef HWY_SVE_SHIFT_N

// ------------------------------ RotateRight

// TODO(janwas): svxar on SVE2
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
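// Example (editor's sketch): rotating each u32 lane right by 8 bits.
//   const ScalableTag<uint32_t> d;
//   const auto r = RotateRight<8>(Set(d, 0x11223344u));  // lanes = 0x44112233
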
// ------------------------------ Shl/Shr

#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)           \
  HWY_API HWY_SVE_V(BASE, BITS)                                   \
      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du;               \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v,      \
                                     BitCast(du, bits));          \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

#undef HWY_SVE_SHIFT

// ------------------------------ Min/Max

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
}  // namespace detail

// ------------------------------ Mul
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)

// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
}  // namespace detail

// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
#if HWY_TARGET == HWY_SVE2
  return svqrdmulh_s16(a, b);
#else
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const svuint16_t lo = BitCast(du, Mul(a, b));
  const svint16_t hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
  // carry that into the result. Instead isolate the top two bits because only
  // they can influence the result.
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
  return Add(Add(hi, hi), BitCast(d, rounding));
#endif
}
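// Example (editor's sketch): 0.5 * 0.5 in Q15. Both inputs are 0x4000; the
// result is (2 * 0x4000 * 0x4000 + 0x8000) >> 16 = 0x2000, i.e. 0.25 in Q15.
//   const ScalableTag<int16_t> d;
//   const auto half = Set(d, int16_t{0x4000});
//   const auto quarter = MulFixedPoint15(half, half);
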
// ------------------------------ Div
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)

// ------------------------------ ApproximateReciprocal
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)

// ------------------------------ Sqrt
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

// ------------------------------ ApproximateReciprocalSqrt
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)

// ------------------------------ MulAdd
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS)                                         \
      NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x,          \
           HWY_SVE_V(BASE, BITS) add) {                                 \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
  }

HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulAdd, mad)

// ------------------------------ NegMulAdd
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulAdd, msb)

// ------------------------------ MulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)

// ------------------------------ NegMulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)

#undef HWY_SVE_FMA

// ------------------------------ Round etc.

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)

// ================================================== MASK

// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
  return mask;
}

// ------------------------------ Mask logical

HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Per ARM, such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}
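// Example (editor's sketch, `v` assumed): combining comparison masks.
//   const ScalableTag<int32_t> d;
//   const svbool_t in_range = And(Lt(v, Set(d, 10)), Gt(v, Zero(d)));
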
// ------------------------------ CountTrue

#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)           \
  template <size_t N, int kPow2>                                       \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m);                   \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
#undef HWY_SVE_COUNT_TRUE

// For 16-bit Compress: full vector, not limited to SV_POW2.
namespace detail {

#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t N, int kPow2>                                             \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m);                           \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
#undef HWY_SVE_COUNT_TRUE_FULL

}  // namespace detail

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, svbool_t m) {
  return !svptest_any(detail::MakeMask(d), m);
}

// ------------------------------ AllTrue
template <class D>
HWY_API bool AllTrue(D d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}

// ------------------------------ FindFirstTrue
template <class D>
HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
  return AllFalse(d, m) ? intptr_t{-1}
                        : static_cast<intptr_t>(
                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}

// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)                \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no);                                 \
  }

HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class V>
HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse
template <class V>
HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
  return IfThenElse(mask, Zero(DFromV<V>()), no);
}
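// Example (editor's sketch, `v` assumed): clamp negative f32 lanes to zero.
//   const ScalableTag<float> d;
//   const auto clamped = IfThenElseZero(Ge(v, Zero(d)), v);
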
// ================================================== COMPARE

// mask = f(vector, vector)
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
  }
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
  }

// ------------------------------ Eq
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
}  // namespace detail

// ------------------------------ Ne
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
}  // namespace detail

// ------------------------------ Lt
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
}  // namespace detail

// ------------------------------ Le
HWY_SVE_FOREACH_F(HWY_SVE_COMPARE, Le, cmple)

#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N

// ------------------------------ Gt/Ge (swapped order)
template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
}

// ------------------------------ MaskFromVec (Ne)
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  return detail::NeN(v, static_cast<TFromV<V>>(0));
}

// ------------------------------ VecFromMask
template <class D>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const RebindToSigned<D> di;
  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
  // requires an extra instruction plus M0 pipeline.
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
}

// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)

#if HWY_TARGET == HWY_SVE2

#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)          \
  HWY_API HWY_SVE_V(BASE, BITS)                                   \
      NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
           HWY_SVE_V(BASE, BITS) no) {                            \
    return sv##OP##_##CHAR##BITS(yes, no, mask);                  \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
#undef HWY_SVE_IF_VEC

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
}

#else

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_TARGET == HWY_SVE2

// ------------------------------ Floating-point classification (Ne)

template <class V>
HWY_API svbool_t IsNaN(const V v) {
  return Ne(v, v);  // could also use cmpuo
}

template <class V>
HWY_API svbool_t IsInf(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
}

// Returns whether normal/subnormal/zero.
template <class V>
HWY_API svbool_t IsFinite(const V v) {
  using T = TFromV<V>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
}
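// Example (editor's sketch, `v` assumed): counting finite lanes.
//   const ScalableTag<float> d;
//   const size_t num_finite = CountTrue(d, IsFinite(v));
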
// ================================================== MEMORY

// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream

#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)     \
  template <size_t N, int kPow2>                           \
  HWY_API HWY_SVE_V(BASE, BITS)                            \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p);  \
  }

#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)   \
  template <size_t N, int kPow2>                                \
  HWY_API HWY_SVE_V(BASE, BITS)                                 \
      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {      \
    return sv##OP##_##CHAR##BITS(m, p);                         \
  }

#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                              \
  HWY_API HWY_SVE_V(BASE, BITS)                               \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,           \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {    \
    /* All-true predicate to load all 128 bits. */            \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p);        \
  }

#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)       \
  template <size_t N, int kPow2>                              \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                  \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,        \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v);         \
  }

#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                                \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m,        \
                    HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,    \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {   \
    sv##OP##_##CHAR##BITS(m, p, v);                             \
  }

HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDup128, ld1rq)
HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)

#undef HWY_SVE_LOAD
#undef HWY_SVE_MASKED_LOAD
#undef HWY_SVE_LOAD_DUP128
#undef HWY_SVE_STORE
#undef HWY_SVE_BLENDED_STORE

// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
template <size_t N, int kPow2>
HWY_API svuint16_t Load(Simd<bfloat16_t, N, kPow2> d,
                        const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N, kPow2> d,
                   bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ Load/StoreU

// SVE only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(const D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}
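// Example (editor's sketch; `in`, `out`, `count` assumed): a strip-mined loop
// whose final iteration is handled by a partial mask.
//   const ScalableTag<float> d;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     const svbool_t m = FirstN(d, count - i);
//     const auto v = MaskedLoad(m, d, in + i);
//     BlendedStore(Mul(v, v), m, d, out + i);
//   }
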
// ------------------------------ ScatterOffset/Index

#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                             \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                                 \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                       \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,               \
                    HWY_SVE_V(int, BITS) offset) {                           \
    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
                                          v);                                \
  }

#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                               \
  HWY_API void NAME(                                                           \
      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter)
#undef HWY_SVE_SCATTER_OFFSET
#undef HWY_SVE_SCATTER_INDEX

// ------------------------------ GatherOffset/Index

#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                            \
  HWY_API HWY_SVE_V(BASE, BITS)                                             \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                               \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                 \
           HWY_SVE_V(int, BITS) offset) {                                   \
    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
                                                 offset);                   \
  }
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)             \
  template <size_t N, int kPow2>                                           \
  HWY_API HWY_SVE_V(BASE, BITS)                                            \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                              \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                \
           HWY_SVE_V(int, BITS) index) {                                   \
    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
                                                index);                    \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
#undef HWY_SVE_GATHER_OFFSET
#undef HWY_SVE_GATHER_INDEX
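// Example (editor's sketch; `base` assumed to point to >= Lanes(d) floats):
//   const ScalableTag<float> d;
//   const RebindToSigned<decltype(d)> di;
//   const auto v = GatherIndex(d, base, Iota(di, 0));  // contiguous gather
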
// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
    const sv##BASE##BITS##x2_t tuple =                                        \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
    v0 = svget2(tuple, 0);                                                    \
    v1 = svget2(tuple, 1);                                                    \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)

#undef HWY_SVE_LOAD2

// ------------------------------ LoadInterleaved3

#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  template <size_t N, int kPow2>                                            \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                      \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,   \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                    HWY_SVE_V(BASE, BITS) & v2) {                           \
    const sv##BASE##BITS##x3_t tuple =                                      \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);              \
    v0 = svget3(tuple, 0);                                                  \
    v1 = svget3(tuple, 1);                                                  \
    v2 = svget3(tuple, 2);                                                  \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)

#undef HWY_SVE_LOAD3

// ------------------------------ LoadInterleaved4

#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)                       \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1,   \
                    HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
    const sv##BASE##BITS##x4_t tuple =                                        \
        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
    v0 = svget4(tuple, 0);                                                    \
    v1 = svget4(tuple, 1);                                                    \
    v2 = svget4(tuple, 2);                                                    \
    v3 = svget4(tuple, 3);                                                    \
  }
HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)

#undef HWY_SVE_LOAD4

// ------------------------------ StoreInterleaved2

#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)                 \
  template <size_t N, int kPow2>                                         \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,  \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                   \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {    \
    const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple);        \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)

#undef HWY_SVE_STORE2

// ------------------------------ StoreInterleaved3

#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)                      \
  template <size_t N, int kPow2>                                              \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,       \
                    HWY_SVE_V(BASE, BITS) v2,                                 \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {         \
    const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple);            \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)

#undef HWY_SVE_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                        \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
    const sv##BASE##BITS##x4_t quad =                                   \
        svcreate4##_##CHAR##BITS(v0, v1, v2, v3);                       \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad);        \
  }
HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)

#undef HWY_SVE_STORE4
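// Example (editor's sketch; `rgb` assumed): deinterleaving packed RGB bytes.
//   const ScalableTag<uint8_t> d;
//   svuint8_t r, g, b;
//   LoadInterleaved3(d, rgb, r, g, b);
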
// ================================================== CONVERT

// ------------------------------ PromoteTo

// Same sign
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)                \
  template <size_t N, int kPow2>                                            \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                       \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
    return sv##OP##_##CHAR##BITS(v);                                        \
  }

HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)

// 2x
template <size_t N, int kPow2>
HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}

// Sign change
template <size_t N, int kPow2>
HWY_API svint16_t PromoteTo(Simd<int16_t, N, kPow2> dto, svuint8_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint16_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
  const Repartition<uint16_t, DFromV<decltype(vfrom)>> du16;
  const Repartition<int16_t, decltype(du16)> di16;
  return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom)));
}

// ------------------------------ PromoteTo F

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
}  // namespace detail

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
                              const svfloat16_t v) {
  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
  // first replicate each lane once.
  const svfloat16_t vv = detail::ZipLower(v, v);
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svfloat32_t v) {
  const svfloat32_t vv = detail::ZipLower(v, v);
  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svint32_t v) {
  const svint32_t vv = detail::ZipLower(v, v);
  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}

// For 16-bit Compress
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
#undef HWY_SVE_PROMOTE_TO

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
  const RebindToUnsigned<decltype(df)> du;
  const RepartitionToNarrow<decltype(du)> dn;
  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
}

}  // namespace detail

// ------------------------------ DemoteTo U

namespace detail {

// Saturates unsigned vectors to half/quarter-width TN.
template <typename TN, class VU>
VU SaturateU(VU v) {
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
}

// Saturates signed vectors to half/quarter-width TN.
template <typename TN, class VI>
VI SaturateI(VI v) {
  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
}

}  // namespace detail

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const RepartitionToNarrow<decltype(du)> d2;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and quarter the width.
  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
  return svuzp1_u8(x2, x2);
}

HWY_API svuint8_t U8FromU32(const svuint32_t v) {
  const DFromV<svuint32_t> du32;
  const RepartitionToNarrow<decltype(du32)> du16;
  const RepartitionToNarrow<decltype(du16)> du8;

  const svuint16_t cast16 = BitCast(du16, v);
  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
  const svuint8_t cast8 = BitCast(du8, x2);
  return svuzp1_u8(cast8, cast8);
}
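// Example (editor's sketch): saturating demotion; 300 exceeds the u8 range
// and clamps to 255.
//   const ScalableTag<int16_t> d16;
//   const Rebind<uint8_t, decltype(d16)> d8;
//   const auto u8 = DemoteTo(d8, Set(d16, int16_t{300}));
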
// ------------------------------ DemoteTo I

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
#if HWY_TARGET == HWY_SVE2
  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
#if HWY_TARGET == HWY_SVE2
  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
  const RepartitionToWide<decltype(dn)> d2;
#if HWY_TARGET == HWY_SVE2
  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
#endif
  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
  return BitCast(dn, svuzp1_s8(v2, v2));
}

// ------------------------------ ConcatEven/ConcatOdd

// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
// full vector length, not rounded down to a power of two as we require).
namespace detail {

#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE HWY_SVE_V(BASE, BITS)                                    \
      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) {      \
    return sv##OP##_##CHAR##BITS(lo, hi);                             \
  }
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
#endif
#undef HWY_SVE_CONCAT_EVERY_SECOND

// Used to slide up / shift whole register left; mask indicates which range
// to take from lo, and the rest is filled from hi starting at its lowest.
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                      \
      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, lo, hi);                            \
  }
HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
#undef HWY_SVE_SPLICE

}  // namespace detail

template <class D>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
  (void)d;
  return detail::ConcatOdd(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
  (void)d;
  return detail::ConcatEven(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

// ------------------------------ DemoteTo F

template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
  const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
  const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
  return detail::ConcatOdd(in_even, in_even);  // can ignore upper half of vec
}

template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
  const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
  const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

// ------------------------------ ConvertTo F

#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS)                                               \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
  }                                                                           \
  /* Truncates (rounds toward zero). */                                       \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(int, BITS)                                                \
      NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
  }

// API only requires f32 but we provide f64 for use by Iota.
HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
#undef HWY_SVE_CONVERT

// ------------------------------ NearestInt (Round, ConvertTo)
template <class VF, class DI = RebindToSigned<DFromV<VF>>>
HWY_API VFromD<DI> NearestInt(VF v) {
  // No single instruction, round then truncate.
  return ConvertTo(DI(), Round(v));
}

// ------------------------------ Iota (Add, ConvertTo)

#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)                        \
  template <size_t N, int kPow2>                                              \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) first) {           \
    return sv##OP##_##CHAR##BITS(first, 1);                                   \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
#undef HWY_SVE_IOTA

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
  const RebindToSigned<D> di;
  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
}
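// Example (editor's sketch): Iota produces a ramp starting at `first`.
//   const ScalableTag<int32_t> d;
//   const auto ramp = Iota(d, 10);  // lanes = 10, 11, 12, ...
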
// ------------------------------ InterleaveLower

template <class D, class V>
HWY_API V InterleaveLower(D d, const V a, const V b) {
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
#if HWY_TARGET == HWY_SVE2_128
  (void)d;
  return detail::ZipLower(a, b);
#else
  // Move lower halves of blocks to lower half of vector.
  const Repartition<uint64_t, decltype(d)> d64;
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
  const auto a_blocks = detail::ConcatEven(a64, a64);  // only lower half needed
  const auto b_blocks = detail::ConcatEven(b64, b64);
  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}

template <class V>
HWY_API V InterleaveLower(const V a, const V b) {
  return InterleaveLower(DFromV<V>(), a, b);
}

// ------------------------------ InterleaveUpper

// Only use zip2 if vectors are a power of two, otherwise getting the actual
// "upper half" requires MaskUpperHalf.
#if HWY_TARGET == HWY_SVE2_128
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
}  // namespace detail
#endif

// Full vector: guaranteed to have at least one block
template <class D, class V = VFromD<D>,
          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
#if HWY_TARGET == HWY_SVE2_128
  (void)d;
  return detail::ZipUpper(a, b);
#else
  // Move upper halves of blocks to lower half of vector.
  const Repartition<uint64_t, decltype(d)> d64;
  const auto a64 = BitCast(d64, a);
  const auto b64 = BitCast(d64, b);
  const auto a_blocks = detail::ConcatOdd(a64, a64);  // only lower half needed
  const auto b_blocks = detail::ConcatOdd(b64, b64);
  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}

// Capped/fraction: need runtime check
template <class D, class V = VFromD<D>,
          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
  // Less than one block: treat as capped
  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
    const Half<decltype(d)> d2;
    return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
  }
  return InterleaveUpper(DFromV<V>(), a, b);
}

// ================================================== COMBINE

namespace detail {

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 32:
      return svptrue_pat_b8(SV_VL16);
    case 16:
      return svptrue_pat_b8(SV_VL8);
    case 8:
      return svptrue_pat_b8(SV_VL4);
    case 4:
      return svptrue_pat_b8(SV_VL2);
    default:
      return svptrue_pat_b8(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 16:
      return svptrue_pat_b16(SV_VL8);
    case 8:
      return svptrue_pat_b16(SV_VL4);
    case 4:
      return svptrue_pat_b16(SV_VL2);
    default:
      return svptrue_pat_b16(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 8:
      return svptrue_pat_b32(SV_VL4);
    case 4:
      return svptrue_pat_b32(SV_VL2);
    default:
      return svptrue_pat_b32(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 4:
      return svptrue_pat_b64(SV_VL2);
    default:
      return svptrue_pat_b64(SV_VL1);
  }
}
#endif
#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 16:
      return svptrue_pat_b8(SV_VL8);
    case 8:
      return svptrue_pat_b8(SV_VL4);
    case 4:
      return svptrue_pat_b8(SV_VL2);
    case 2:
    case 1:
    default:
      return svptrue_pat_b8(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
svbool_t MaskLowerHalf(D d) {
  switch (Lanes(d)) {
    case 8:
      return svptrue_pat_b16(SV_VL4);
    case 4:
      return svptrue_pat_b16(SV_VL2);
    case 2:
    case 1:
    default:
      return svptrue_pat_b16(SV_VL1);
  }
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
svbool_t MaskLowerHalf(D d) {
  return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
svbool_t MaskLowerHalf(D /*d*/) {
  return svptrue_pat_b64(SV_VL1);
}
#endif  // HWY_TARGET == HWY_SVE2_128
#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
template <class D>
svbool_t MaskLowerHalf(D d) {
  return FirstN(d, Lanes(d) / 2);
}
#endif

template <class D>
svbool_t MaskUpperHalf(D d) {
  // TODO(janwas): WHILEGE on pow2 SVE2
  if (HWY_SVE_IS_POW2 && IsFull(d)) {
    return Not(MaskLowerHalf(d));
  }

  // For Splice to work as intended, make sure bits above Lanes(d) are zero.
  return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
}

// Right-shift vector pair by constexpr; can be used to slide down (=N) or up
// (=Lanes()-N).
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)            \
  template <size_t kIndex>                                       \
  HWY_API HWY_SVE_V(BASE, BITS)                                  \
      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi, kIndex);                \
  }
HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
#undef HWY_SVE_EXT

}  // namespace detail

// ------------------------------ ConcatUpperLower
template <class D, class V>
HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
  return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
}

// ------------------------------ ConcatLowerLower
template <class D, class V>
HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
  if (detail::IsFull(d)) {
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
    return detail::ConcatEvenBlocks(hi, lo);
#endif
#if HWY_TARGET == HWY_SVE2_128
    const Repartition<uint64_t, D> du64;
    const auto lo64 = BitCast(du64, lo);
    return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
#endif
  }
  return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
}

// ------------------------------ ConcatLowerUpper
template <class D, class V>
HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
  if (detail::IsFull(d)) {
    return detail::Ext<Lanes(d) / 2>(hi, lo);
  }
#endif
  return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
}

// ------------------------------ ConcatUpperUpper
template <class D, class V>
HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
  if (detail::IsFull(d)) {
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
    return detail::ConcatOddBlocks(hi, lo);
#endif
#if HWY_TARGET == HWY_SVE2_128
    const Repartition<uint64_t, D> du64;
    const auto lo64 = BitCast(du64, lo);
    return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
#endif
  }
  const svbool_t mask_upper = detail::MaskUpperHalf(d);
  const V lo_upper = detail::Splice(lo, lo, mask_upper);
  return IfThenElse(mask_upper, hi, lo_upper);
}

// ------------------------------ Combine
template <class D, class V2>
HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
  return ConcatLowerLower(d, hi, lo);
}

// ------------------------------ ZeroExtendVector
template <class D, class V>
HWY_API V ZeroExtendVector(const D d, const V lo) {
  return Combine(d, Zero(Half<D>()), lo);
}

// ------------------------------ Lower/UpperHalf

template <class D2, class V>
HWY_API V LowerHalf(D2 /* tag */, const V v) {
  return v;
}

template <class V>
HWY_API V LowerHalf(const V v) {
  return v;
}

template <class D2, class V>
HWY_API V UpperHalf(const D2 d2, const V v) {
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
  return detail::Ext<Lanes(d2)>(v, v);
#else
  return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
#endif
}

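// Example (editor's sketch, `v` assumed): splitting and rejoining a vector.
//   const ScalableTag<float> d;
//   const Half<decltype(d)> dh;
//   const auto lo = LowerHalf(dh, v);
//   const auto hi = UpperHalf(dh, v);
//   const auto v2 = Combine(d, hi, lo);  // == v
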
// ================================================== REDUCE

// These return T, whereas the Highway op returns a broadcasted vector.
namespace detail {
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)                     \
  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(pg, v);                                     \
  }

HWY_SVE_FOREACH(HWY_SVE_REDUCE, SumOfLanesM, addv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
// NaN if all are
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)

#undef HWY_SVE_REDUCE
}  // namespace detail

template <class D, class V>
V SumOfLanes(D d, V v) {
  return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
}

template <class D, class V>
V MinOfLanes(D d, V v) {
  return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
}

template <class D, class V>
V MaxOfLanes(D d, V v) {
  return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
}

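// Example (editor's sketch): horizontal sum, broadcast to all lanes.
//   const ScalableTag<int32_t> d;
//   const auto total = SumOfLanes(d, Iota(d, 1));  // each lane = 1+2+...+Lanes(d)
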
1793// ================================================== SWIZZLE
1794
1795// ------------------------------ GetLane
1796
1797namespace detail {
1798#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
1799 HWY_INLINE HWY_SVE_T(BASE, BITS) \
1800 NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
1801 return sv##OP##_##CHAR##BITS(mask, v); \
1802 }
1803
1804HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
1805#undef HWY_SVE_GET_LANE
1806} // namespace detail
1807
1808template <class V>
1809HWY_API TFromV<V> GetLane(V v) {
1810 return detail::GetLane(v, detail::PFalse());
1811}
1812
1813// ------------------------------ ExtractLane
1814template <class V>
1815HWY_API TFromV<V> ExtractLane(V v, size_t i) {
1816 return detail::GetLane(v, FirstN(DFromV<V>(), i));
1817}
1818
1819// ------------------------------ InsertLane (IfThenElse)
1820template <class V>
1821HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
1822 const DFromV<V> d;
1823 const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
1824 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
1825}
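// Worked sketch (hypothetical values): Iota(d, 0) yields the lane indices
// {0,1,2,...}, so EqN is true only at lane i and IfThenElse writes t there:
//   InsertLane(v, 2, T{9})  // {v0, v1, 9, v3, ...}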
1826
1827// ------------------------------ DupEven
1828
1829namespace detail {
1830HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
1831} // namespace detail
1832
1833template <class V>
1834HWY_API V DupEven(const V v) {
1835 return detail::InterleaveEven(v, v);
1836}
1837
1838// ------------------------------ DupOdd
1839
1840namespace detail {
1841HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
1842} // namespace detail
1843
1844template <class V>
1845HWY_API V DupOdd(const V v) {
1846 return detail::InterleaveOdd(v, v);
1847}
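// Semantics sketch (hypothetical values): for v = {0,1,2,3},
// DupEven(v) == {0,0,2,2} (a single TRN1) and DupOdd(v) == {1,1,3,3} (TRN2).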
1848
1849// ------------------------------ OddEven
1850
1851#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2
1852
1853#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
1854 HWY_API HWY_SVE_V(BASE, BITS) \
1855 NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
1856 return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \
1857 }
1858
1859HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
1860#undef HWY_SVE_ODD_EVEN
1861
1862template <class V, HWY_IF_FLOAT_V(V)>
1863HWY_API V OddEven(const V odd, const V even) {
1864 const DFromV<V> d;
1865 const RebindToUnsigned<decltype(d)> du;
1866 return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
1867}
1868
1869#else
1870
1871template <class V>
1872HWY_API V OddEven(const V odd, const V even) {
1873 const auto odd_in_even = detail::Ext<1>(odd, odd);
1874 return detail::InterleaveEven(even, odd_in_even);
1875}
1876
1877#endif // HWY_TARGET
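// Semantics sketch (hypothetical values): OddEven keeps odd lanes from `odd`
// and even lanes from `even`; for odd = {8,9,10,11} and even = {0,1,2,3},
// OddEven(odd, even) == {0,9,2,11}.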
1878
1879// ------------------------------ OddEvenBlocks
1880template <class V>
1881HWY_API V OddEvenBlocks(const V odd, const V even) {
1882 const DFromV<V> d;
1883#if HWY_TARGET == HWY_SVE_256
1884 return ConcatUpperLower(d, odd, even);
1885#elif HWY_TARGET == HWY_SVE2_128
1886 (void)odd;
1887 (void)d;
1888 return even;
1889#else
1890 const RebindToUnsigned<decltype(d)> du;
1891 using TU = TFromD<decltype(du)>;
1892 constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
1893 const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
1894 const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
1895 const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
1896 return IfThenElse(is_even, even, odd);
1897#endif
1898}
1899
1900// ------------------------------ TableLookupLanes
1901
1902template <class D, class VI>
1903HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
1904 using TI = TFromV<VI>;
1905 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
1906 const RebindToUnsigned<D> du;
1907 const auto indices = BitCast(du, vec);
1908#if HWY_IS_DEBUG_BUILD
1909 HWY_DASSERT(AllTrue(du, detail::LtN(indices, static_cast<TI>(Lanes(d)))));
1910#else
1911 (void)d;
1912#endif
1913 return indices;
1914}
1915
1916template <class D, typename TI>
1917HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
1918 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
1919 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
1920}
1921
1922// Tables with <32-bit lanes are not part of the Highway API, but are used in Broadcast.
1923#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
1924 HWY_API HWY_SVE_V(BASE, BITS) \
1925 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
1926 return sv##OP##_##CHAR##BITS(v, idx); \
1927 }
1928
1929HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
1930#undef HWY_SVE_TABLE
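// Usage sketch (hypothetical helper, not part of the Highway API): reverses
// the four u32 lanes of a 128-bit vector; assumes Lanes(d) == 4, e.g. on
// HWY_SVE2_128, because SetTableIndices loads Lanes(d) indices.
HWY_INLINE svuint32_t ExampleReverse4Lanes(svuint32_t v) {
  const ScalableTag<uint32_t> d;
  alignas(16) static constexpr uint32_t kIdx[4] = {3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kIdx));
}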
1931
1932// ------------------------------ SwapAdjacentBlocks (TableLookupLanes)
1933
1934namespace detail {
1935
1936template <typename T, size_t N, int kPow2>
1937constexpr size_t LanesPerBlock(Simd<T, N, kPow2> /* tag */) {
1938 // We might have a capped vector smaller than a block, so honor that.
1939 return HWY_MIN(16 / sizeof(T), detail::ScaleByPower(N, kPow2));
1940}
1941
1942} // namespace detail
1943
1944template <class V>
1945HWY_API V SwapAdjacentBlocks(V v) {
1946 const DFromV<V> d;
1947#if HWY_TARGET == HWY_SVE_256
1948 return ConcatLowerUpper(d, v, v);
1949#elif HWY_TARGET == HWY_SVE2_128
1950 (void)d;
1951 return v;
1952#else
1953 const RebindToUnsigned<decltype(d)> du;
1954 constexpr auto kLanesPerBlock =
1955 static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d));
1956 const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
1957 return TableLookupLanes(v, idx);
1958#endif
1959}
1960
1961// ------------------------------ Reverse
1962
1963namespace detail {
1964
1965#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
1966 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1967 return sv##OP##_##CHAR##BITS(v); \
1968 }
1969
1970HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
1971#undef HWY_SVE_REVERSE
1972
1973} // namespace detail
1974
1975template <class D, class V>
1976HWY_API V Reverse(D d, V v) {
1977 using T = TFromD<D>;
1978 const auto reversed = detail::ReverseFull(v);
1979 if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
1980 // Shift right to remove extra (non-pow2 and remainder) lanes.
1981 // TODO(janwas): on SVE2, use WHILEGE.
1982 // Avoids FirstN truncating to the return vector size. Must also avoid Not
1983 // because that is limited to SV_POW2.
1984 const ScalableTag<T> dfull;
1985 const svbool_t all_true = detail::AllPTrue(dfull);
1986 const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
1987 const svbool_t mask =
1988 svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
1989 return detail::Splice(reversed, reversed, mask);
1990}
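// Usage sketch (hypothetical): Reverse(d, Iota(d, 0)) == {N-1, ..., 1, 0}
// for N = Lanes(d). The Splice fixup only runs for non-power-of-two or
// fractional vectors, where ReverseFull leaves the payload in the top lanes.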
1991
1992// ------------------------------ Reverse2
1993
1994template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
1995HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
1996 const RebindToUnsigned<decltype(d)> du;
1997 const RepartitionToWide<decltype(du)> dw;
1998 return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v)));
1999}
2000
2001template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2002HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2003 const RebindToUnsigned<decltype(d)> du;
2004 const RepartitionToWide<decltype(du)> dw;
2005 return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v)));
2006}
2007
2008template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2009HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
2010#if HWY_TARGET == HWY_SVE2_128
2011 if (detail::IsFull(d)) {
2012 return detail::Ext<1>(v, v);
2013 }
2014#endif
2015 (void)d;
2016 const auto odd_in_even = detail::Ext<1>(v, v); // x321
2017 return detail::InterleaveEven(odd_in_even, v); // 2301
2018}
2019// ------------------------------ Reverse4 (TableLookupLanes)
2020template <class D>
2021HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2022 if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD<D>) == 8 &&
2023 detail::IsFull(d)) {
2024 return detail::ReverseFull(v);
2025 }
2026 // TODO(janwas): is this approach faster than Shuffle0123?
2027 const RebindToUnsigned<decltype(d)> du;
2028 const auto idx = detail::XorN(Iota(du, 0), 3);
2029 return TableLookupLanes(v, idx);
2030}
2031
2032// ------------------------------ Reverse8 (TableLookupLanes)
2033template <class D>
2034HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2035 const RebindToUnsigned<decltype(d)> du;
2036 const auto idx = detail::XorN(Iota(du, 0), 7);
2037 return TableLookupLanes(v, idx);
2038}
2039
2040// ------------------------------ Compress (PromoteTo)
2041
2042template <typename T>
2043struct CompressIsPartition {
2044#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2045 // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
2046 // requires a larger table).
2047 enum { value = (sizeof(T) == 8) };
2048#else
2049 enum { value = 0 };
2050#endif // HWY_TARGET == HWY_SVE_256 || HWY_SVE2_128
2051};
2052
2053#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
2054 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
2055 return sv##OP##_##CHAR##BITS(mask, v); \
2056 }
2057
2058#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2059HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
2060HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
2061#else
2062HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
2063#endif
2064#undef HWY_SVE_COMPRESS
2065
2066#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2067template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2068HWY_API V Compress(V v, svbool_t mask) {
2069 const DFromV<V> d;
2070 const RebindToUnsigned<decltype(d)> du64;
2071
2072 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
2073 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
2074 // SetTableIndices.
2075 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
2076 const size_t offset = detail::SumOfLanes(mask, bits);
2077
2078 // See CompressIsPartition.
2079 alignas(16) static constexpr uint64_t table[4 * 16] = {
2080 // PrintCompress64x4Tables
2081 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
2082 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
2083 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
2084 return TableLookupLanes(v, SetTableIndices(d, table + offset));
2085}
2086#endif // HWY_TARGET == HWY_SVE_256
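// Worked example of the offset computation above (hypothetical mask): for
// mask lanes {1,0,1,1}, the masked sum of bits {4,8,16,32} is 4+16+32 = 52,
// i.e. the mask pattern 0b1101 (13) pre-multiplied by N = 4, so table +
// offset selects row 13: indices {0,2,3,1} (lanes 0,2,3 first, then lane 1).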
2087#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2088template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2089HWY_API V Compress(V v, svbool_t mask) {
2090 // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
2091 // swaps upper/lower (the lower half is set to the upper half, and the
2092 // remaining upper half is filled from the lower half of the second v), and
2093 // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
2094 // unchanged and map everything else to 00.
2095 const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
2096 return detail::Splice(v, v, AndNot(maskLL, mask));
2097}
2098#endif // HWY_TARGET == HWY_SVE2_128
2099
2100template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
2101HWY_API V Compress(V v, svbool_t mask16) {
2102 static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
2103 const DFromV<V> d16;
2104
2105 // Promote vector and mask to 32-bit
2106 const RepartitionToWide<decltype(d16)> dw;
2107 const auto v32L = PromoteTo(dw, v);
2108 const auto v32H = detail::PromoteUpperTo(dw, v);
2109 const svbool_t mask32L = svunpklo_b(mask16);
2110 const svbool_t mask32H = svunpkhi_b(mask16);
2111
2112 const auto compressedL = Compress(v32L, mask32L);
2113 const auto compressedH = Compress(v32H, mask32H);
2114
2115 // Demote to 16-bit (already in range) - separately so we can splice
2116 const V evenL = BitCast(d16, compressedL);
2117 const V evenH = BitCast(d16, compressedH);
2118 const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
2119 const V v16H = detail::ConcatEven(evenH, evenH);
2120
2121 // We need to combine two vectors of non-constexpr length, so the only option
2122 // is Splice, which requires us to synthesize a mask. NOTE: this function uses
2123 // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
2124 const size_t countL = detail::CountTrueFull(dw, mask32L);
2125 const auto compressed_maskL = FirstN(d16, countL);
2126 return detail::Splice(v16H, v16L, compressed_maskL);
2127}
2128
2129// Must treat float16_t as integers so we can ConcatEven.
2130HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
2131 const DFromV<decltype(v)> df;
2132 const RebindToSigned<decltype(df)> di;
2133 return BitCast(df, Compress(BitCast(di, v), mask16));
2134}
2135
2136// ------------------------------ CompressNot
2137
2138template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
2139HWY_API V CompressNot(V v, const svbool_t mask) {
2140 return Compress(v, Not(mask));
2141}
2142
2143template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2144HWY_API V CompressNot(V v, svbool_t mask) {
2145#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2146 // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
2147 // swaps upper/lower (the lower half is set to the upper half, and the
2148 // remaining upper half is filled from the lower half of the second v), and
2149 // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
2150 // 01 to 10, and everything else to 00.
2151 const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
2152 return detail::Splice(v, v, AndNot(mask, maskLL));
2153#endif
2154#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2155 const DFromV<V> d;
2156 const RebindToUnsigned<decltype(d)> du64;
2157
2158 // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
2159 // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
2160 // SetTableIndices.
2161 const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
2162 const size_t offset = detail::SumOfLanes(mask, bits);
2163
2164 // See CompressIsPartition.
2165 alignas(16) static constexpr uint64_t table[4 * 16] = {
2166 // PrintCompressNot64x4Tables
2167 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
2168 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
2169 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
2170 return TableLookupLanes(v, SetTableIndices(d, table + offset));
2171#endif // HWY_TARGET == HWY_SVE_256
2172
2173 return Compress(v, Not(mask));
2174}
2175
2176// ------------------------------ CompressBlocksNot
2177HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
2178#if HWY_TARGET == HWY_SVE2_128
2179 (void)mask;
2180 return v;
2181#endif
2182 return CompressNot(v, mask);
2183}
2184
2185// ------------------------------ CompressStore
2186template <class V, class D>
2187HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
2188 TFromD<D>* HWY_RESTRICT unaligned) {
2189 StoreU(Compress(v, mask), d, unaligned);
2190 return CountTrue(d, mask);
2191}
2192
2193// ------------------------------ CompressBlendedStore
2194template <class V, class D>
2195HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
2196 TFromD<D>* HWY_RESTRICT unaligned) {
2197 const size_t count = CountTrue(d, mask);
2198 const svbool_t store_mask = FirstN(d, count);
2199 BlendedStore(Compress(v, mask), store_mask, d, unaligned);
2200 return count;
2201}
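// Usage sketch (hypothetical buffer `out` with >= Lanes(d) elements):
//   const size_t n = CompressBlendedStore(v, mask, d, out);
// writes the n = CountTrue(d, mask) selected lanes contiguously and does not
// modify bytes past them, unlike CompressStore, which may.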
2202
2203// ================================================== BLOCKWISE
2204
2205// ------------------------------ CombineShiftRightBytes
2206
2207// Prevent accidentally using these for 128-bit vectors - should not be
2208// necessary.
2209#if HWY_TARGET != HWY_SVE2_128
2210namespace detail {
2211
2212// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
2213// offsets are implicitly relative to the start of their 128-bit block.
2214template <class D, class V>
2215HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
2216 using T = MakeUnsigned<TFromD<D>>;
2217 return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
2218}
2219
2220template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 1)>
2221svbool_t FirstNPerBlock(D d) {
2222 const RebindToUnsigned<decltype(d)> du;
2223 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2224 const svuint8_t idx_mod =
2225 svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2226 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2227 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
2228 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
2229 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
2230 15 % kLanesPerBlock);
2231 return detail::LtN(BitCast(du, idx_mod), kLanes);
2232}
2233template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 2)>
2234svbool_t FirstNPerBlock(D d) {
2235 const RebindToUnsigned<decltype(d)> du;
2236 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2237 const svuint16_t idx_mod =
2238 svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2239 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2240 6 % kLanesPerBlock, 7 % kLanesPerBlock);
2241 return detail::LtN(BitCast(du, idx_mod), kLanes);
2242}
2243template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 4)>
2244svbool_t FirstNPerBlock(D d) {
2245 const RebindToUnsigned<decltype(d)> du;
2246 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2247 const svuint32_t idx_mod =
2248 svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2249 3 % kLanesPerBlock);
2250 return detail::LtN(BitCast(du, idx_mod), kLanes);
2251}
2252template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 8)>
2253svbool_t FirstNPerBlock(D d) {
2254 const RebindToUnsigned<decltype(d)> du;
2255 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2256 const svuint64_t idx_mod =
2257 svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
2258 return detail::LtN(BitCast(du, idx_mod), kLanes);
2259}
2260
2261} // namespace detail
2262#endif // HWY_TARGET != HWY_SVE2_128
2263
2264template <size_t kBytes, class D, class V = VFromD<D>>
2265HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
2266 const Repartition<uint8_t, decltype(d)> d8;
2267 const auto hi8 = BitCast(d8, hi);
2268 const auto lo8 = BitCast(d8, lo);
2269#if HWY_TARGET == HWY_SVE2_128
2270 return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
2271#else
2272 const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
2273 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
2274 const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2275 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2276#endif
2277}
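// Semantics sketch (hypothetical): within each 128-bit block, the result is
// the block of lo shifted right by kBytes, with the vacated upper bytes
// filled from the block of hi; this matches the x86 palignr behavior that
// the Highway API mandates.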
2278
2279// ------------------------------ Shuffle2301
2280template <class V>
2281HWY_API V Shuffle2301(const V v) {
2282 const DFromV<V> d;
2283 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2284 return Reverse2(d, v);
2285}
2286
2287// ------------------------------ Shuffle2103
2288template <class V>
2289HWY_API V Shuffle2103(const V v) {
2290 const DFromV<V> d;
2291 const Repartition<uint8_t, decltype(d)> d8;
2292 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2293 const svuint8_t v8 = BitCast(d8, v);
2294 return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
2295}
2296
2297// ------------------------------ Shuffle0321
2298template <class V>
2299HWY_API V Shuffle0321(const V v) {
2300 const DFromV<V> d;
2301 const Repartition<uint8_t, decltype(d)> d8;
2302 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2303 const svuint8_t v8 = BitCast(d8, v);
2304 return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
2305}
2306
2307// ------------------------------ Shuffle1032
2308template <class V>
2309HWY_API V Shuffle1032(const V v) {
2310 const DFromV<V> d;
2311 const Repartition<uint8_t, decltype(d)> d8;
2312 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2313 const svuint8_t v8 = BitCast(d8, v);
2314 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
2315}
2316
2317// ------------------------------ Shuffle01
2318template <class V>
2319HWY_API V Shuffle01(const V v) {
2320 const DFromV<V> d;
2321 const Repartition<uint8_t, decltype(d)> d8;
2322 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2323 const svuint8_t v8 = BitCast(d8, v);
2324 return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
2325}
2326
2327// ------------------------------ Shuffle0123
2328template <class V>
2329HWY_API V Shuffle0123(const V v) {
2330 return Shuffle2301(Shuffle1032(v));
2331}
2332
2333// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2334template <class D, class V = VFromD<D>>
2335HWY_API V ReverseBlocks(D d, V v) {
2336#if HWY_TARGET == HWY_SVE_256
2337 if (detail::IsFull(d)) {
2338 return SwapAdjacentBlocks(v);
2339 } else if (detail::IsFull(Twice<D>())) {
2340 return v;
2341 }
2342#elif HWY_TARGET == HWY_SVE2_128
2343 (void)d;
2344 return v;
2345#endif
2346 const Repartition<uint64_t, D> du64;
2347 return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
2348}
2349
2350// ------------------------------ TableLookupBytes
2351
2352template <class V, class VI>
2353HWY_API VI TableLookupBytes(const V v, const VI idx) {
2354 const DFromV<VI> d;
2355 const Repartition<uint8_t, decltype(d)> du8;
2356#if HWY_TARGET == HWY_SVE2_128
2357 return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
2358#else
2359 const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
2360 const auto idx8 = Add(BitCast(du8, idx), offsets128);
2361 return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
2362#endif
2363}
2364
2365template <class V, class VI>
2366HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
2367 const DFromV<VI> d;
2368 // Mask size must match vector type, so cast everything to this type.
2369 const Repartition<int8_t, decltype(d)> di8;
2370
2371 auto idx8 = BitCast(di8, idx);
2372 const auto msb = detail::LtN(idx8, 0);
2373
2374 const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
2375 return BitCast(d, IfThenZeroElse(msb, lookup));
2376}
2377
2378// ------------------------------ Broadcast
2379
2380#if HWY_TARGET == HWY_SVE2_128
2381namespace detail {
2382#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
2383 template <int kLane> \
2384 HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
2385 return sv##OP##_##CHAR##BITS(v, kLane); \
2386 }
2387
2388HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
2389#undef HWY_SVE_BROADCAST
2390} // namespace detail
2391#endif
2392
2393template <int kLane, class V>
2394HWY_API V Broadcast(const V v) {
2395 const DFromV<V> d;
2396 const RebindToUnsigned<decltype(d)> du;
2397 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
2398 static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
2399#if HWY_TARGET == HWY_SVE2_128
2400 return detail::Broadcast<kLane>(v);
2401#else
2402 auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
2403 if (kLane != 0) {
2404 idx = detail::AddN(idx, kLane);
2405 }
2406 return TableLookupLanes(v, idx);
2407#endif
2408}
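// Usage sketch (hypothetical u32 lanes, two 128-bit blocks): for
// v = {0,1,2,3, 4,5,6,7}, Broadcast<1>(v) == {1,1,1,1, 5,5,5,5}: lane 1 of
// each block is replicated within that block.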
2409
2410// ------------------------------ ShiftLeftLanes
2411
2412template <size_t kLanes, class D, class V = VFromD<D>>
2413HWY_API V ShiftLeftLanes(D d, const V v) {
2414 const auto zero = Zero(d);
2415 const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
2416#if HWY_TARGET == HWY_SVE2_128
2417 return shifted;
2418#else
2419 // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2420 return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
2421#endif
2422}
2423
2424template <size_t kLanes, class V>
2425HWY_API V ShiftLeftLanes(const V v) {
2426 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2427}
2428
2429// ------------------------------ ShiftRightLanes
2430template <size_t kLanes, class D, class V = VFromD<D>>
2431HWY_API V ShiftRightLanes(D d, V v) {
2432 // For capped/fractional vectors, clear upper lanes so we shift in zeros.
2433 if (!detail::IsFull(d)) {
2434 v = IfThenElseZero(detail::MakeMask(d), v);
2435 }
2436
2437#if HWY_TARGET == HWY_SVE2_128
2438 return detail::Ext<kLanes>(Zero(d), v);
2439#else
2440 const auto shifted = detail::Ext<kLanes>(v, v);
2441 // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2442 constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
2443 const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
2444 return IfThenElseZero(mask, shifted);
2445#endif
2446}
2447
2448// ------------------------------ ShiftLeftBytes
2449
2450template <int kBytes, class D, class V = VFromD<D>>
2451HWY_API V ShiftLeftBytes(const D d, const V v) {
2452 const Repartition<uint8_t, decltype(d)> d8;
2453 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2454}
2455
2456template <int kBytes, class V>
2457HWY_API V ShiftLeftBytes(const V v) {
2458 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2459}
2460
2461// ------------------------------ ShiftRightBytes
2462template <int kBytes, class D, class V = VFromD<D>>
2463HWY_API V ShiftRightBytes(const D d, const V v) {
2464 const Repartition<uint8_t, decltype(d)> d8;
2465 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2466}
2467
2468// ------------------------------ ZipLower
2469
2470template <class V, class DW = RepartitionToWide<DFromV<V>>>
2471HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2472 const RepartitionToNarrow<DW> dn;
2473 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2474 return BitCast(dw, InterleaveLower(dn, a, b));
2475}
2476template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2477HWY_API VFromD<DW> ZipLower(const V a, const V b) {
2478 return BitCast(DW(), InterleaveLower(D(), a, b));
2479}
2480
2481// ------------------------------ ZipUpper
2482template <class V, class DW = RepartitionToWide<DFromV<V>>>
2483HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2484 const RepartitionToNarrow<DW> dn;
2485 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2486 return BitCast(dw, InterleaveUpper(dn, a, b));
2487}
2488
2489// ================================================== Ops with dependencies
2490
2491// ------------------------------ PromoteTo bfloat16 (ZipLower)
2492template <size_t N, int kPow2>
2493HWY_API svfloat32_t PromoteTo(Simd<float, N, kPow2> df32,
2494 const svuint16_t v) {
2495 return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
2496}
2497
2498// ------------------------------ ReorderDemote2To (OddEven)
2499template <size_t N, int kPow2>
2500HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
2501 svfloat32_t a, svfloat32_t b) {
2502 const RebindToUnsigned<decltype(dbf16)> du16;
2503 const Repartition<uint32_t, decltype(dbf16)> du32;
2504 const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
2505 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2506}
2507
2508// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
2509template <class V>
2510HWY_API V ZeroIfNegative(const V v) {
2511 return IfThenZeroElse(detail::LtN(v, 0), v);
2512}
2513
2514// ------------------------------ BroadcastSignBit (ShiftRight)
2515template <class V>
2516HWY_API V BroadcastSignBit(const V v) {
2517 return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
2518}
2519
2520// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
2521template <class V>
2522HWY_API V IfNegativeThenElse(V v, V yes, V no) {
2523 static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
2524 const DFromV<V> d;
2525 const RebindToSigned<decltype(d)> di;
2526
2527 const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2528 return IfThenElse(m, yes, no);
2529}
2530
2531// ------------------------------ AverageRound (ShiftRight)
2532
2533#if HWY_TARGET == HWY_SVE2
2534HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
2535HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
2536#else
2537template <class V>
2538V AverageRound(const V a, const V b) {
2539 return ShiftRight<1>(detail::AddN(Add(a, b), 1));
2540}
2541#endif // HWY_TARGET == HWY_SVE2
2542
2543// ------------------------------ LoadMaskBits (TestBit)
2544
2545// `p` points to at least 8 readable bytes, not all of which need be valid.
2546template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2547HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2548 const RebindToUnsigned<D> du;
2549 const svuint8_t iota = Iota(du, 0);
2550
2551 // Load correct number of bytes (bits/8) with 7 zeros after each.
2552 const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
2553 // Replicate bytes 8x such that each byte contains the bit that governs it.
2554 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
2555
2556 const svuint8_t bit =
2557 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
2558 return TestBit(rep8, bit);
2559}
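// Worked example (hypothetical input): bits[0] = 0b00000101 selects lanes 0
// and 2: rep8 copies that byte into lanes 0..7, and TestBit against
// {1,2,4,...,128} is true exactly where the governing bit is set.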
2560
2561template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
2562HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2563 const uint8_t* HWY_RESTRICT bits) {
2564 const RebindToUnsigned<D> du;
2565 const Repartition<uint8_t, D> du8;
2566
2567 // There may be up to 128 bits; avoid reading past the end.
2568 const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
2569
2570 // Replicate bytes 16x such that each lane contains the bit that governs it.
2571 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
2572
2573 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
2574 return TestBit(BitCast(du, rep16), bit);
2575}
2576
2577template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2578HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2579 const uint8_t* HWY_RESTRICT bits) {
2580 const RebindToUnsigned<D> du;
2581 const Repartition<uint8_t, D> du8;
2582
2583 // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable,
2584 // so we can skip computing the actual length (Lanes(du)+7)/8.
2585 const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
2586
2587 // Replicate bytes 32x such that each lane contains the bit that governs it.
2588 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
2589
2590 // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
2591 const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
2592
2593 return TestBit(BitCast(du, rep32), bit);
2594}
2595
2596template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2597HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2598 const uint8_t* HWY_RESTRICT bits) {
2599 const RebindToUnsigned<D> du;
2600
2601 // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
2602 // The "at least 8 byte" guarantee in quick_reference ensures this is safe.
2603 uint32_t mask_bits;
2604 CopyBytes<4>(bits, &mask_bits);
2605 const auto vbits = Set(du, mask_bits);
2606
2607 // 2^{0, 1, ..., 31}; there will not be more lanes than that.
2608 const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
2609
2610 return TestBit(vbits, bit);
2611}
2612
2613// ------------------------------ StoreMaskBits
2614
2615namespace detail {
2616
2617// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
2618template <class T, HWY_IF_LANE_SIZE(T, 1)>
2619HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2620 return svdup_n_u8_z(m, 1);
2621}
2622template <class T, HWY_IF_LANE_SIZE(T, 2)>
2623HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2624 const ScalableTag<uint8_t> d8;
2625 const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
2626 return detail::ConcatEven(b16, b16); // only lower half needed
2627}
2628template <class T, HWY_IF_LANE_SIZE(T, 4)>
2629HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2630 return U8FromU32(svdup_n_u32_z(m, 1));
2631}
2632template <class T, HWY_IF_LANE_SIZE(T, 8)>
2633HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2634 const ScalableTag<uint32_t> d32;
2635 const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
2636 return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
2637}
2638
2639// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
2640HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
2641 const ScalableTag<uint8_t> d8;
2642 const ScalableTag<uint16_t> d16;
2643 const ScalableTag<uint32_t> d32;
2644 const ScalableTag<uint64_t> d64;
2645 // TODO(janwas): could use SVE2 BDEP, but it's optional.
2646 x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
2647 x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
2648 x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
2649 return BitCast(d64, x);
2650}
2651
2652} // namespace detail
2653
2654// `p` points to at least 8 writable bytes.
2655// TODO(janwas): specialize for HWY_SVE_256
2656template <class D>
2657HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
2658 svuint64_t bits_in_u64 =
2659 detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
2660
2661 const size_t num_bits = Lanes(d);
2662 const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below
2663
2664 // Truncate each u64 to 8 bits and store to u8.
2665 svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
2666
2667 // Non-full byte, need to clear the undefined upper bits. Can happen for
2668 // capped/fractional vectors or large T and small hardware vectors.
2669 if (num_bits < 8) {
2670 const int mask = (1ull << num_bits) - 1;
2671 bits[0] = static_cast<uint8_t>(bits[0] & mask);
2672 }
2673 // Else: we wrote full bytes because num_bits is a power of two >= 8.
2674
2675 return num_bytes;
2676}
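// Round-trip sketch (hypothetical helper, not part of the Highway API):
// masks survive StoreMaskBits/LoadMaskBits; 32 bytes suffice even for
// 2048-bit vectors of u8 (256 lanes / 8 bits per byte).
template <class D>
HWY_INLINE svbool_t ExampleMaskRoundTrip(D d, svbool_t m) {
  uint8_t bits[32] = {0};
  (void)StoreMaskBits(d, m, bits);
  return LoadMaskBits(d, bits);
}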
2677
2678// ------------------------------ CompressBits (LoadMaskBits)
2679template <class V>
2680HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2681 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2682}
2683
2684// ------------------------------ CompressBitsStore (LoadMaskBits)
2685template <class D>
2686HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2687 D d, TFromD<D>* HWY_RESTRICT unaligned) {
2688 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2689}
2690
2691// ------------------------------ MulEven (InterleaveEven)
2692
2693#if HWY_TARGET == HWY_SVE2
2694namespace detail {
2695#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
2696 HWY_API HWY_SVE_V(BASE, BITS) \
2697 NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
2698 return sv##OP##_##CHAR##BITS(a, b); \
2699 }
2700
2701HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
2702#undef HWY_SVE_MUL_EVEN
2703} // namespace detail
2704#endif
2705
2706template <class V, class DW = RepartitionToWide<DFromV<V>>>
2707HWY_API VFromD<DW> MulEven(const V a, const V b) {
2708#if HWY_TARGET == HWY_SVE2
2709 return BitCast(DW(), detail::MulEven(a, b));
2710#else
2711 const auto lo = Mul(a, b);
2712 const auto hi = detail::MulHigh(a, b);
2713 return BitCast(DW(), detail::InterleaveEven(lo, hi));
2714#endif
2715}
2716
2717HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
2718 const auto lo = Mul(a, b);
2719 const auto hi = detail::MulHigh(a, b);
2720 return detail::InterleaveEven(lo, hi);
2721}
2722
2723HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
2724 const auto lo = Mul(a, b);
2725 const auto hi = detail::MulHigh(a, b);
2726 return detail::InterleaveOdd(lo, hi);
2727}
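// Layout sketch (hypothetical): for u64 inputs, MulEven returns the full
// 128-bit product of each even lane as two adjacent u64 lanes {low, high};
// MulOdd does the same for the odd lanes.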
2728
2729// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2730template <size_t N, int kPow2>
2731HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2732 svuint16_t a, svuint16_t b,
2733 const svfloat32_t sum0,
2734 svfloat32_t& sum1) {
2735 // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16.
2736 const Repartition<uint16_t, decltype(df32)> du16;
2737 const RebindToUnsigned<decltype(df32)> du32;
2738 const svuint16_t zero = Zero(du16);
2739 const svuint32_t a0 = ZipLower(du32, zero, BitCast(du16, a));
2740 const svuint32_t a1 = ZipUpper(du32, zero, BitCast(du16, a));
2741 const svuint32_t b0 = ZipLower(du32, zero, BitCast(du16, b));
2742 const svuint32_t b1 = ZipUpper(du32, zero, BitCast(du16, b));
2743 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2744 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2745}
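// Why the Zip trick above works (sketch): a bfloat16 value is the upper 16
// bits of its f32 representation, so zipping a zero lane below each u16 lane
// reconstructs the exact f32; two MulAdds then accumulate the lower- and
// upper-half products into sum0 and sum1.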
2746
2747// ------------------------------ AESRound / CLMul
2748
2749#if defined(__ARM_FEATURE_SVE2_AES)
2750
2751// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
2752#ifdef HWY_NATIVE_AES
2753#undef HWY_NATIVE_AES
2754#else
2755#define HWY_NATIVE_AES
2756#endif
2757
2758HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
2759 // It is not clear whether E and MC fuse like they did on NEON.
2760 const svuint8_t zero = svdup_n_u8(0);
2761 return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
2762}
2763
2764HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) {
2765 return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
2766}
2767
2768HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
2769 return svpmullb_pair(a, b);
2770}
2771
2772HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
2773 return svpmullt_pair(a, b);
2774}
2775
2776#endif // __ARM_FEATURE_SVE2_AES
2777
2778// ------------------------------ Lt128
2779
2780namespace detail {
2781#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
2782 template <size_t N, int kPow2> \
2783 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
2784 return sv##OP##_b##BITS(m, m); \
2785 }
2786
2787HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool
2788HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
2789#undef HWY_SVE_DUP
2790
2791#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2792template <class D>
2793HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
2794 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2795 const svbool_t eqHx = Eq(a, b); // only odd lanes used
2796 // Convert to vector: more pipelines can TRN* for vectors than predicates.
2797 const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
2798 // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
2799 // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
2800 const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
2801 // Duplicate upper lane into lower.
2802 return DupOdd(ltHx);
2803}
2804#endif
2805} // namespace detail
2806
2807template <class D>
2808HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
2809#if HWY_TARGET == HWY_SVE_256
2810 return MaskFromVec(detail::Lt128Vec(d, a, b));
2811#else
2812 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2813 const svbool_t eqHx = Eq(a, b); // only odd lanes used
2814 const svbool_t ltHL = Lt(a, b);
2815 // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
2816 const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
2817 // Duplicate upper lane into lower.
2818 return detail::DupOddB(d, ltHx);
2819#endif // HWY_TARGET != HWY_SVE_256
2820}
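// Worked sketch (hypothetical): each 128-bit key occupies an (even = low,
// odd = high) u64 lane pair. If the high halves differ, ltHL in the odd lane
// already decides; otherwise the low halves' comparison is moved up into the
// odd lane. DupOddB then broadcasts the verdict to both lanes of the pair,
// which is the form Min128/Max128 consume.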
2821
2822// ------------------------------ Lt128Upper
2823
2824template <class D>
2825HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
2826 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2827 const svbool_t ltHL = Lt(a, b);
2828 return detail::DupOddB(d, ltHL);
2829}
2830
2831// ------------------------------ Min128, Max128 (Lt128)
2832
2833template <class D>
2834HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
2835#if HWY_TARGET == HWY_SVE_256
2836 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
2837#else
2838 return IfThenElse(Lt128(d, a, b), a, b);
2839#endif
2840}
2841
2842template <class D>
2843HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
2844#if HWY_TARGET == HWY_SVE_256
2845 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
2846#else
2847 return IfThenElse(Lt128(d, b, a), a, b);
2848#endif
2849}
2850
2851template <class D>
2852HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
2853 return IfThenElse(Lt128Upper(d, a, b), a, b);
2854}
2855
2856template <class D>
2857HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
2858 return IfThenElse(Lt128Upper(d, b, a), a, b);
2859}
2860
2861// ================================================== END MACROS
2862namespace detail { // for code folding
2863#undef HWY_IF_FLOAT_V
2864#undef HWY_IF_LANE_SIZE_V
2865#undef HWY_IF_SIGNED_V
2866#undef HWY_IF_UNSIGNED_V
2867#undef HWY_SVE_ALL_PTRUE
2868#undef HWY_SVE_D
2869#undef HWY_SVE_FOREACH
2870#undef HWY_SVE_FOREACH_F
2871#undef HWY_SVE_FOREACH_F16
2872#undef HWY_SVE_FOREACH_F32
2873#undef HWY_SVE_FOREACH_F64
2874#undef HWY_SVE_FOREACH_I
2875#undef HWY_SVE_FOREACH_I08
2876#undef HWY_SVE_FOREACH_I16
2877#undef HWY_SVE_FOREACH_I32
2878#undef HWY_SVE_FOREACH_I64
2879#undef HWY_SVE_FOREACH_IF
2880#undef HWY_SVE_FOREACH_U
2881#undef HWY_SVE_FOREACH_U08
2882#undef HWY_SVE_FOREACH_U16
2883#undef HWY_SVE_FOREACH_U32
2884#undef HWY_SVE_FOREACH_U64
2885#undef HWY_SVE_FOREACH_UI
2886#undef HWY_SVE_FOREACH_UI08
2887#undef HWY_SVE_FOREACH_UI16
2888#undef HWY_SVE_FOREACH_UI32
2889#undef HWY_SVE_FOREACH_UI64
2890#undef HWY_SVE_FOREACH_UIF3264
2891#undef HWY_SVE_PTRUE
2892#undef HWY_SVE_RETV_ARGPV
2893#undef HWY_SVE_RETV_ARGPVN
2894#undef HWY_SVE_RETV_ARGPVV
2895#undef HWY_SVE_RETV_ARGV
2896#undef HWY_SVE_RETV_ARGVN
2897#undef HWY_SVE_RETV_ARGVV
2898#undef HWY_SVE_T
2899#undef HWY_SVE_UNDEFINED
2900#undef HWY_SVE_V
2901
2902} // namespace detail
2903// NOLINTNEXTLINE(google-readability-namespace-comments)
2904} // namespace HWY_NAMESPACE
2905} // namespace hwy
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:103
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:59
HWY_AFTER_NAMESPACE()
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1073
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:71
#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2781
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:730
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1383
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1025
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1368
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:55
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1471
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1107
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:353
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:672
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:342
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:126
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:259
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1965
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:946
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:527
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1040
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:300
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:938
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1853
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2382
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:111
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:155
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:63
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:328
#define HWY_SVE_PTRUE(BITS)
Definition: arm_sve-inl.h:206
#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1089
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1663
#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1155
#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1140
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:742
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1049
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:89
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:161
#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:955
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1923
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:178
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:601
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:95
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1174
#define HWY_SVE_IS_POW2
Definition: arm_sve-inl.h:30
#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:963
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2053
#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:930
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:802
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:483
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:83
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:138
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1445
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:774
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:437
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:122
HWY_BEFORE_NAMESPACE()
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:56
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:77
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:798
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:866
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:270
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2695
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1126
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:107
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1015
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:569
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1762
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:99
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:173
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:151
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:447
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1798
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_API
Definition: base.h:120
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_TARGET
Definition: detect_targets.h:341
#define HWY_SVE_256
Definition: detect_targets.h:78
HWY_INLINE svuint8_t BoolFromMask(svbool_t m)
Definition: arm_sve-inl.h:2619
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag< 1 >)
Definition: arm_sve-inl.h:189
DupOddB
Definition: arm_sve-inl.h:2788
HWY_INLINE svuint64_t BitsFromBool(svuint8_t x)
Definition: arm_sve-inl.h:2640
svbool_t MaskLowerHalf(D d)
Definition: arm_sve-inl.h:1557
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition: rvv-inl.h:1817
svbool_t MakeMask(D d)
Definition: arm_sve-inl.h:290
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:1937
VI SaturateI(VI v)
Definition: arm_sve-inl.h:1270
svbool_t MaskUpperHalf(D d)
Definition: arm_sve-inl.h:1651
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4800
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:879
VU SaturateU(VU v)
Definition: arm_sve-inl.h:1264
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4804
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:892
HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) HWY_API svbool_t PFalse()
Definition: arm_sve-inl.h:280
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:855
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition: arm_sve-inl.h:1250
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:852
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:673
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4809
HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) HWY_SVE_FOREACH_U(HWY_SVE_DUP
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:111
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition: ops/shared-inl.h:103
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition: rvv-inl.h:1823
trn2 HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:2793
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition: x86_512-inl.h:1574
d
Definition: rvv-inl.h:1742
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2149
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4533
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not) namespace detail
Definition: arm_sve-inl.h:391
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2189
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4498
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4187
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5305
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition: arm_neon-inl.h:5938
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4046
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6173
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1669
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4062
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3363
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4284
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3433
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5280
HWY_API VFromD< D > ConcatEven(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1406
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3514
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3617
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2409
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5290
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2901
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:209
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4932
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2166
V Shl(V a, V b)
Definition: arm_neon-inl.h:6292
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2470
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4200
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:2096
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2508
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2176
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4353
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:200
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:594
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4779
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4654
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4453
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4056
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1916
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3438
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4380
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4096
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4614
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4555
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5787
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2711
typename D::Twice Twice
Definition: ops/shared-inl.h:219
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:198
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:5815
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4203
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2006
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:2717
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2887
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5269
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2182
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4482
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2430
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:236
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2706
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1398
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4664
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4164
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3934
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3394
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2157
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6260
HWY_API Vec128< T, N > ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4422
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3380
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:211
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3888
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4540
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3606
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6250
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3976
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2279
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4028
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4940
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1035
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5005
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2236
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4050
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6265
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6212
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1999
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2225
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5299
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3661
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1070
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_sve-inl.h:2483
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4726
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1025
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4196
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:402
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4292
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4189
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5862
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4171
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4762
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5846
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition: arm_neon-inl.h:5976
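Note: LoadInterleaved3 deinterleaves array-of-structs data such as packed RGB into three vectors. A minimal sketch (GreenChannel is an illustrative name; count, the number of pixels, is assumed to be a multiple of the vector length):
#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hwy {
namespace HWY_NAMESPACE {
// Extracts the G channel from packed RGB bytes.
void GreenChannel(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT g,
                  size_t count) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < count; i += N) {
    VFromD<decltype(d)> r, gv, b;
    LoadInterleaved3(d, rgb + 3 * i, r, gv, b);
    StoreU(gv, d, g + i);
  }
}
}  // namespace HWY_NAMESPACE
}  // namespace hwy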
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6255
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6323
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:833
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3424
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1719
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3233
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3928
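Note: SetTableIndices converts an index array into the opaque Indices type consumed by TableLookupLanes. Because the SVE lane count is a runtime quantity, the table must be sized via Lanes. A sketch that reverses lanes this way (illustrative only; the dedicated Reverse op listed further below is the preferred spelling):
#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "hwy/highway.h"
namespace hwy {
namespace HWY_NAMESPACE {
// Reverses the lanes of v via a runtime-built index table. The index type
// must have the same width as the lane type (int32_t for float).
template <class D, class V>
V ReverseViaTable(D d, V v) {
  const size_t N = Lanes(d);
  std::vector<int32_t> idx(N);
  for (size_t i = 0; i < N; ++i) idx[i] = static_cast<int32_t>(N - 1 - i);
  return TableLookupLanes(v, SetTableIndices(d, idx.data()));
}
}  // namespace HWY_NAMESPACE
}  // namespace hwy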
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1061
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4744
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:1992
V Shr(V a, V b)
Definition: arm_neon-inl.h:6296
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API VFromD< DW > ZipLower(const V a, const V b)
Definition: arm_sve-inl.h:2477
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6305
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
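Note: CompressStore packs the lanes selected by a mask to the front of the output and returns how many were written, which yields a simple stream compaction. A hedged sketch (CopyFinite is an illustrative name; out must have room for count elements because whole vectors may be written before the final tally is known, and count is assumed to be a multiple of Lanes(d)):
#include <stddef.h>
#include "hwy/highway.h"
namespace hwy {
namespace HWY_NAMESPACE {
// Copies only the finite elements of in to out; returns the number written.
size_t CopyFinite(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                  size_t count) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  size_t written = 0;
  for (size_t i = 0; i < count; i += N) {
    const auto v = LoadU(d, in + i);
    written += CompressStore(v, IsFinite(v), d, out + written);
  }
  return written;
}
}  // namespace HWY_NAMESPACE
}  // namespace hwy
CompressBlendedStore (listed above) is the variant that avoids writing past the valid lanes.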
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition: arm_neon-inl.h:6017
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6106
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API VFromD< D > ConcatOdd(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1394
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6240
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:836
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6138
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
HWY_API constexpr bool IsSame()
Definition: base.h:322
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
uint16_t bits
Definition: base.h:252