rvv-inl.h
1 // Copyright 2021 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // RISC-V V vectors (length not known at compile time).
17 // External include guard in highway.h - see comment there.
18 
19 #include <riscv_vector.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 
23 #include "hwy/base.h"
24 #include "hwy/ops/shared-inl.h"
25 
26 HWY_BEFORE_NAMESPACE();
27 namespace hwy {
28 namespace HWY_NAMESPACE {
29 
30 template <class V>
31 struct DFromV_t {}; // specialized in macros
32 template <class V>
33 using DFromV = typename DFromV_t<RemoveConst<V>>::type;
34 
35 template <class V>
36 using TFromV = TFromD<DFromV<V>>;
37 
38 // Enables the overload if Pow2 is in [min, max].
39 #define HWY_RVV_IF_POW2_IN(D, min, max) \
40  hwy::EnableIf<(min) <= Pow2(D()) && Pow2(D()) <= (max)>* = nullptr
41 
42 template <typename T, size_t N, int kPow2>
43 constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
44  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
45  // argument enables fractional LMUL < 1. Limit to 64 because that is the
46  // largest value for which vbool##_t are defined.
47  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
48 }
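// For example, MLenFromD(Simd<uint16_t, N, 0>()) = min(64, 2*8*8 / 8) = 16,
// matching vbool16_t; with kPow2 = -1 (LMUL=1/2) the result doubles to 32.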
49 
50 // ================================================== MACROS
51 
52 // Generate specializations and function definitions using X macros. Although
53 // harder to read and debug, writing everything manually is too bulky.
54 
55 namespace detail { // for code folding
56 
57 // For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
58 // The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN.
59 #define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
60  X_MACRO(64, 0, 64, NAME, OP) \
61  X_MACRO(32, 0, 32, NAME, OP) \
62  X_MACRO(16, 0, 16, NAME, OP) \
63  X_MACRO(8, 0, 8, NAME, OP) \
64  X_MACRO(8, 1, 4, NAME, OP) \
65  X_MACRO(8, 2, 2, NAME, OP) \
66  X_MACRO(8, 3, 1, NAME, OP)
67 
68 // For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
69 // reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
70 // _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
71 //
72 // Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
73 // reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
74 // respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
75 // Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP
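// For instance, with the HWY_RVV_RETV_ARGVV definition further below,
// X_MACRO(uint, u, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, Add, add)
// generates an Add overload for vuint16m2_t that calls vadd_vv_u16m2.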
76 
77 // LMULS = _TRUNC: truncatable (not the smallest LMUL)
78 #define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
79  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
80  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
81  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
82  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
83  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
84  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
85 
86 #define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
87  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
88  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
89  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
90  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
91  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
92 
93 #define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
94  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
95  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
96  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
97  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
98 
99 #define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
100  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
101  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
102  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
103 
104 // LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
105 #define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
106  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
107  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
108  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
109  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
110  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
111  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
112 
113 #define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
114  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
115  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
116  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
117  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
118  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
119  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
120 
121 #define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
122  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
123  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
124  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
125  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
126  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
127 
128 #define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
129  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
130  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
131  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
132  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
133 
134 // LMULS = _EXT: not the largest LMUL
135 #define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
136  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
137  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
138  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
139  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
140  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
141  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)
142 
143 #define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
144  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
145  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
146  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
147  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
148  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)
149 
150 #define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
151  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
152  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
153  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
154  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)
155 
156 #define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
157  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
158  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
159  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)
160 
161 // LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
162 #define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
163  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
164  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
165 
166 #define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
167  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
168  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
169 
170 #define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
171  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
172  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
173 
174 #define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
175  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
176  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
177 
178 // 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
179 // 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
180 // though RISC-V LMUL must be at least SEW/64 (notice that this rules out
181 // LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
182 // one less than should be supported, with all other parameters (vector type
183 // etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
184 // returns half of what it usually would.
185 //
186 // Notice that we can only add overloads whenever there is a D argument: those
187 // are unique with respect to non-virtual-LMUL overloads because their kPow2
188 // template argument differs. Otherwise, there is no actual vuint64mf2_t, and
189 // defining another overload with the same LMUL would be an error. Thus we have
190 // a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
191 // _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
192 // functions that take a D.
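// For example, Simd<uint64_t, N, -1> would need LMUL=1/2, but vuint64mf2_t
// does not exist: the 64_VIRT entry below therefore reuses vuint64m1_t
// (LMUL=m1) for kPow2=-1, and Lanes() reports half the usual count.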
193 
194 #define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
195 
196 #define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
197  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)
198 
199 #define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
200  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)
201 
202 #define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
203  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)
204 
205 // ALL + VIRT
206 #define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
207  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
208  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
209 
210 #define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
211  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
212  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
213 
214 #define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
215  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
216  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
217 
218 #define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
219  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
220  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
221 
222 // EXT + VIRT
223 #define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
224  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
225  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
226 
227 #define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
228  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
229  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
230 
231 #define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
232  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
233  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
234 
235 #define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
236  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
237  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
238 
239 // DEMOTE + VIRT
240 #define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
241  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
242  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
243 
244 #define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
245  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
246  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
247 
248 #define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
249  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
250  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
251 
252 #define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
253  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
254  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
255 
256 // SEW for unsigned:
257 #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
258  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
259 #define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
260  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
261 #define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
262  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
263 #define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
264  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)
265 
266 // SEW for signed:
267 #define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
268  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
269 #define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
270  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
271 #define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
272  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
273 #define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
274  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)
275 
276 // SEW for float:
277 #if HWY_HAVE_FLOAT16
278 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
279  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
280 #else
281 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
282 #endif
283 #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
284  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
285 #define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
286  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)
287 
288 // Commonly used type/SEW groups:
289 #define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
290  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
291  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
292 
293 #define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
294  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
295  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
296 
297 #define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
298  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
299  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
300 
301 #define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
302  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
303  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
304 
305 #define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
306  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
307  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)
308 
309 #define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
310  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
311  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
312  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
313 
314 #define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
315  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
316  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
317  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
318 
319 #define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
320  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
321  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
322 
323 #define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
324  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
325  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
326 
327 // For all combinations of SEW:
328 #define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
329  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
330  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
331  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
332  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
333 
334 #define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
335  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
336  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
337  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
338  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
339 
340 #define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
341  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
342  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
343 
344 // Commonly used type categories:
345 #define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
346  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
347  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
348 
349 #define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
350  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
351  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
352  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
353 
354 // Assemble types for use in x-macros
355 #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
356 #define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
357 #define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
358 #define HWY_RVV_M(MLEN) vbool##MLEN##_t
359 
360 } // namespace detail
361 
362 // Until we have full intrinsic support for fractional LMUL, mixed-precision
363 // code can use LMUL 1..8 (adequate unless it needs many registers).
364 #define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
365  MLEN, NAME, OP) \
366  template <> \
367  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
368  using Lane = HWY_RVV_T(BASE, SEW); \
369  using type = ScalableTag<Lane, SHIFT>; \
370  };
371 
372 HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
373 #undef HWY_SPECIALIZE
374 
375 // ------------------------------ Lanes
376 
377 // WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
378 // vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
379 #define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
380  MLEN, NAME, OP) \
381  template <size_t N> \
382  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
383  size_t actual = v##OP##SEW##LMUL(); \
384  /* Common case of full vectors: avoid any extra instructions. */ \
385  /* actual includes LMUL, so do not shift again. */ \
386  if (detail::IsFull(d)) return actual; \
387  /* Check for virtual LMUL, e.g. "uint16mf8_t" (not provided by */ \
388  /* intrinsics). In this case the actual LMUL is 1/4, so divide by */ \
389  /* another factor of two. */ \
390  if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) actual >>= 1; \
391  return HWY_MIN(actual, N); \
392  }
393 
394 HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT)
395 #undef HWY_RVV_LANES
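// Typical usage, a strip-mined loop (sketch; assumes count is a multiple of
// Lanes(d), otherwise a remainder loop is required):
//   const ScalableTag<float> d;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     StoreU(Add(LoadU(d, in + i), LoadU(d, in + i)), d, out + i);
//   }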
396 
397 template <size_t N, int kPow2>
398 HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag */) {
399  return Lanes(Simd<uint16_t, N, kPow2>());
400 }
401 
402 // ------------------------------ Common x-macros
403 
404 // Last argument to most intrinsics. Use when the op has no d arg of its own,
405 // which means there is no user-specified cap.
406 #define HWY_RVV_AVL(SEW, SHIFT) \
407  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
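// e.g. HWY_RVV_AVL(32, 0) expands to Lanes(ScalableTag<uint32_t, 0>()), i.e.
// all lanes of an LMUL=1 vector of 32-bit elements.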
408 
409 // vector = f(vector), e.g. Not
410 #define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
411  SHIFT, MLEN, NAME, OP) \
412  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
413  return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
414  }
415 
416 // vector = f(vector, scalar), e.g. detail::AddS
417 #define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
418  SHIFT, MLEN, NAME, OP) \
419  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
420  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
421  return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
422  }
423 
424 // vector = f(vector, vector), e.g. Add
425 #define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
426  SHIFT, MLEN, NAME, OP) \
427  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
428  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
429  return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
430  }
431 
432 // ================================================== INIT
433 
434 // ------------------------------ Set
435 
436 #define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
437  MLEN, NAME, OP) \
438  template <size_t N> \
439  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
440  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
441  return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
442  }
443 
444 HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
445 HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
446 #undef HWY_RVV_SET
447 
448 // Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
449 // required for Zero and VFromD.
450 template <size_t N, int kPow2>
451 decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
452  bfloat16_t arg) {
453  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
454 }
455 
456 template <class D>
457 using VFromD = decltype(Set(D(), TFromD<D>()));
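// e.g. VFromD<ScalableTag<int32_t>> is the type returned by
// Set(ScalableTag<int32_t>(), 1), i.e. vint32m1_t for the default LMUL=1.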
458 
459 // ------------------------------ Zero
460 
461 template <typename T, size_t N, int kPow2>
462 HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
463  return Set(d, T(0));
464 }
465 
466 // ------------------------------ Undefined
467 
468 // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
469 // by it gives unpredictable results. It should only be used for maskoff, so
470 // keep it internal. For the Highway op, just use Zero (single instruction).
471 namespace detail {
472 #define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
473  SHIFT, MLEN, NAME, OP) \
474  template <size_t N> \
475  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
476  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
477  return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
478  }
479 
480 HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
481 #undef HWY_RVV_UNDEFINED
482 } // namespace detail
483 
484 template <class D>
485 HWY_API VFromD<D> Undefined(D d) {
486  return Zero(d);
487 }
488 
489 // ------------------------------ BitCast
490 
491 namespace detail {
492 
493 // Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
494 #define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
495  MLEN, NAME, OP) \
496  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
497  return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(v); /* no AVL */ \
498  }
499 HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
500 #undef HWY_RVV_TRUNC
501 
502 // Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
503 #define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
504  MLEN, NAME, OP) \
505  template <size_t N> \
506  HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
507  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
508  HWY_RVV_V(BASE, SEW, LMUL) v) { \
509  return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(v); /* no AVL */ \
510  }
511 HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
512 #undef HWY_RVV_EXT
513 
514 // For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
515 // the same as the actual input type.
516 #define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
517  SHIFT, MLEN, NAME, OP) \
518  template <size_t N> \
519  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
520  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
521  HWY_RVV_V(BASE, SEW, LMUL) v) { \
522  return v; \
523  }
524 HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
525 #undef HWY_RVV_EXT_VIRT
526 
527 // For BitCastToByte, the D arg is only to prevent duplicate definitions caused
528 // by _ALL_VIRT.
529 
530 // There is no reinterpret from u8 <-> u8, so just return.
531 #define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
532  SHIFT, MLEN, NAME, OP) \
533  template <typename T, size_t N> \
534  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
535  vuint8##LMUL##_t v) { \
536  return v; \
537  } \
538  template <size_t N> \
539  HWY_API vuint8##LMUL##_t BitCastFromByte( \
540  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
541  return v; \
542  }
543 
544 // For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
545 #define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
546  SHIFT, MLEN, NAME, OP) \
547  template <typename T, size_t N> \
548  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
549  vint8##LMUL##_t v) { \
550  return vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
551  } \
552  template <size_t N> \
553  HWY_API vint8##LMUL##_t BitCastFromByte( \
554  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
555  return vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
556  }
557 
558 // Separate u/i because clang only provides signed <-> unsigned reinterpret for
559 // the same SEW.
560 #define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
561  MLEN, NAME, OP) \
562  template <typename T, size_t N> \
563  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
564  HWY_RVV_V(BASE, SEW, LMUL) v) { \
565  return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
566  } \
567  template <size_t N> \
568  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
569  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
570  return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
571  }
572 
573 // Signed/Float: first cast to/from unsigned
574 #define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
575  SHIFT, MLEN, NAME, OP) \
576  template <typename T, size_t N> \
577  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
578  HWY_RVV_V(BASE, SEW, LMUL) v) { \
579  return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
580  v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
581  } \
582  template <size_t N> \
583  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
584  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
585  return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
586  v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
587  }
588 
589 // Additional versions for virtual LMUL using LMULH for byte vectors.
590 #define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
591  SHIFT, MLEN, NAME, OP) \
592  template <typename T, size_t N> \
593  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
594  HWY_RVV_V(BASE, SEW, LMUL) v) { \
595  return detail::Trunc(v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
596  } \
597  template <size_t N> \
598  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
599  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
600  HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
601  const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
602  return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
603  }
604 
605 // Signed/Float: first cast to/from unsigned
606 #define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
607  SHIFT, MLEN, NAME, OP) \
608  template <typename T, size_t N> \
609  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
610  HWY_RVV_V(BASE, SEW, LMUL) v) { \
611  return detail::Trunc(v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
612  v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
613  } \
614  template <size_t N> \
615  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
616  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
617  HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
618  const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
619  return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
620  v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
621  }
622 
623 HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
624 HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
625 HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
626 HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
627 HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
628 HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
629 HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
630 HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
631 
632 #undef HWY_RVV_CAST_U8
633 #undef HWY_RVV_CAST_I8
634 #undef HWY_RVV_CAST_U
635 #undef HWY_RVV_CAST_IF
636 #undef HWY_RVV_CAST_VIRT_U
637 #undef HWY_RVV_CAST_VIRT_IF
638 
639 template <size_t N, int kPow2>
640 HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
641  Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
642  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
643 }
644 
645 } // namespace detail
646 
647 template <class D, class FromV>
648 HWY_API VFromD<D> BitCast(D d, FromV v) {
649  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
650 }
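// Example: reinterpret float lanes as their raw bits (no conversion):
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du; // u32 vector with the same LMUL
//   const auto bits = BitCast(du, Set(df, 1.0f)); // each lane = 0x3F800000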
651 
652 namespace detail {
653 
654 template <class V, class DU = RebindToUnsigned<DFromV<V>>>
655 HWY_INLINE VFromD<DU> BitCastToUnsigned(V v) {
656  return BitCast(DU(), v);
657 }
658 
659 } // namespace detail
660 
661 // ------------------------------ Iota
662 
663 namespace detail {
664 
665 #define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
666  MLEN, NAME, OP) \
667  template <size_t N> \
668  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
669  return v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
670  }
671 
672 HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
673 #undef HWY_RVV_IOTA
674 
675 template <class D, class DU = RebindToUnsigned<D>>
676 HWY_INLINE VFromD<DU> Iota0(const D /*d*/) {
677  return BitCastToUnsigned(Iota0(DU()));
678 }
679 
680 } // namespace detail
681 
682 // ================================================== LOGICAL
683 
684 // ------------------------------ Not
685 
686 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)
687 
688 template <class V, HWY_IF_FLOAT_V(V)>
689 HWY_API V Not(const V v) {
690  using DF = DFromV<V>;
691  using DU = RebindToUnsigned<DF>;
692  return BitCast(DF(), Not(BitCast(DU(), v)));
693 }
694 
695 // ------------------------------ And
696 
697 // Non-vector version (ideally immediate) for use with Iota0
698 namespace detail {
699 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
700 } // namespace detail
701 
702 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)
703 
704 template <class V, HWY_IF_FLOAT_V(V)>
705 HWY_API V And(const V a, const V b) {
706  using DF = DFromV<V>;
707  using DU = RebindToUnsigned<DF>;
708  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
709 }
710 
711 // ------------------------------ Or
712 
713 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)
714 
715 template <class V, HWY_IF_FLOAT_V(V)>
716 HWY_API V Or(const V a, const V b) {
717  using DF = DFromV<V>;
718  using DU = RebindToUnsigned<DF>;
719  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
720 }
721 
722 // ------------------------------ Xor
723 
724 // Non-vector version (ideally immediate) for use with Iota0
725 namespace detail {
726 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
727 } // namespace detail
728 
729 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)
730 
731 template <class V, HWY_IF_FLOAT_V(V)>
732 HWY_API V Xor(const V a, const V b) {
733  using DF = DFromV<V>;
734  using DU = RebindToUnsigned<DF>;
735  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
736 }
737 
738 // ------------------------------ AndNot
739 
740 template <class V>
741 HWY_API V AndNot(const V not_a, const V b) {
742  return And(Not(not_a), b);
743 }
744 
745 // ------------------------------ OrAnd
746 
747 template <class V>
748 HWY_API V OrAnd(const V o, const V a1, const V a2) {
749  return Or(o, And(a1, a2));
750 }
751 
752 // ------------------------------ CopySign
753 
754 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)
755 
756 template <class V>
757 HWY_API V CopySignToAbs(const V abs, const V sign) {
758  // RVV can also handle abs < 0, so no extra action needed.
759  return CopySign(abs, sign);
760 }
761 
762 // ================================================== ARITHMETIC
763 
764 // ------------------------------ Add
765 
766 namespace detail {
767 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
768 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
769 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
770 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
771 } // namespace detail
772 
773 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
774 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)
775 
776 // ------------------------------ Sub
777 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
778 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
779 
780 // ------------------------------ SaturatedAdd
781 
782 HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
783 HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
784 
785 HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
786 HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
787 
788 // ------------------------------ SaturatedSub
789 
790 HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
791 HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
792 
793 HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
794 HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
795 
796 // ------------------------------ AverageRound
797 
798 // TODO(janwas): check vxrm rounding mode
799 HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
800 HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
801 
802 // ------------------------------ ShiftLeft[Same]
803 
804 // Intrinsics do not define .vi forms, so use .vx instead.
805 #define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
806  MLEN, NAME, OP) \
807  template <int kBits> \
808  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
809  return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT)); \
810  } \
811  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
812  NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
813  return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
814  HWY_RVV_AVL(SEW, SHIFT)); \
815  }
816 
817 HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)
818 
819 // ------------------------------ ShiftRight[Same]
820 
821 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
822 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
823 
824 #undef HWY_RVV_SHIFT
825 
826 // ------------------------------ SumsOf8 (ShiftRight, Add)
827 template <class VU8>
828 HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
829  const DFromV<VU8> du8;
830  const RepartitionToWide<decltype(du8)> du16;
831  const RepartitionToWide<decltype(du16)> du32;
832  const RepartitionToWide<decltype(du32)> du64;
833  using VU16 = VFromD<decltype(du16)>;
834 
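// (Variable names list the hex byte indices held in each u16 lane, highest
// lane first; 'z' marks a lane known to be zero, 'x' a lane whose value is
// irrelevant to the final result.)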
835  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
836  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
837  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
838 
839  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
840  BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
841  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
842  Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
843  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
844  BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
845  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
846  Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
847  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
848 }
849 
850 // ------------------------------ RotateRight
851 template <int kBits, class V>
852 HWY_API V RotateRight(const V v) {
853  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
854  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
855  if (kBits == 0) return v;
856  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
857 }
858 
859 // ------------------------------ Shl
860 #define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
861  SHIFT, MLEN, NAME, OP) \
862  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
863  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
864  return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT)); \
865  }
866 
867 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)
868 
869 #define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
870  SHIFT, MLEN, NAME, OP) \
871  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
872  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
873  return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits), \
874  HWY_RVV_AVL(SEW, SHIFT)); \
875  }
876 
877 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)
878 
879 // ------------------------------ Shr
880 
881 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
882 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
883 
884 #undef HWY_RVV_SHIFT_II
885 #undef HWY_RVV_SHIFT_VV
886 
887 // ------------------------------ Min
888 
889 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
890 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
891 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)
892 
893 // ------------------------------ Max
894 
895 namespace detail {
896 
897 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
898 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
899 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)
900 
901 } // namespace detail
902 
903 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
904 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
905 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
906 
907 // ------------------------------ Mul
908 
909 // Only for internal use (Highway only promises Mul for 16/32-bit inputs).
910 // Used by MulLower.
911 namespace detail {
912 HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
913 } // namespace detail
914 
915 HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
916 HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
917 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
918 
919 // ------------------------------ MulHigh
920 
921 // Only for internal use (Highway only promises MulHigh for 16-bit inputs).
922 // Used by MulEven; vwmul does not work for m8.
923 namespace detail {
924 HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
925 HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
926 HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
927 } // namespace detail
928 
929 HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
930 HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
931 
932 // ------------------------------ MulFixedPoint15
933 HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL)
934 
935 // ------------------------------ Div
936 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
937 
938 // ------------------------------ ApproximateReciprocal
939 HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)
940 
941 // ------------------------------ Sqrt
942 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)
943 
944 // ------------------------------ ApproximateReciprocalSqrt
945 HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)
946 
947 // ------------------------------ MulAdd
948 // Note: op is still named vv, not vvv.
949 #define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
950  MLEN, NAME, OP) \
951  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
952  NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
953  HWY_RVV_V(BASE, SEW, LMUL) add) { \
954  return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \
955  }
956 
957 HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)
958 
959 // ------------------------------ NegMulAdd
960 HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)
961 
962 // ------------------------------ MulSub
963 HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)
964 
965 // ------------------------------ NegMulSub
966 HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
967 
968 #undef HWY_RVV_FMA
969 
970 // ================================================== COMPARE
971 
972 // Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
973 // vboolXX_t is a power of two divisor for vector bits. SEW 8 / LMUL 1 = 1/8th
974 // of all bits; SEW 8 / LMUL 4 = half of all bits.
975 
976 // mask = f(vector, vector)
977 #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
978  SHIFT, MLEN, NAME, OP) \
979  HWY_API HWY_RVV_M(MLEN) \
980  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
981  return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b, \
982  HWY_RVV_AVL(SEW, SHIFT)); \
983  }
984 
985 // mask = f(vector, scalar)
986 #define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
987  SHIFT, MLEN, NAME, OP) \
988  HWY_API HWY_RVV_M(MLEN) \
989  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
990  return v##OP##_##CHAR##SEW##LMUL##_b##MLEN(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
991  }
992 
993 // ------------------------------ Eq
994 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
995 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
996 
997 namespace detail {
998 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
999 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
1000 } // namespace detail
1001 
1002 // ------------------------------ Ne
1003 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
1004 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
1005 
1006 namespace detail {
1007 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
1008 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
1009 } // namespace detail
1010 
1011 // ------------------------------ Lt
1012 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
1013 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
1014 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
1015 
1016 namespace detail {
1017 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
1018 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
1019 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
1020 } // namespace detail
1021 
1022 // ------------------------------ Le
1023 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
1024 
1025 #undef HWY_RVV_RETM_ARGVV
1026 #undef HWY_RVV_RETM_ARGVS
1027 
1028 // ------------------------------ Gt/Ge
1029 
1030 template <class V>
1031 HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
1032  return Le(b, a);
1033 }
1034 
1035 template <class V>
1036 HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
1037  return Lt(b, a);
1038 }
1039 
1040 // ------------------------------ TestBit
1041 template <class V>
1042 HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
1043  return detail::NeS(And(a, bit), 0);
1044 }
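// Example: TestBit(Set(d, 5), Set(d, 4)) is all-true because bit 2 (value 4)
// is set in 5.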
1045 
1046 // ------------------------------ Not
1047 
1048 // mask = f(mask)
1049 #define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
1050  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
1051  return vm##OP##_m_b##MLEN(m, ~0ull); \
1052  }
1053 
1054 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not)
1055 
1056 #undef HWY_RVV_RETM_ARGM
1057 
1058 // ------------------------------ And
1059 
1060 // mask = f(mask_a, mask_b) (note arg2,arg1 order!)
1061 #define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
1062  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
1063  return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
1064  }
1065 
1066 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)
1067 
1068 // ------------------------------ AndNot
1069 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andnot)
1070 
1071 // ------------------------------ Or
1072 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
1073 
1074 // ------------------------------ Xor
1075 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
1076 
1077 #undef HWY_RVV_RETM_ARGMM
1078 
1079 // ------------------------------ IfThenElse
1080 #define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1081  SHIFT, MLEN, NAME, OP) \
1082  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1083  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
1084  HWY_RVV_V(BASE, SEW, LMUL) no) { \
1085  return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes, HWY_RVV_AVL(SEW, SHIFT)); \
1086  }
1087 
1088 HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)
1089 
1090 #undef HWY_RVV_IF_THEN_ELSE
1091 
1092 // ------------------------------ IfThenElseZero
1093 template <class M, class V>
1094 HWY_API V IfThenElseZero(const M mask, const V yes) {
1095  return IfThenElse(mask, yes, Zero(DFromV<V>()));
1096 }
1097 
1098 // ------------------------------ IfThenZeroElse
1099 
1100 #define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
1101  LMULH, SHIFT, MLEN, NAME, OP) \
1102  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1103  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
1104  return v##OP##_##CHAR##SEW##LMUL(m, no, 0, HWY_RVV_AVL(SEW, SHIFT)); \
1105  }
1106 
1107 HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
1108 HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
1109 
1110 #undef HWY_RVV_IF_THEN_ZERO_ELSE
1111 
1112 // ------------------------------ MaskFromVec
1113 
1114 template <class V>
1115 HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) {
1116  return detail::NeS(v, 0);
1117 }
1118 
1119 template <class D>
1120 using MFromD = decltype(MaskFromVec(Zero(D())));
1121 
1122 template <class D, typename MFrom>
1123 HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
1124  // No need to check lane size/LMUL are the same: if not, casting MFrom to
1125  // MFromD<D> would fail.
1126  return mask;
1127 }
1128 
1129 // ------------------------------ VecFromMask
1130 
1131 namespace detail {
1132 #define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1133  SHIFT, MLEN, NAME, OP) \
1134  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1135  NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) { \
1136  return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1, \
1137  HWY_RVV_AVL(SEW, SHIFT)); \
1138  }
1139 
1140 HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx, _ALL)
1141 #undef HWY_RVV_VEC_FROM_MASK
1142 } // namespace detail
1143 
1144 template <class D, HWY_IF_NOT_FLOAT_D(D)>
1145 HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
1146  return detail::SubS(Zero(d), mask);
1147 }
1148 
1149 template <class D, HWY_IF_FLOAT_D(D)>
1150 HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
1151  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
1152 }
1153 
1154 // ------------------------------ IfVecThenElse (MaskFromVec)
1155 
1156 template <class V>
1157 HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
1158  return IfThenElse(MaskFromVec(mask), yes, no);
1159 }
1160 
1161 // ------------------------------ ZeroIfNegative
1162 template <class V>
1163 HWY_API V ZeroIfNegative(const V v) {
1164  return IfThenZeroElse(detail::LtS(v, 0), v);
1165 }
1166 
1167 // ------------------------------ BroadcastSignBit
1168 template <class V>
1169 HWY_API V BroadcastSignBit(const V v) {
1170  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
1171 }
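// e.g. for int32_t lanes this is an arithmetic ShiftRight<31>: all-zero lanes
// for non-negative inputs, all-ones lanes for negative inputs.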
1172 
1173 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1174 template <class V>
1175 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
1176  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
1177  const DFromV<V> d;
1178  const RebindToSigned<decltype(d)> di;
1179 
1180  MFromD<decltype(d)> m =
1181  MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
1182  return IfThenElse(m, yes, no);
1183 }
1184 
1185 // ------------------------------ FindFirstTrue
1186 
1187 #define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1188  template <class D> \
1189  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
1190  static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1191  return vfirst_m_b##MLEN(m, Lanes(d)); \
1192  }
1193 
1194 HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
1195 #undef HWY_RVV_FIND_FIRST_TRUE
1196 
1197 // ------------------------------ AllFalse
1198 template <class D>
1199 HWY_API bool AllFalse(D d, MFromD<D> m) {
1200  return FindFirstTrue(d, m) < 0;
1201 }
1202 
1203 // ------------------------------ AllTrue
1204 
1205 #define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1206  template <class D> \
1207  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
1208  static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1209  return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d))); \
1210  }
1211 
1213 #undef HWY_RVV_ALL_TRUE
1214 
1215 // ------------------------------ CountTrue
1216 
1217 #define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1218  template <class D> \
1219  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
1220  static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1221  return vcpop_m_b##MLEN(m, Lanes(d)); \
1222  }
1223 
1224 HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
1225 #undef HWY_RVV_COUNT_TRUE
1226 
1227 // ================================================== MEMORY
1228 
1229 // ------------------------------ Load
1230 
1231 #define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1232  MLEN, NAME, OP) \
1233  template <size_t N> \
1234  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1235  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1236  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1237  return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \
1238  }
1239 HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
1240 #undef HWY_RVV_LOAD
1241 
1242 // There is no native BF16, treat as uint16_t.
1243 template <size_t N, int kPow2>
1244 HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
1245  const bfloat16_t* HWY_RESTRICT p) {
1246  return Load(RebindToUnsigned<decltype(d)>(),
1247  reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
1248 }
1249 
1250 template <size_t N, int kPow2>
1251 HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
1252  Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
1253  Store(v, RebindToUnsigned<decltype(d)>(),
1254  reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
1255 }
1256 
1257 // ------------------------------ LoadU
1258 
1259 // RVV only requires lane alignment, not natural alignment of the entire vector.
1260 template <class D>
1261 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1262  return Load(d, p);
1263 }
1264 
1265 // ------------------------------ MaskedLoad
1266 
1267 #define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1268  SHIFT, MLEN, NAME, OP) \
1269  template <size_t N> \
1270  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1271  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1272  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1273  return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d)); \
1274  }
1275 HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
1276 #undef HWY_RVV_MASKED_LOAD
1277 
1278 // ------------------------------ Store
1279 
1280 #define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1281  MLEN, NAME, OP) \
1282  template <size_t N> \
1283  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1284  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1285  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1286  return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \
1287  }
1288 HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
1289 #undef HWY_RVV_STORE
1290 
1291 // ------------------------------ BlendedStore
1292 
1293 #define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1294  SHIFT, MLEN, NAME, OP) \
1295  template <size_t N> \
1296  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1297  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1298  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1299  return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \
1300  }
1301 HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
1302 #undef HWY_RVV_BLENDED_STORE
1303 
1304 namespace detail {
1305 
1306 #define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1307  MLEN, NAME, OP) \
1308  template <size_t N> \
1309  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
1310  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
1311  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1312  return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \
1313  }
1314 HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
1315 #undef HWY_RVV_STOREN
1316 
1317 } // namespace detail
1318 
1319 // ------------------------------ StoreU
1320 
1321 // RVV only requires lane alignment, not natural alignment of the entire vector.
1322 template <class V, class D>
1323 HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1324  Store(v, d, p);
1325 }
1326 
1327 // ------------------------------ Stream
1328 template <class V, class D, typename T>
1329 HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
1330  Store(v, d, aligned);
1331 }
1332 
1333 // ------------------------------ ScatterOffset
1334 
1335 #define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1336  SHIFT, MLEN, NAME, OP) \
1337  template <size_t N> \
1338  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1339  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1340  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1341  HWY_RVV_V(int, SEW, LMUL) offset) { \
1342  return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
1343  base, detail::BitCastToUnsigned(offset), v, Lanes(d)); \
1344  }
1345 HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
1346 #undef HWY_RVV_SCATTER
1347 
1348 // ------------------------------ ScatterIndex
1349 
1350 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
1351 HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
1352  const VFromD<RebindToSigned<D>> index) {
1353  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
1354 }
1355 
1356 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
1357 HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
1358  const VFromD<RebindToSigned<D>> index) {
1359  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
1360 }
1361 
1362 // ------------------------------ GatherOffset
1363 
1364 #define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1365  MLEN, NAME, OP) \
1366  template <size_t N> \
1367  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1368  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1369  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1370  HWY_RVV_V(int, SEW, LMUL) offset) { \
1371  return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
1372  base, detail::BitCastToUnsigned(offset), Lanes(d)); \
1373  }
1374 HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
1375 #undef HWY_RVV_GATHER
1376 
1377 // ------------------------------ GatherIndex
1378 
1379 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
1380 HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
1381  const VFromD<RebindToSigned<D>> index) {
1382  return GatherOffset(d, base, ShiftLeft<2>(index));
1383 }
1384 
1385 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
1386 HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
1387  const VFromD<RebindToSigned<D>> index) {
1388  return GatherOffset(d, base, ShiftLeft<3>(index));
1389 }
1390 
1391 // ------------------------------ StoreInterleaved3
1392 
1393 #define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1394  MLEN, NAME, OP) \
1395  template <size_t N> \
1396  HWY_API void NAME( \
1397  HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
1398  HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1399  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
1400  return v##OP##e8_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, Lanes(d)); \
1401  }
1402 // Segments are limited to 8 registers, so we can only go up to LMUL=2.
1403 HWY_RVV_STORE3(uint, u, 8, _, _, mf8, _, _, /*kShift=*/-3, 64,
1404  StoreInterleaved3, sseg3)
1405 HWY_RVV_STORE3(uint, u, 8, _, _, mf4, _, _, /*kShift=*/-2, 32,
1406  StoreInterleaved3, sseg3)
1407 HWY_RVV_STORE3(uint, u, 8, _, _, mf2, _, _, /*kShift=*/-1, 16,
1408  StoreInterleaved3, sseg3)
1409 HWY_RVV_STORE3(uint, u, 8, _, _, m1, _, _, /*kShift=*/0, 8, StoreInterleaved3,
1410  sseg3)
1411 HWY_RVV_STORE3(uint, u, 8, _, _, m2, _, _, /*kShift=*/1, 4, StoreInterleaved3,
1412  sseg3)
1413 #undef HWY_RVV_STORE3
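// Example: interleaving three u8 planes into RGB order (d is a u8 tag):
//   StoreInterleaved3(r, g, b, d, pixels); // pixels = r0 g0 b0 r1 g1 b1 ...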
1414 
1415 // ------------------------------ StoreInterleaved4
1416 
1417 #define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1418  MLEN, NAME, OP) \
1419  template <size_t N> \
1420  HWY_API void NAME( \
1421  HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
1422  HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
1423  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1424  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \
1425  return v##OP##e8_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, Lanes(d)); \
1426  }
1427 // Segments are limited to 8 registers, so we can only go up to LMUL=2.
1428 HWY_RVV_STORE4(uint, u, 8, _, _, mf8, _, _, /*kShift=*/-3, 64,
1429  StoreInterleaved4, sseg4)
1430 HWY_RVV_STORE4(uint, u, 8, _, _, mf4, _, _, /*kShift=*/-2, 32,
1431  StoreInterleaved4, sseg4)
1432 HWY_RVV_STORE4(uint, u, 8, _, _, mf2, _, _, /*kShift=*/-1, 16,
1433  StoreInterleaved4, sseg4)
1434 HWY_RVV_STORE4(uint, u, 8, _, _, m1, _, _, /*kShift=*/0, 8, StoreInterleaved4,
1435  sseg4)
1436 HWY_RVV_STORE4(uint, u, 8, _, _, m2, _, _, /*kShift=*/1, 4, StoreInterleaved4,
1437  sseg4)
1438 
1439 #undef HWY_RVV_STORE4
1440 
1441 // ================================================== CONVERT
1442 
1443 // ------------------------------ PromoteTo
1444 
1445 // SEW is for the input so we can use F16 (no-op if not supported).
1446 #define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1447  SHIFT, MLEN, NAME, OP) \
1448  template <size_t N> \
1449  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
1450  HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1451  return OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
1452  }
1453 
1454 HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1455 HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1456 HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
1457 HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1458 HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1459 HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
1460 HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1461 HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
1462 #undef HWY_RVV_PROMOTE
1463 
1464 // The above X-macro cannot handle 4x promotion nor type switching.
1465 // TODO(janwas): use BASE2 arg to allow the latter.
1466 #define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
1467  SHIFT, ADD) \
1468  template <size_t N> \
1469  HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
1470  PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
1471  HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
1472  return OP##CHAR##BITS##LMUL(v, Lanes(d)); \
1473  }
1474 
1475 #define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1476  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
1477  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
1478  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
1479  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
1480  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
1481 
1482 #define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
1483  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
1484  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
1485  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
1486  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
1487  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
1488 
1489 HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
1490 HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
1491 
1492 // i32 to f64
1493 HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
1494 
1495 #undef HWY_RVV_PROMOTE_X4
1496 #undef HWY_RVV_PROMOTE_X2
1497 #undef HWY_RVV_PROMOTE
1498 
1499 // Unsigned to signed: cast for unsigned promote.
1500 template <size_t N, int kPow2>
1501 HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
1502  VFromD<Rebind<uint8_t, decltype(d)>> v)
1503  -> VFromD<decltype(d)> {
1504  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1505 }
1506 
1507 template <size_t N, int kPow2>
1508 HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1509  VFromD<Rebind<uint8_t, decltype(d)>> v)
1510  -> VFromD<decltype(d)> {
1511  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1512 }
1513 
1514 template <size_t N, int kPow2>
1515 HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1516  VFromD<Rebind<uint16_t, decltype(d)>> v)
1517  -> VFromD<decltype(d)> {
1518  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1519 }
1520 
1521 template <size_t N, int kPow2>
1522 HWY_API auto PromoteTo(Simd<float, N, kPow2> d,
1523  VFromD<Rebind<bfloat16_t, decltype(d)>> v)
1524  -> VFromD<decltype(d)> {
1525  const RebindToSigned<decltype(d)> di32;
1526  const Rebind<uint16_t, decltype(d)> du16;
1527  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
1528 }
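// Worked example of the bf16 promotion above (illustrative values): the bf16
// bit pattern 0x3F80 (the upper half of 1.0f) zero-extends to i32 0x00003F80;
// ShiftLeft<16> yields 0x3F800000, which BitCast(d, ...) reinterprets as the
// f32 value 1.0f.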
1529 
1530 // ------------------------------ DemoteTo U
1531 
1532 // SEW is for the source so we can use _DEMOTE.
1533 #define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1534  MLEN, NAME, OP) \
1535  template <size_t N> \
1536  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1537  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1538  return OP##CHAR##SEWH##LMULH(v, 0, Lanes(d)); \
1539  } \
1540  template <size_t N> \
1541  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME##Shr16( \
1542  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1543  return OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
1544  }
1545 
1546 // Unsigned -> unsigned (also used for bf16)
1547 namespace detail {
1548 HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1549 HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
1550 } // namespace detail
1551 
1552 // SEW is for the source so we can use _DEMOTE.
1553 #define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1554  SHIFT, MLEN, NAME, OP) \
1555  template <size_t N> \
1556  HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
1557  HWY_RVV_D(uint, SEWH, N, SHIFT - 1) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1558  /* First clamp negative numbers to zero to match x86 packus. */ \
1559  return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0))); \
1560  }
1561 HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1562 HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
1563 #undef HWY_RVV_DEMOTE_I_TO_U
1564 
1565 template <size_t N>
1566 HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
1567  return vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, Lanes(d));
1568 }
1569 template <size_t N>
1570 HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
1571  return vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, Lanes(d));
1572 }
1573 template <size_t N>
1574 HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
1575  return vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, Lanes(d));
1576 }
1577 template <size_t N>
1578 HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
1579  return vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, Lanes(d));
1580 }
1581 template <size_t N>
1582 HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
1583  return vnclipu_wx_u8m2(DemoteTo(Simd<uint16_t, N, 2>(), v), 0, Lanes(d));
1584 }
1585 
1586 HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
1587  const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
1588  return vnclipu_wx_u8mf8(vnclipu_wx_u16mf4(v, 0, avl), 0, avl);
1589 }
1590 HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
1591  const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
1592  return vnclipu_wx_u8mf4(vnclipu_wx_u16mf2(v, 0, avl), 0, avl);
1593 }
1594 HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
1595  const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
1596  return vnclipu_wx_u8mf2(vnclipu_wx_u16m1(v, 0, avl), 0, avl);
1597 }
1598 HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
1599  const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
1600  return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl);
1601 }
1602 HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
1603  const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
1604  return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
1605 }
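// Worked example: vnclipu is a narrowing right shift with unsigned
// saturation. With shift amount 0, a u32 lane holding 0x12345 saturates to
// u16 0xFFFF and then to u8 0xFF, whereas 0x7B passes through both steps
// unchanged.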
1606 
1607 // ------------------------------ DemoteTo I
1608 
1609 HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1610 HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
1611 
1612 template <size_t N>
1613 HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
1614  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
1615 }
1616 template <size_t N>
1617 HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
1618  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
1619 }
1620 template <size_t N>
1621 HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
1622  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
1623 }
1624 template <size_t N>
1625 HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
1626  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
1627 }
1628 template <size_t N>
1629 HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
1630  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
1631 }
1632 
1633 #undef HWY_RVV_DEMOTE
1634 
1635 // ------------------------------ DemoteTo F
1636 
1637 // SEW is for the source so we can use _DEMOTE.
1638 #define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1639  SHIFT, MLEN, NAME, OP) \
1640  template <size_t N> \
1641  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
1642  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
1643  return OP##SEWH##LMULH(v, Lanes(d)); \
1644  }
1645 
1646 #if HWY_HAVE_FLOAT16
1647 HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1648  _DEMOTE_VIRT)
1649 #endif
1650 HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
1651  _DEMOTE_VIRT)
1652 #undef HWY_RVV_DEMOTE_F
1653 
1654 // TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
1655 template <size_t N>
1656 HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
1657  return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
1658 }
1659 template <size_t N>
1660 HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
1661  return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
1662 }
1663 template <size_t N>
1664 HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
1665  return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
1666 }
1667 template <size_t N>
1668 HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
1669  return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
1670 }
1671 template <size_t N>
1672 HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
1673  return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
1674 }
1675 
1676 template <size_t N, int kPow2>
1679  const RebindToUnsigned<decltype(d)> du16;
1680  const Rebind<uint32_t, decltype(d)> du32;
1681  return detail::DemoteToShr16(du16, BitCast(du32, v));
1682 }
1683 
1684 // ------------------------------ ConvertTo F
1685 
1686 #define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1687  SHIFT, MLEN, NAME, OP) \
1688  template <size_t N> \
1689  HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
1690  HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
1691  return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
1692  } \
1693  /* Truncates (rounds toward zero). */ \
1694  template <size_t N> \
1695  HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
1696  HWY_RVV_V(BASE, SEW, LMUL) v) { \
1697  return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \
1698  } \
1699 // API only requires f32 but we provide f64 for internal use.
1700 HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
1701 #undef HWY_RVV_CONVERT
1702 
1703 // Uses default rounding mode. Must be separate because there is no D arg.
1704 #define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1705  SHIFT, MLEN, NAME, OP) \
1706  HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1707  return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
1708  }
1709 HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
1710 #undef HWY_RVV_NEAREST
1711 
1712 // ================================================== COMBINE
1713 
1714 namespace detail {
1715 
1716 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
1717 // offsets are implicitly relative to the start of their 128-bit block.
1718 template <typename T, size_t N, int kPow2>
1719 size_t LanesPerBlock(Simd<T, N, kPow2> d) {
1720  size_t lpb = 16 / sizeof(T);
1721  if (IsFull(d)) return lpb;
1722  // Also honor the user-specified (constexpr) N limit.
1723  lpb = HWY_MIN(lpb, N);
1724  // No fraction, we're done.
1725  if (kPow2 >= 0) return lpb;
1726  // Fractional LMUL: Lanes(d) may be smaller than lpb, so honor that.
1727  return HWY_MIN(lpb, Lanes(d));
1728 }
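// Example: for T = uint16_t, lpb starts as 16 / 2 = 8 lanes per 128-bit
// block; a user cap of N = 4, or a fractional-LMUL Lanes(d) of 2, reduces the
// result accordingly.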
1729 
1730 template <class D, class V>
1731 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
1732  using T = MakeUnsigned<TFromD<D>>;
1733  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
1734 }
1735 
1736 template <size_t kLanes, class D>
1737 MFromD<D> FirstNPerBlock(D /* tag */) {
1738  const RebindToUnsigned<D> du;
1739  const RebindToSigned<D> di;
1740  const auto idx_mod = AndS(Iota0(du), LanesPerBlock(du) - 1);
1741  return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
1742 }
1743 
1744 // vector = f(vector, vector, size_t)
1745 #define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1746  MLEN, NAME, OP) \
1747  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1748  NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
1749  size_t lanes) { \
1750  return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
1751  HWY_RVV_AVL(SEW, SHIFT)); \
1752  }
1753 
1754 HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup, _ALL)
1755 HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown, _ALL)
1756 
1757 #undef HWY_RVV_SLIDE
1758 
1759 } // namespace detail
1760 
1761 // ------------------------------ ConcatUpperLower
1762 template <class D, class V>
1763 HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
1764  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
1765 }
1766 
1767 // ------------------------------ ConcatLowerLower
1768 template <class D, class V>
1769 HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
1770  return detail::SlideUp(lo, hi, Lanes(d) / 2);
1771 }
1772 
1773 // ------------------------------ ConcatUpperUpper
1774 template <class D, class V>
1775 HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
1776  // Move upper half into lower
1777  const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
1778  return ConcatUpperLower(d, hi, lo_down);
1779 }
1780 
1781 // ------------------------------ ConcatLowerUpper
1782 template <class D, class V>
1783 HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
1784  // Move half of both inputs to the other half
1785  const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2);
1786  const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
1787  return ConcatUpperLower(d, hi_up, lo_down);
1788 }
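// Example with 4 lanes: hi = {h0,h1,h2,h3}, lo = {l0,l1,l2,l3}. SlideUp makes
// hi_up = {h0,h1,h0,h1} and SlideDown makes lo_down = {l2,l3,..}; blending the
// lower half of lo_down with the upper half of hi_up gives {l2,l3,h0,h1}.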
1789 
1790 // ------------------------------ Combine
1791 template <class D2, class V>
1792 HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
1793  return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
1794  Lanes(d2) / 2);
1795 }
1796 
1797 // ------------------------------ ZeroExtendVector
1798 
1799 template <class D2, class V>
1800 HWY_API VFromD<D2> ZeroExtendVector(const D2 d2, const V lo) {
1801  return Combine(d2, Xor(lo, lo), lo);
1802 }
1803 
1804 // ------------------------------ Lower/UpperHalf
1805 
1806 namespace detail {
1807 
1808 // RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
1809 // that SEW = sizeof(T)*8 and LMUL = 1 << Pow2().
1810 template <class D>
1811 constexpr bool IsSupportedLMUL(D d) {
1812  return (size_t{1} << (Pow2(d) + 3)) >= sizeof(TFromD<D>);
1813 }
1814 
1815 } // namespace detail
1816 
1817 // If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
1818 template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
1819 HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
1820  return detail::Trunc(v);
1821 }
1822 
1823 // Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
1824 // the hardware may set "vill" if we attempt such an LMUL. However, the V
1825 // extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
1826 // still makes sense to have half of an SEW=64 vector. We instead just return
1827 // the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
1828 template <class DH, class V,
1829  hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
1830 HWY_API V LowerHalf(const DH /* tag */, const V v) {
1831  return v;
1832 }
1833 
1834 // Same, but without D arg
1835 template <class V>
1836 HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
1837  return LowerHalf(Half<DFromV<V>>(), v);
1838 }
1839 
1840 template <class DH>
1841 HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
1842  return LowerHalf(d2, detail::SlideDown(v, v, Lanes(d2)));
1843 }
1844 
1845 // ================================================== SWIZZLE
1846 
1847 namespace detail {
1848 // A dedicated instruction for sliding by a single lane is presumably faster.
1849 #define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1850  MLEN, NAME, OP) \
1851  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1852  return v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
1853  }
1854 
1855 HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
1856 HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
1857 HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
1858 HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
1859 #undef HWY_RVV_SLIDE1
1860 } // namespace detail
1861 
1862 // ------------------------------ GetLane
1863 
1864 #define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1865  SHIFT, MLEN, NAME, OP) \
1866  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1867  return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
1868  }
1869 
1870 HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
1871 HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
1872 #undef HWY_RVV_GET_LANE
1873 
1874 // ------------------------------ OddEven
1875 template <class V>
1876 HWY_API V OddEven(const V a, const V b) {
1877  const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
1878  const auto is_even = detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
1879  return IfThenElse(is_even, b, a);
1880 }
1881 
1882 // ------------------------------ DupEven (OddEven)
1883 template <class V>
1884 HWY_API V DupEven(const V v) {
1885  const V up = detail::Slide1Up(v);
1886  return OddEven(up, v);
1887 }
1888 
1889 // ------------------------------ DupOdd (OddEven)
1890 template <class V>
1891 HWY_API V DupOdd(const V v) {
1892  const V down = detail::Slide1Down(v);
1893  return OddEven(v, down);
1894 }
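// Example with 4 lanes: v = {a,b,c,d} yields DupEven(v) = {a,a,c,c} and
// DupOdd(v) = {b,b,d,d}.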
1895 
1896 // ------------------------------ OddEvenBlocks
1897 template <class V>
1898 HWY_API V OddEvenBlocks(const V a, const V b) {
1899  const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
1900  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
1901  const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
1902  const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
1903  return IfThenElse(is_even, b, a);
1904 }
1905 
1906 // ------------------------------ SwapAdjacentBlocks
1907 
1908 template <class V>
1909 HWY_API V SwapAdjacentBlocks(const V v) {
1910  const DFromV<V> d;
1911  const size_t lpb = detail::LanesPerBlock(d);
1912  const V down = detail::SlideDown(v, v, lpb);
1913  const V up = detail::SlideUp(v, v, lpb);
1914  return OddEvenBlocks(up, down);
1915 }
1916 
1917 // ------------------------------ TableLookupLanes
1918 
1919 template <class D, class VI>
1920 HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
1921  static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
1922  const RebindToUnsigned<decltype(d)> du; // instead of <D>: avoids unused d.
1923  const auto indices = BitCast(du, vec);
1924 #if HWY_IS_DEBUG_BUILD
1925  HWY_DASSERT(AllTrue(du, detail::LtS(indices, Lanes(d))));
1926 #endif
1927  return indices;
1928 }
1929 
1930 template <class D, typename TI>
1931 HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
1932  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
1933  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
1934 }
1935 
1936 // Lane types smaller than 32 bits are not part of the Highway API, but are
1937 // used in Broadcast. This limits VLMAX to 2048! We could instead use vrgatherei16.
1938 #define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1939  MLEN, NAME, OP) \
1940  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1941  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
1942  return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT)); \
1943  }
1944 
1945 HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
1946 #undef HWY_RVV_TABLE
1947 
1948 // ------------------------------ ConcatOdd (TableLookupLanes)
1949 template <class D, class V>
1950 HWY_API V ConcatOdd(D d, const V hi, const V lo) {
1951  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
1952  const auto iota = detail::Iota0(du);
1953  const auto idx = detail::AddS(Add(iota, iota), 1);
1954  const auto lo_odd = TableLookupLanes(lo, idx);
1955  const auto hi_odd = TableLookupLanes(hi, idx);
1956  return detail::SlideUp(lo_odd, hi_odd, Lanes(d) / 2);
1957 }
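// Example with 4 lanes: idx = {1,3,5,7} = 2*i + 1. The gathers produce
// {l1,l3,..} and {h1,h3,..}, and SlideUp by Lanes(d)/2 merges them into
// {l1,l3,h1,h3}.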
1958 
1959 // ------------------------------ ConcatEven (TableLookupLanes)
1960 template <class D, class V>
1961 HWY_API V ConcatEven(D d, const V hi, const V lo) {
1962  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
1963  const auto iota = detail::Iota0(du);
1964  const auto idx = Add(iota, iota);
1965  const auto lo_even = TableLookupLanes(lo, idx);
1966  const auto hi_even = TableLookupLanes(hi, idx);
1967  return detail::SlideUp(lo_even, hi_even, Lanes(d) / 2);
1968 }
1969 
1970 // ------------------------------ Reverse (TableLookupLanes)
1971 template <class D>
1972 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
1973  const RebindToUnsigned<D> du;
1974  using TU = TFromD<decltype(du)>;
1975  const size_t N = Lanes(du);
1976  const auto idx =
1977  detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
1978  return TableLookupLanes(v, idx);
1979 }
1980 
1981 // ------------------------------ Reverse2 (RotateRight, OddEven)
1982 
1983 // Shifting and adding requires fewer instructions than blending, but casting to
1984 // u32 only works for LMUL in [1/2, 8].
1985 template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -1, 3)>
1986 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
1987  const Repartition<uint32_t, D> du32;
1988  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
1989 }
1990 // For LMUL < 1/2, we can extend and then truncate.
1991 template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -3, -2)>
1992 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
1993  const Twice<decltype(d)> d2;
1994  const Twice<decltype(d2)> d4;
1995  const Repartition<uint32_t, decltype(d4)> du32;
1996  const auto vx = detail::Ext(d4, detail::Ext(d2, v));
1997  const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx)));
1998  return detail::Trunc(detail::Trunc(rx));
1999 }
2000 
2001 // Shifting and adding requires fewer instructions than blending, but casting to
2002 // u64 does not work for LMUL < 1.
2003 template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, 0, 3)>
2004 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2005  const Repartition<uint64_t, decltype(d)> du64;
2006  return BitCast(d, RotateRight<32>(BitCast(du64, v)));
2007 }
2008 
2009 // For fractions, we can extend and then truncate.
2010 template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, -2, -1)>
2011 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
2012  const Twice<decltype(d)> d2;
2013  const Twice<decltype(d2)> d4;
2014  const Repartition<uint64_t, decltype(d4)> du64;
2015  const auto vx = detail::Ext(d4, detail::Ext(d2, v));
2016  const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx)));
2017  return detail::Trunc(detail::Trunc(rx));
2018 }
2019 
2020 template <class D, class V = VFromD<D>, HWY_IF_LANE_SIZE_D(D, 8)>
2021 HWY_API V Reverse2(D /* tag */, const V v) {
2022  const V up = detail::Slide1Up(v);
2023  const V down = detail::Slide1Down(v);
2024  return OddEven(up, down);
2025 }
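// Example: v = {a,b,c,d} (64-bit lanes) gives up = {.,a,b,c} and
// down = {b,c,d,.}; OddEven takes down in even lanes and up in odd lanes,
// producing {b,a,d,c}.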
2026 
2027 // ------------------------------ Reverse4 (TableLookupLanes)
2028 
2029 template <class D>
2030 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2031  const RebindToUnsigned<D> du;
2032  const auto idx = detail::XorS(detail::Iota0(du), 3);
2033  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2034 }
2035 
2036 // ------------------------------ Reverse8 (TableLookupLanes)
2037 
2038 template <class D>
2039 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2040  const RebindToUnsigned<D> du;
2041  const auto idx = detail::XorS(detail::Iota0(du), 7);
2042  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
2043 }
2044 
2045 // ------------------------------ ReverseBlocks (Reverse, Shuffle01)
2046 template <class D, class V = VFromD<D>>
2047 HWY_API V ReverseBlocks(D d, V v) {
2048  const Repartition<uint64_t, D> du64;
2049  const size_t N = Lanes(du64);
2050  const auto rev =
2051  detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
2052  // Swap lo/hi u64 within each block
2053  const auto idx = detail::XorS(rev, 1);
2054  return BitCast(d, TableLookupLanes(BitCast(du64, v), idx));
2055 }
2056 
2057 // ------------------------------ Compress
2058 
2059 template <typename T>
2060 struct CompressIsPartition {
2061  enum { value = 0 };
2062 };
2063 
2064 #define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2065  SHIFT, MLEN, NAME, OP) \
2066  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2067  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
2068  return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2069  }
2070 
2071 HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2072 HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL)
2073 #undef HWY_RVV_COMPRESS
2074 
2075 // ------------------------------ CompressStore
2076 template <class V, class M, class D>
2077 HWY_API size_t CompressStore(const V v, const M mask, const D d,
2078  TFromD<D>* HWY_RESTRICT unaligned) {
2079  StoreU(Compress(v, mask), d, unaligned);
2080  return CountTrue(d, mask);
2081 }
2082 
2083 // ------------------------------ CompressBlendedStore
2084 template <class V, class M, class D>
2085 HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
2086  TFromD<D>* HWY_RESTRICT unaligned) {
2087  const size_t count = CountTrue(d, mask);
2088  detail::StoreN(count, Compress(v, mask), d, unaligned);
2089  return count;
2090 }
2091 
2092 // ================================================== BLOCKWISE
2093 
2094 // ------------------------------ CombineShiftRightBytes
2095 template <size_t kBytes, class D, class V = VFromD<D>>
2096 HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
2097  const Repartition<uint8_t, decltype(d)> d8;
2098  const auto hi8 = BitCast(d8, hi);
2099  const auto lo8 = BitCast(d8, lo);
2100  const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
2101  const auto lo_down = detail::SlideDown(lo8, lo8, kBytes);
2102  const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
2103  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
2104 }
2105 
2106 // ------------------------------ CombineShiftRightLanes
2107 template <size_t kLanes, class D, class V = VFromD<D>>
2108 HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
2109  constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
2110  const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
2111  const auto lo_down = detail::SlideDown(lo, lo, kLanes);
2112  const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
2113  return IfThenElse(is_lo, lo_down, hi_up);
2114 }
2115 
2116 // ------------------------------ Shuffle2301 (ShiftLeft)
2117 template <class V>
2118 HWY_API V Shuffle2301(const V v) {
2119  const DFromV<V> d;
2120  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2121  const Repartition<uint64_t, decltype(d)> du64;
2122  const auto v64 = BitCast(du64, v);
2123  return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
2124 }
2125 
2126 // ------------------------------ Shuffle2103
2127 template <class V>
2128 HWY_API V Shuffle2103(const V v) {
2129  const DFromV<V> d;
2130  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2131  return CombineShiftRightLanes<3>(d, v, v);
2132 }
2133 
2134 // ------------------------------ Shuffle0321
2135 template <class V>
2136 HWY_API V Shuffle0321(const V v) {
2137  const DFromV<V> d;
2138  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2139  return CombineShiftRightLanes<1>(d, v, v);
2140 }
2141 
2142 // ------------------------------ Shuffle1032
2143 template <class V>
2144 HWY_API V Shuffle1032(const V v) {
2145  const DFromV<V> d;
2146  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
2147  return CombineShiftRightLanes<2>(d, v, v);
2148 }
2149 
2150 // ------------------------------ Shuffle01
2151 template <class V>
2152 HWY_API V Shuffle01(const V v) {
2153  const DFromV<V> d;
2154  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
2155  return CombineShiftRightLanes<1>(d, v, v);
2156 }
2157 
2158 // ------------------------------ Shuffle0123
2159 template <class V>
2160 HWY_API V Shuffle0123(const V v) {
2161  return Shuffle2301(Shuffle1032(v));
2162 }
2163 
2164 // ------------------------------ TableLookupBytes
2165 
2166 // Extends or truncates a vector to match the given d.
2167 namespace detail {
2168 
2169 template <typename T, size_t N, int kPow2>
2170 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 3>> v)
2171  -> VFromD<decltype(d)> {
2172  const Simd<T, N, kPow2 - 1> dh;
2173  const Simd<T, N, kPow2 - 2> dhh;
2174  return Ext(d, Ext(dh, Ext(dhh, v)));
2175 }
2176 template <typename T, size_t N, int kPow2>
2177 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 2>> v)
2178  -> VFromD<decltype(d)> {
2179  const Simd<T, N, kPow2 - 1> dh;
2180  return Ext(d, Ext(dh, v));
2181 }
2182 template <typename T, size_t N, int kPow2>
2183 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 1>> v)
2184  -> VFromD<decltype(d)> {
2185  return Ext(d, v);
2186 }
2187 
2188 template <typename T, size_t N, int kPow2>
2189 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2>> v)
2190  -> VFromD<decltype(d)> {
2191  return v;
2192 }
2193 
2194 template <typename T, size_t N, int kPow2>
2195 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 1>> v)
2196  -> VFromD<decltype(d)> {
2197  return Trunc(v);
2198 }
2199 template <typename T, size_t N, int kPow2>
2200 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 2>> v)
2201  -> VFromD<decltype(d)> {
2202  return Trunc(Trunc(v));
2203 }
2204 template <typename T, size_t N, int kPow2>
2205 HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 3>> v)
2206  -> VFromD<decltype(d)> {
2207  return Trunc(Trunc(Trunc(v)));
2208 }
2209 
2210 } // namespace detail
2211 
2212 template <class VT, class VI>
2213 HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
2214  const DFromV<VT> dt; // T=table, I=index.
2215  const DFromV<VI> di;
2216  const Repartition<uint8_t, decltype(dt)> dt8;
2217  const Repartition<uint8_t, decltype(di)> di8;
2218  // Required for producing half-vectors with table lookups from a full vector.
2219  // If we instead run at the LMUL of the index vector, lookups into the table
2220  // would be truncated. Thus we run at the larger of the two LMULs and truncate
2221  // the result vector to the original index LMUL.
2222  constexpr int kPow2T = Pow2(dt8);
2223  constexpr int kPow2I = Pow2(di8);
2224  const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max
2225  const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
2226  const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
2227  auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
2228  // If the table is shorter, wrap around offsets so they do not reference
2229  // undefined lanes in the newly extended vmt.
2230  if (kPow2T < kPow2I) {
2231  offsets = detail::AndS(offsets, Lanes(dt8) - 1);
2232  }
2233  const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
2234  return BitCast(di, detail::ChangeLMUL(di8, out));
2235 }
2236 
2237 template <class VT, class VI>
2238 HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
2239  const DFromV<VI> di;
2240  const Repartition<int8_t, decltype(di)> di8;
2241  const auto idx8 = BitCast(di8, idx);
2242  const auto lookup = TableLookupBytes(vt, idx8);
2243  return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
2244 }
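// Example: as with x86 PSHUFB, an index byte with the MSB set (negative as
// int8_t) produces zero in that output byte; non-negative indices select
// bytes within the corresponding 128-bit block. Illustrative usage
// (hypothetical tag and values):
//   const ScalableTag<uint8_t> d8;
//   const auto table = Iota(d8, 0);          // table bytes 0,1,2,...
//   const auto idx = Set(d8, uint8_t{0x80}); // MSB set => zero output
//   const auto r = TableLookupBytesOr0(table, idx);  // all lanes 0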
2245 
2246 // ------------------------------ Broadcast
2247 template <int kLane, class V>
2248 HWY_API V Broadcast(const V v) {
2249  const DFromV<V> d;
2250  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
2251  auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d));
2252  if (kLane != 0) {
2253  idx = detail::AddS(idx, kLane);
2254  }
2255  return TableLookupLanes(v, idx);
2256 }
2257 
2258 // ------------------------------ ShiftLeftLanes
2259 
2260 template <size_t kLanes, class D, class V = VFromD<D>>
2261 HWY_API V ShiftLeftLanes(const D d, const V v) {
2262  const RebindToSigned<decltype(d)> di;
2263  using TI = TFromD<decltype(di)>;
2264  const auto shifted = detail::SlideUp(v, v, kLanes);
2265  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
2266  const auto idx_mod =
2267  detail::AndS(detail::Iota0(di), detail::LanesPerBlock(di) - 1);
2268  const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
2269  return IfThenZeroElse(clear, shifted);
2270 }
2271 
2272 template <size_t kLanes, class V>
2273 HWY_API V ShiftLeftLanes(const V v) {
2274  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
2275 }
2276 
2277 // ------------------------------ ShiftLeftBytes
2278 
2279 template <int kBytes, class D>
2280 HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
2281  const Repartition<uint8_t, decltype(d)> d8;
2282  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
2283 }
2284 
2285 template <int kBytes, class V>
2286 HWY_API V ShiftLeftBytes(const V v) {
2287  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
2288 }
2289 
2290 // ------------------------------ ShiftRightLanes
2291 template <size_t kLanes, typename T, size_t N, int kPow2,
2292  class V = VFromD<Simd<T, N, kPow2>>>
2293 HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
2294  const RebindToSigned<decltype(d)> di;
2295  using TI = TFromD<decltype(di)>;
2296  // For partial vectors, clear upper lanes so we shift in zeros.
2297  if (N <= 16 / sizeof(T)) {
2298  v = IfThenElseZero(FirstN(d, N), v);
2299  }
2300 
2301  const auto shifted = detail::SlideDown(v, v, kLanes);
2302  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
2303  const size_t lpb = detail::LanesPerBlock(di);
2304  const auto idx_mod = detail::AndS(detail::Iota0(di), lpb - 1);
2305  const auto keep =
2306  detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
2307  return IfThenElseZero(keep, shifted);
2308 }
2309 
2310 // ------------------------------ ShiftRightBytes
2311 template <int kBytes, class D, class V = VFromD<D>>
2312 HWY_API V ShiftRightBytes(const D d, const V v) {
2313  const Repartition<uint8_t, decltype(d)> d8;
2314  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
2315 }
2316 
2317 // ------------------------------ InterleaveLower
2318 
2319 template <class D, class V>
2320 HWY_API V InterleaveLower(D d, const V a, const V b) {
2321  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2322  const RebindToUnsigned<decltype(d)> du;
2323  const auto i = detail::Iota0(du);
2324  const auto idx_mod =
2325  ShiftRight<1>(detail::AndS(i, detail::LanesPerBlock(du) - 1));
2326  const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2327  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2328  return IfThenElse(is_even, TableLookupLanes(a, idx),
2329  TableLookupLanes(b, idx));
2330 }
2331 
2332 template <class V>
2333 HWY_API V InterleaveLower(const V a, const V b) {
2334  return InterleaveLower(DFromV<V>(), a, b);
2335 }
2336 
2337 // ------------------------------ InterleaveUpper
2338 
2339 template <class D, class V>
2340 HWY_API V InterleaveUpper(const D d, const V a, const V b) {
2341  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
2342  const RebindToUnsigned<decltype(d)> du;
2343  const size_t lpb = detail::LanesPerBlock(du);
2344  const auto i = detail::Iota0(du);
2345  const auto idx_mod = ShiftRight<1>(detail::AndS(i, lpb - 1));
2346  const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
2347  const auto idx = detail::AddS(idx_lower, lpb / 2);
2348  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
2349  return IfThenElse(is_even, TableLookupLanes(a, idx),
2350  TableLookupLanes(b, idx));
2351 }
2352 
2353 // ------------------------------ ZipLower
2354 
2355 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2356 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2357  const RepartitionToNarrow<DW> dn;
2358  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2359  return BitCast(dw, InterleaveLower(dn, a, b));
2360 }
2361 
2362 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2363 HWY_API VFromD<DW> ZipLower(V a, V b) {
2364  return BitCast(DW(), InterleaveLower(a, b));
2365 }
2366 
2367 // ------------------------------ ZipUpper
2368 template <class DW, class V>
2369 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2370  const RepartitionToNarrow<DW> dn;
2371  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
2372  return BitCast(dw, InterleaveUpper(dn, a, b));
2373 }
2374 
2375 // ================================================== REDUCE
2376 
2377 // vector = f(vector, zero_m1)
2378 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2379  MLEN, NAME, OP) \
2380  template <class D> \
2381  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2382  NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
2383  return Set(d, GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
2384  v0, v, v0, Lanes(d)))); \
2385  }
2386 
2387 // ------------------------------ SumOfLanes
2388 
2389 namespace detail {
2390 HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
2391 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
2392 } // namespace detail
2393 
2394 template <class D>
2395 HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
2396  const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
2397  return detail::RedSum(d, v, v0);
2398 }
2399 
2400 // ------------------------------ MinOfLanes
2401 namespace detail {
2402 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
2403 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
2404 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
2405 } // namespace detail
2406 
2407 template <class D>
2408 HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
2409  using T = TFromD<D>;
2410  const ScalableTag<T> d1; // always m1
2411  const auto neutral = Set(d1, HighestValue<T>());
2412  return detail::RedMin(d, v, neutral);
2413 }
2414 
2415 // ------------------------------ MaxOfLanes
2416 namespace detail {
2417 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
2418 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
2419 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
2420 } // namespace detail
2421 
2422 template <class D>
2423 HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
2424  using T = TFromD<D>;
2425  const ScalableTag<T> d1; // always m1
2426  const auto neutral = Set(d1, LowestValue<T>());
2427  return detail::RedMax(d, v, neutral);
2428 }
2429 
2430 #undef HWY_RVV_REDUCE
2431 
2432 // ================================================== Ops with dependencies
2433 
2434 // ------------------------------ PopulationCount (ShiftRight)
2435 
2436 // Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot.
2437 template <typename V, class D = DFromV<V>, HWY_IF_LANES_ARE(uint8_t, V),
2438  hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
2439 HWY_API V PopulationCount(V v) {
2440  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2441  v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
2442  v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
2443  return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
2444 }
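// Worked example of the bit trick above for one byte, v = 0xFF:
//   v - ((v >> 1) & 0x55)          = 0xFF - 0x55 = 0xAA (bit pairs: 2,2,2,2)
//   ((v >> 2) & 0x33) + (v & 0x33) = 0x22 + 0x22 = 0x44 (nibbles: 4,4)
//   (v + (v >> 4)) & 0x0F          = 0x48 & 0x0F = 8 set bits.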
2445 
2446 // ------------------------------ LoadDup128
2447 
2448 template <class D>
2449 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
2450  const auto loaded = Load(d, p);
2451  // Broadcast the first block
2452  const auto idx = detail::AndS(detail::Iota0(d), detail::LanesPerBlock(d) - 1);
2453  return TableLookupLanes(loaded, idx);
2454 }
2455 
2456 // ------------------------------ LoadMaskBits
2457 
2458 // Support all combinations of T and SHIFT(LMUL) without explicit overloads for
2459 // each. First overload for MLEN=1..64.
2460 namespace detail {
2461 
2462 // Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
2463 // increases with lane size and decreases for increasing LMUL. Cap at 64, the
2464 // largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
2465 // e.g. vuint16mf8_t: (8*2 << 3) == 128.
2466 template <class D>
2467 using MaskTag = hwy::SizeTag<HWY_MIN(
2468  64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -Pow2(D())))>;
2469 
2470 #define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2471  HWY_INLINE HWY_RVV_M(MLEN) \
2472  NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
2473  return OP##_v_b##MLEN(bits, N); \
2474  }
2475 HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, vle1)
2476 #undef HWY_RVV_LOAD_MASK_BITS
2477 } // namespace detail
2478 
2479 template <class D, class MT = detail::MaskTag<D>>
2480 HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
2481  -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
2482  return detail::LoadMaskBits(MT(), bits, Lanes(d));
2483 }
2484 
2485 // ------------------------------ StoreMaskBits
2486 #define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
2487  template <class D> \
2488  HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \
2489  const size_t N = Lanes(d); \
2490  OP##_v_b##MLEN(bits, m, N); \
2491  /* Non-full byte, need to clear the undefined upper bits. */ \
2492  /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
2493  constexpr bool kLessThan8 = \
2494  detail::ScaleByPower(16 / sizeof(TFromD<D>), Pow2(d)) < 8; \
2495  if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
2496  const int mask = (1 << N) - 1; \
2497  bits[0] = static_cast<uint8_t>(bits[0] & mask); \
2498  } \
2499  return (N + 7) / 8; \
2500  }
2501 HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, vse1)
2502 #undef HWY_RVV_STORE_MASK_BITS
2503 
2504 // ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
2505 
2506 template <class V>
2507 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2508  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2509 }
2510 
2511 template <class D>
2512 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2513  D d, TFromD<D>* HWY_RESTRICT unaligned) {
2514  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2515 }
2516 
2517 // ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
2518 
2519 // Disallow for 8-bit because Iota is likely to overflow.
2520 template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2521 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2522  const RebindToSigned<D> di;
2523  using TI = TFromD<decltype(di)>;
2524  return RebindMask(
2525  d, detail::LtS(BitCast(di, detail::Iota0(d)), static_cast<TI>(n)));
2526 }
2527 
2528 template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
2529 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
2530  const auto zero = Zero(d);
2531  const auto one = Set(d, 1);
2532  return Eq(detail::SlideUp(one, zero, n), one);
2533 }
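// Example: with n = 3 and 8 lanes, SlideUp(one, zero, 3) keeps lanes [0, 3)
// from `one` and fills the rest from `zero`, so Eq(.., one) is exactly the
// first-3 mask. This avoids the Iota-based path, whose 8-bit lane values
// would wrap for VLMAX > 256.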
2534 
2535 // ------------------------------ Neg (Sub)
2536 
2537 template <class V, HWY_IF_SIGNED_V(V)>
2538 HWY_API V Neg(const V v) {
2539  return detail::ReverseSubS(v, 0);
2540 }
2541 
2542 // vector = f(vector), but argument is repeated
2543 #define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2544  SHIFT, MLEN, NAME, OP) \
2545  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2546  return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT)); \
2547  }
2548 
2549 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
2550 
2551 // ------------------------------ Abs (Max, Neg)
2552 
2553 template <class V, HWY_IF_SIGNED_V(V)>
2554 HWY_API V Abs(const V v) {
2555  return Max(v, Neg(v));
2556 }
2557 
2558 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)
2559 
2560 #undef HWY_RVV_RETV_ARGV2
2561 
2562 // ------------------------------ AbsDiff (Abs, Sub)
2563 template <class V>
2564 HWY_API V AbsDiff(const V a, const V b) {
2565  return Abs(Sub(a, b));
2566 }
2567 
2568 // ------------------------------ Round (NearestInt, ConvertTo, CopySign)
2569 
2570 // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
2571 // a dedicated instruction for that. Rounding to integer and converting back to
2572 // float is correct except when the input magnitude is large, in which case the
2573 // input was already an integer (its exponent exceeds the mantissa width).
2574 
2575 namespace detail {
2576 enum RoundingModes { kNear, kTrunc, kDown, kUp };
2577 
2578 template <class V>
2579 HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
2580  return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
2581 }
2582 
2583 } // namespace detail
2584 
2585 template <class V>
2586 HWY_API V Round(const V v) {
2587  const DFromV<V> df;
2588 
2589  const auto integer = NearestInt(v); // round using current mode
2590  const auto int_f = ConvertTo(df, integer);
2591 
2592  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2593 }
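// Example: Round(2.5f) == 2.0f (ties to even under the default rounding
// mode), whereas 1.0E20f exceeds MantissaEnd<float>() and is returned as-is.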
2594 
2595 // ------------------------------ Trunc (ConvertTo)
2596 template <class V>
2597 HWY_API V Trunc(const V v) {
2598  const DFromV<V> df;
2599  const RebindToSigned<decltype(df)> di;
2600 
2601  const auto integer = ConvertTo(di, v); // round toward 0
2602  const auto int_f = ConvertTo(df, integer);
2603 
2604  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
2605 }
2606 
2607 // ------------------------------ Ceil
2608 template <class V>
2609 HWY_API V Ceil(const V v) {
2610  asm volatile("fsrm %0" ::"r"(detail::kUp));
2611  const auto ret = Round(v);
2612  asm volatile("fsrm %0" ::"r"(detail::kNear));
2613  return ret;
2614 }
2615 
2616 // ------------------------------ Floor
2617 template <class V>
2618 HWY_API V Floor(const V v) {
2619  asm volatile("fsrm %0" ::"r"(detail::kDown));
2620  const auto ret = Round(v);
2621  asm volatile("fsrm %0" ::"r"(detail::kNear));
2622  return ret;
2623 }
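// Note: the fsrm instruction writes the dynamic rounding-mode CSR, so Ceil
// and Floor temporarily switch modes around Round and then restore
// round-to-nearest-even (detail::kNear).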
2624 
2625 // ------------------------------ Iota (ConvertTo)
2626 
2627 template <class D, HWY_IF_UNSIGNED_D(D)>
2628 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2629  return detail::AddS(detail::Iota0(d), first);
2630 }
2631 
2632 template <class D, HWY_IF_SIGNED_D(D)>
2633 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2634  const RebindToUnsigned<D> du;
2635  return detail::AddS(BitCast(d, detail::Iota0(du)), first);
2636 }
2637 
2638 template <class D, HWY_IF_FLOAT_D(D)>
2639 HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2640  const RebindToUnsigned<D> du;
2641  const RebindToSigned<D> di;
2642  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
2643 }
2644 
2645 // ------------------------------ MulEven/Odd (Mul, OddEven)
2646 
2647 template <class V, HWY_IF_LANE_SIZE_V(V, 4), class D = DFromV<V>,
2648  class DW = RepartitionToWide<D>>
2649 HWY_API VFromD<DW> MulEven(const V a, const V b) {
2650  const auto lo = Mul(a, b);
2651  const auto hi = detail::MulHigh(a, b);
2652  return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
2653 }
2654 
2655 // There is no 64x64 vwmul.
2656 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2657 HWY_INLINE V MulEven(const V a, const V b) {
2658  const auto lo = detail::Mul(a, b);
2659  const auto hi = detail::MulHigh(a, b);
2660  return OddEven(detail::Slide1Up(hi), lo);
2661 }
2662 
2663 template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
2664 HWY_INLINE V MulOdd(const V a, const V b) {
2665  const auto lo = detail::Mul(a, b);
2666  const auto hi = detail::MulHigh(a, b);
2667  return OddEven(hi, detail::Slide1Down(lo));
2668 }
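// Example: for 64-bit lanes a = {a0,a1}, b = {b0,b1}, MulEven returns
// {lo64(a0*b0), hi64(a0*b0)} and MulOdd returns {lo64(a1*b1), hi64(a1*b1)},
// i.e. the full 128-bit product of the even (resp. odd) lane pair.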
2669 
2670 // ------------------------------ ReorderDemote2To (OddEven)
2671 
2672 template <size_t N, int kPow2>
2673 HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
2674  Simd<bfloat16_t, N, kPow2> dbf16,
2675  VFromD<RepartitionToWide<decltype(dbf16)>> a,
2676  VFromD<RepartitionToWide<decltype(dbf16)>> b) {
2677  const RebindToUnsigned<decltype(dbf16)> du16;
2678  const RebindToUnsigned<DFromV<decltype(a)>> du32;
2679  const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
2680  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2681 }
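// Example: bf16 is the upper half of an f32 bit pattern, so ShiftRight<16>
// moves b's bf16 payloads into even u16 positions; OddEven then interleaves
// them with a's payloads, which already occupy the odd (upper) u16 halves.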
2682 
2683 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2684 
2685 template <class DF>
2686 using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
2687 
2688 template <size_t N, int kPow2>
2689 HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2690  VFromD<DU16FromDF<decltype(df32)>> a,
2691  VFromD<DU16FromDF<decltype(df32)>> b,
2692  const VFromD<decltype(df32)> sum0,
2693  VFromD<decltype(df32)>& sum1)
2694  -> VFromD<decltype(df32)> {
2695  const DU16FromDF<decltype(df32)> du16;
2696  const RebindToUnsigned<decltype(df32)> du32;
2697  using VU32 = VFromD<decltype(du32)>;
2698  const VFromD<decltype(du16)> zero = Zero(du16);
2699  const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
2700  const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
2701  const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
2702  const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
2703  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2704  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2705 }
2706 
2707 // ------------------------------ Lt128
2708 
2709 template <class D>
2710 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
2711  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2712  // Truth table of Eq and Lt for the Hi and Lo u64 halves.
2713  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
2714  // =H =L cH cL | out = cH | (=H & cL)
2715  // 0 0 0 0 | 0
2716  // 0 0 0 1 | 0
2717  // 0 0 1 0 | 1
2718  // 0 0 1 1 | 1
2719  // 0 1 0 0 | 0
2720  // 0 1 0 1 | 0
2721  // 0 1 1 0 | 1
2722  // 1 0 0 0 | 0
2723  // 1 0 0 1 | 1
2724  // 1 1 0 0 | 0
2725  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
2726  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
2727  // Shift leftward so L can influence H.
2728  const VFromD<D> ltLx = detail::Slide1Up(ltHL);
2729  const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
2730  // Replicate H to its neighbor.
2731  return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
2732 }
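// Worked example: with a = {aL,aH} and b = {bL,bH} (H in the upper lane), the
// upper lane of vecHx evaluates ltH | (eqH & ltL), i.e. the lower-lane verdict
// breaks ties; OddEven then copies that verdict into both lanes of the mask.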
2733 
2734 // ------------------------------ Min128, Max128 (Lt128)
2735 
2736 template <class D>
2737 HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
2738  const VFromD<D> aXH = detail::Slide1Down(a);
2739  const VFromD<D> bXH = detail::Slide1Down(b);
2740  const VFromD<D> minHL = Min(a, b);
2741  const MFromD<D> ltXH = Lt(aXH, bXH);
2742  const MFromD<D> eqXH = Eq(aXH, bXH);
2743  // If the upper lane is the decider, take lo from the same reg.
2744  const VFromD<D> lo = IfThenElse(ltXH, a, b);
2745  // The upper lane is just minHL; if they are equal, we also need to use the
2746  // actual min of the lower lanes.
2747  return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
2748 }
2749 
2750 template <class D>
2751 HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
2752  const VFromD<D> aXH = detail::Slide1Down(a);
2753  const VFromD<D> bXH = detail::Slide1Down(b);
2754  const VFromD<D> maxHL = Max(a, b);
2755  const MFromD<D> ltXH = Lt(aXH, bXH);
2756  const MFromD<D> eqXH = Eq(aXH, bXH);
2757  // If the upper lane is the decider, take lo from the same reg.
2758  const VFromD<D> lo = IfThenElse(ltXH, b, a);
2759  // The upper lane is just maxHL; if they are equal, we also need to use the
2760  // actual max of the lower lanes.
2761  return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
2762 }
2763 
2764 // ================================================== END MACROS
2765 namespace detail { // for code folding
2766 #undef HWY_RVV_AVL
2767 #undef HWY_RVV_D
2768 #undef HWY_RVV_FOREACH
2769 #undef HWY_RVV_FOREACH_08_ALL
2770 #undef HWY_RVV_FOREACH_08_ALL_VIRT
2771 #undef HWY_RVV_FOREACH_08_DEMOTE
2772 #undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
2773 #undef HWY_RVV_FOREACH_08_EXT
2774 #undef HWY_RVV_FOREACH_08_EXT_VIRT
2775 #undef HWY_RVV_FOREACH_08_TRUNC
2776 #undef HWY_RVV_FOREACH_08_VIRT
2777 #undef HWY_RVV_FOREACH_16_ALL
2778 #undef HWY_RVV_FOREACH_16_ALL_VIRT
2779 #undef HWY_RVV_FOREACH_16_DEMOTE
2780 #undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
2781 #undef HWY_RVV_FOREACH_16_EXT
2782 #undef HWY_RVV_FOREACH_16_EXT_VIRT
2783 #undef HWY_RVV_FOREACH_16_TRUNC
2784 #undef HWY_RVV_FOREACH_16_VIRT
2785 #undef HWY_RVV_FOREACH_32_ALL
2786 #undef HWY_RVV_FOREACH_32_ALL_VIRT
2787 #undef HWY_RVV_FOREACH_32_DEMOTE
2788 #undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
2789 #undef HWY_RVV_FOREACH_32_EXT
2790 #undef HWY_RVV_FOREACH_32_EXT_VIRT
2791 #undef HWY_RVV_FOREACH_32_TRUNC
2792 #undef HWY_RVV_FOREACH_32_VIRT
2793 #undef HWY_RVV_FOREACH_64_ALL
2794 #undef HWY_RVV_FOREACH_64_ALL_VIRT
2795 #undef HWY_RVV_FOREACH_64_DEMOTE
2796 #undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
2797 #undef HWY_RVV_FOREACH_64_EXT
2798 #undef HWY_RVV_FOREACH_64_EXT_VIRT
2799 #undef HWY_RVV_FOREACH_64_TRUNC
2800 #undef HWY_RVV_FOREACH_64_VIRT
2801 #undef HWY_RVV_FOREACH_B
2802 #undef HWY_RVV_FOREACH_F
2803 #undef HWY_RVV_FOREACH_F16
2804 #undef HWY_RVV_FOREACH_F32
2805 #undef HWY_RVV_FOREACH_F3264
2806 #undef HWY_RVV_FOREACH_F64
2807 #undef HWY_RVV_FOREACH_I
2808 #undef HWY_RVV_FOREACH_I08
2809 #undef HWY_RVV_FOREACH_I16
2810 #undef HWY_RVV_FOREACH_I163264
2811 #undef HWY_RVV_FOREACH_I32
2812 #undef HWY_RVV_FOREACH_I64
2813 #undef HWY_RVV_FOREACH_U
2814 #undef HWY_RVV_FOREACH_U08
2815 #undef HWY_RVV_FOREACH_U16
2816 #undef HWY_RVV_FOREACH_U163264
2817 #undef HWY_RVV_FOREACH_U32
2818 #undef HWY_RVV_FOREACH_U64
2819 #undef HWY_RVV_FOREACH_UI
2820 #undef HWY_RVV_FOREACH_UI08
2821 #undef HWY_RVV_FOREACH_UI16
2822 #undef HWY_RVV_FOREACH_UI163264
2823 #undef HWY_RVV_FOREACH_UI32
2824 #undef HWY_RVV_FOREACH_UI3264
2825 #undef HWY_RVV_FOREACH_UI64
2826 #undef HWY_RVV_M
2827 #undef HWY_RVV_RETV_ARGV
2828 #undef HWY_RVV_RETV_ARGVS
2829 #undef HWY_RVV_RETV_ARGVV
2830 #undef HWY_RVV_T
2831 #undef HWY_RVV_V
2832 } // namespace detail
2833 // NOLINTNEXTLINE(google-readability-namespace-comments)
2834 } // namespace HWY_NAMESPACE
2835 } // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:128
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_API
Definition: base.h:122
#define HWY_MIN(a, b)
Definition: base.h:127
#define HWY_INLINE
Definition: base.h:64
#define HWY_DASSERT(condition)
Definition: base.h:193
HWY_INLINE VFromD< DU > BitCastToUnsigned(V v)
Definition: rvv-inl.h:655
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition: arm_sve-inl.h:1664
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1357
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:1503
svbool_t FirstNPerBlock(D d)
Definition: arm_sve-inl.h:1670
HWY_INLINE auto ChangeLMUL(Simd< T, N, kPow2 > d, VFromD< Simd< T, N, kPow2 - 3 >> v) -> VFromD< decltype(d)>
Definition: rvv-inl.h:2170
HWY_INLINE VFromD< DU > Iota0(const D)
Definition: rvv-inl.h:676
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:115
constexpr bool IsSupportedLMUL(D d)
Definition: rvv-inl.h:1811
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:601
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:574
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4467
d
Definition: rvv-inl.h:1656
sseg3 sseg3 sseg4 mf2
Definition: rvv-inl.h:1432
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1648
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4038
decltype(MaskFromVec(Zero(D()))) MFromD
Definition: rvv-inl.h:1120
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:3709
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1688
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4003
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1225
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:3672
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:767
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3531
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5252
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3581
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3547
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2878
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3769
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5244
RepartitionToNarrow< RebindToUnsigned< DF > > DU16FromDF
Definition: rvv-inl.h:2686
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2999
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5257
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3102
_
Definition: rvv-inl.h:1405
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4761
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1290
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2416
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1604
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:210
HWY_API bool AllTrue(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4790
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4437
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1665
V Shl(V a, V b)
Definition: arm_neon-inl.h:5235
HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f, _DEMOTE_VIRT) template< size_t N > HWY_API vint32mf2_t DemoteTo(Simd< int32_t
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5261
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1264
StoreInterleaved3
Definition: rvv-inl.h:1405
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3903
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1957
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1995
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1675
HWY_RVV_STORE3(uint, u, 8, _, _, mf8, _, _, -3, 64, StoreInterleaved3, sseg3) HWY_RVV_STORE3(uint
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3842
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:201
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4284
HWY_INLINE constexpr HWY_MAYBE_UNUSED int Pow2(D)
Definition: ops/shared-inl.h:247
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4159
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:457
sseg3 sseg3 StoreInterleaved4
Definition: rvv-inl.h:1428
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3541
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3869
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:904
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4119
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4060
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2952
typename D::Twice Twice
Definition: ops/shared-inl.h:220
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:199
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2748
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3688
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1505
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
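FirstN, MaskedLoad and BlendedStore compose into the standard remainder-handling idiom: process full vectors, then one masked iteration for the tail. A sketch under the same alignment assumptions as above (AddTail is a hypothetical name):

#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical example: out[i] += 1 for i < n; n need not be a multiple of
// the vector length. Assumes out is vector-aligned.
void AddTail(float* out, size_t n) {
  const hn::ScalableTag<float> d;
  const auto one = hn::Set(d, 1.0f);
  size_t i = 0;
  for (; i + hn::Lanes(d) <= n; i += hn::Lanes(d)) {
    hn::Store(hn::Add(hn::Load(d, out + i), one), d, out + i);
  }
  if (i != n) {  // remainder: only the first n - i lanes are active
    const auto m = hn::FirstN(d, n - i);
    const auto v = hn::MaskedLoad(m, d, out + i);
    hn::BlendedStore(hn::Add(v, one), m, d, out + i);
  }
}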
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4742
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:1681
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4753
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:3987
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1711
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4771
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3419
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3490
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2909
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1344
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1656
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1735
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
constexpr size_t MLenFromD(Simd< T, N, kPow2 >)
Definition: rvv-inl.h:43
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2895
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:212
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3373
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4045
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3091
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5203
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3461
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3513
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4445
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:757
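Zero, Set and Iota are the usual vector initializers; a small sketch (Initializers is a hypothetical name):

#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void Initializers() {
  const hn::ScalableTag<int32_t> d;
  const auto zeros = hn::Zero(d);    // all lanes 0
  const auto fives = hn::Set(d, 5);  // all lanes 5
  const auto ramp = hn::Iota(d, 0);  // lanes 0, 1, 2, ...
  (void)zeros; (void)fives; (void)ramp;
}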
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:510
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:345
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4510
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3535
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1917
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1175
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:484
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:5172
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1498
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1724
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3895
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:162
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
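BitCast reinterprets lane bits without conversion, typically paired with the RebindToUnsigned alias listed above to view lanes as their unsigned integer twins. A sketch (FloatBits is a hypothetical name):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical example: view float lanes as their raw uint32_t bit patterns.
template <class D>
hn::VFromD<hn::RebindToUnsigned<D>> FloatBits(D /*d*/, hn::VFromD<D> v) {
  const hn::RebindToUnsigned<D> du;  // same lane count, unsigned lane type
  return hn::BitCast(du, v);
}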
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1211
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4231
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3777
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:747
HWY_INLINE constexpr HWY_MAYBE_UNUSED size_t MaxLanes(D)
Definition: ops/shared-inl.h:271
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5077
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:3656
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4267
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1718
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5208
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:339
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5266
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2939
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3413
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4249
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1323
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:1778
V Shr(V a, V b)
Definition: arm_neon-inl.h:5239
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:743
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3285
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5038
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3553
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2867
typename D::Half Half
Definition: ops/shared-inl.h:216
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4441
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3114
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5248
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1404
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2606
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5052
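Compress and CompressStore pack the lanes selected by a mask to the front; CompressStore additionally writes them out and returns the count. A sketch of left-packing negative values (StoreNegatives is a hypothetical name):

#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical example: copy the negative lanes of one vector to out,
// returning how many were written. Assumes in is vector-aligned and out
// has room for Lanes(d) values.
size_t StoreNegatives(const float* in, float* out) {
  const hn::ScalableTag<float> d;
  const auto v = hn::Load(d, in);
  const auto m = hn::Lt(v, hn::Zero(d));   // mask of negative lanes
  return hn::CompressStore(v, m, d, out);  // returns CountTrue(d, m)
}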
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:935
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1455
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4053
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5230
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:852
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5226
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3430
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2397
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2426
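PromoteTo and DemoteTo convert between lane widths: the destination tag comes first, and the Rebind alias derives a half-width tag with the same lane count. A sketch (WidenNarrow is a hypothetical name):

#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical example: widen int16_t lanes to int32_t and (saturating)
// narrow them back. Assumes aligned pointers.
void WidenNarrow(const int16_t* in, int16_t* out) {
  const hn::ScalableTag<int32_t> d32;
  const hn::Rebind<int16_t, decltype(d32)> d16;  // same lanes, half width
  const auto wide = hn::PromoteTo(d32, hn::Load(d16, in));
  // ... compute at 32-bit precision here ...
  hn::Store(hn::DemoteTo(d16, wide), d16, out);
}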
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:558
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3146
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1376
HWY_API V Trunc(const V v)
Definition: rvv-inl.h:2597
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3120
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition: rvv-inl.h:2108
typename D::T TFromD
Definition: ops/shared-inl.h:192
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1352
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:278
hwy
Definition: aligned_allocator.h:27
constexpr T MantissaEnd()
Definition: base.h:570
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:273
constexpr HWY_API bool IsSame()
Definition: base.h:286
constexpr size_t CeilLog2(TI x)
Definition: base.h:700
constexpr HWY_API bool IsSigned()
Definition: base.h:483
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
#define HWY_IF_LANE_SIZE_D(D, bytes)
Definition: ops/shared-inl.h:227
#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:314
#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1364
HWY_AFTER_NAMESPACE()
#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition: rvv-inl.h:1475
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:531
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:977
#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:309
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:472
#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1267
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:269
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1335
#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:323
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:574
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:263
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:340
#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1293
#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition: rvv-inl.h:1482
#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:2378
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:590
#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1849
#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1704
#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1306
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:503
#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:293
#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:869
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:949
#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1533
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:379
#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1466
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:860
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:805
#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:297
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1132
#define HWY_RVV_IF_POW2_IN(D, min, max)
Definition: rvv-inl.h:39
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1080
#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1938
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP)
Definition: rvv-inl.h:59
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:417
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:281
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:271
#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:319
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1049
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:410
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:364
#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1205
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:257
#define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1745
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:2064
HWY_BEFORE_NAMESPACE()
#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:349
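The HWY_RVV_FOREACH* and HWY_RVV_* entries above are X macros: a FOREACH list macro invokes a per-op macro once per (type, LMUL) combination, token-pasting suffixes so every vector shape gets a definition. A generic sketch of the pattern with hypothetical names, not the actual Highway definitions:

// Hypothetical X-macro sketch: the list macro calls X_MACRO once per entry.
#define MY_FOREACH(X_MACRO, NAME) \
  X_MACRO(float, f32, NAME)       \
  X_MACRO(double, f64, NAME)

// Per-entry macro: defines one function, token-pasting the type suffix.
#define MY_RETV_ARGVV(T, CHAR, NAME) \
  inline T NAME##_##CHAR(T a, T b) { return a + b; }

MY_FOREACH(MY_RETV_ARGVV, Add)  // expands to Add_f32 and Add_f64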
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:436
#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1638
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:494
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:283
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:545
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1061
#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:334
#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1231
#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1553
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:261
#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:665
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:606
#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:305
#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1100
#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1187
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:516
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:560
#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1280
#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1864
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:986
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:328
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:345
#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1686
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:425
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:259
#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition: rvv-inl.h:1217
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
Definition: rvv-inl.h:267
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
@ value
Definition: arm_neon-inl.h:4798