Grok 10.0.1
generic_ops-inl.h
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Target-independent types/functions defined after target-specific ops.
17
18// Relies on the external include guard in highway.h.
20namespace hwy {
21namespace HWY_NAMESPACE {
22
23// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
24template <class V>
25using LaneType = decltype(GetLane(V()));
26
27// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
28// type of functions that do not take a vector argument, or as an argument type
29// if the function only has a template argument for D, or for explicit type
30// names instead of auto. This may be a built-in type.
31template <class D>
32using Vec = decltype(Zero(D()));
33
34// Mask type. Useful as the return type of functions that do not take a mask
35// argument, or as an argument type if the function only has a template argument
36// for D, or for explicit type names instead of auto.
37template <class D>
38using Mask = decltype(MaskFromVec(Zero(D())));
39
40// Returns the closest value to v within [lo, hi].
41template <class V>
42HWY_API V Clamp(const V v, const V lo, const V hi) {
43 return Min(Max(lo, v), hi);
44}
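// Usage sketch (illustrative, not part of the original header): clamping all
// lanes to [0, 1], assuming a float descriptor `d` and a vector `v` of it:
//   const auto clamped = Clamp(v, Zero(d), Set(d, 1.0f));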
45
46// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
47// and RVV has its own implementation of -Lanes.
48#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
49
50template <size_t kLanes, class D, class V = VFromD<D>>
51HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
52 constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
53 static_assert(kBytes < 16, "Shift count is per-block");
54 return CombineShiftRightBytes<kBytes>(d, hi, lo);
55}
56
57#endif
58
59// Returns lanes with the most significant bit set and all other bits zero.
60template <class D>
61HWY_API Vec<D> SignBit(D d) {
62 const RebindToUnsigned<decltype(d)> du;
63 return BitCast(d, Set(du, SignMask<TFromD<D>>()));
64}
65
66// Returns quiet NaN.
67template <class D>
68HWY_API Vec<D> NaN(D d) {
69 const RebindToSigned<D> di;
70 // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
71 // mantissa MSB (to indicate quiet) would be sufficient.
72 return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
73}
74
75// Returns positive infinity.
76template <class D>
77HWY_API Vec<D> Inf(D d) {
78 const RebindToUnsigned<D> du;
79 using T = TFromD<D>;
80 using TU = TFromD<decltype(du)>;
81 const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
82 return BitCast(d, Set(du, max_x2 >> 1));
83}
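// Usage sketch (illustrative; assumes a float descriptor `d` and vector `v`):
//   const auto abs = AndNot(SignBit(d), v);  // clear the sign bit -> |v|
//   const auto inf = Inf(d);                 // +infinity in every lane
//   const auto nan = NaN(d);                 // quiet NaN in every lane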
84
85// ------------------------------ SafeFillN
86
87template <class D, typename T = TFromD<D>>
88HWY_API void SafeFillN(const size_t num, const T value, D d,
89 T* HWY_RESTRICT to) {
90#if HWY_MEM_OPS_MIGHT_FAULT
91 (void)d;
92 for (size_t i = 0; i < num; ++i) {
93 to[i] = value;
94 }
95#else
96 BlendedStore(Set(d, value), FirstN(d, num), d, to);
97#endif
98}
99
100// ------------------------------ SafeCopyN
101
102template <class D, typename T = TFromD<D>>
103HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
104 T* HWY_RESTRICT to) {
105#if HWY_MEM_OPS_MIGHT_FAULT
106 (void)d;
107 for (size_t i = 0; i < num; ++i) {
108 to[i] = from[i];
109 }
110#else
111 const Mask<D> mask = FirstN(d, num);
112 BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
113#endif
114}
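// Usage sketch (illustrative): handling the final, partial vector of an array
// without touching more than `remaining` elements; `in`, `out` and `remaining`
// are hypothetical caller variables with remaining < Lanes(d):
//   SafeCopyN(remaining, d, in, out);    // copies `remaining` elements
//   SafeFillN(remaining, T{0}, d, out);  // or: writes `remaining` copies of 0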
115
116// "Include guard": skip if native instructions are available. The generic
117// implementation is currently shared between x86_* and wasm_*, and is too large
118// to duplicate.
119
120#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
121#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
122#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
123#else
124#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
125#endif
126
127// ------------------------------ LoadInterleaved2
128
129template <typename T, size_t N, class V>
130HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
131 V& v0, V& v1) {
132 const V A = LoadU(d, unaligned + 0 * N); // v1[1] v0[1] v1[0] v0[0]
133 const V B = LoadU(d, unaligned + 1 * N);
134 v0 = ConcatEven(d, B, A);
135 v1 = ConcatOdd(d, B, A);
136}
137
138template <typename T, class V>
139HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
140 V& v0, V& v1) {
141 v0 = LoadU(d, unaligned + 0);
142 v1 = LoadU(d, unaligned + 1);
143}
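// Usage sketch (illustrative): de-interleaving pairs (e.g. re/im) from a
// hypothetical pointer `interleaved` holding 2 * Lanes(d) elements:
//   Vec<decltype(d)> even, odd;
//   LoadInterleaved2(d, interleaved, even, odd);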
144
145// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
146
147namespace detail {
148
149// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
150template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
151HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
152 const T* HWY_RESTRICT unaligned, V& A, V& B,
153 V& C) {
154 A = LoadU(d, unaligned + 0 * N);
155 B = LoadU(d, unaligned + 1 * N);
156 C = LoadU(d, unaligned + 2 * N);
157}
158
159} // namespace detail
160
161template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
162HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
163 V& v0, V& v1, V& v2) {
164 const RebindToUnsigned<decltype(d)> du;
165 // Compact notation so these fit on one line: 12 := v1[2].
166 V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
167 V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
168 V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
169 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
170 // Compress all lanes belonging to v0 into consecutive lanes.
171 constexpr uint8_t Z = 0x80;
172 alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
173 Z, Z, Z, Z, Z, Z, Z, Z};
174 alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
175 8, 11, 14, Z, Z, Z, Z, Z};
176 alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
177 Z, Z, Z, 1, 4, 7, 10, 13};
178 alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
179 Z, Z, Z, Z, Z, Z, Z, Z};
180 alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
181 9, 12, 15, Z, Z, Z, Z, Z};
182 alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
183 Z, Z, Z, 2, 5, 8, 11, 14};
184 alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
185 Z, Z, Z, Z, Z, Z, Z, Z};
186 alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
187 10, 13, Z, Z, Z, Z, Z, Z};
188 alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
189 Z, Z, 0, 3, 6, 9, 12, 15};
190 const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
191 const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
192 const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
193 const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
194 const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
195 const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
196 const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
197 const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
198 const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
199 v0 = Or3(v0L, v0M, v0U);
200 v1 = Or3(v1L, v1M, v1U);
201 v2 = Or3(v2L, v2M, v2U);
202}
203
204// 8-bit lanes x8
205template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
206 HWY_IF_LANES_PER_BLOCK(T, N, 8)>
207HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
208 V& v0, V& v1, V& v2) {
209 const RebindToUnsigned<decltype(d)> du;
210 V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
211 V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
212 V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
213 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
214 // Compress all lanes belonging to v0 into consecutive lanes.
215 constexpr uint8_t Z = 0x80;
216 alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
217 alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
218 alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
219 alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
220 alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
221 alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
222 alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
223 alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
224 alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
225 const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
226 const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
227 const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
228 const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
229 const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
230 const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
231 const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
232 const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
233 const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
234 v0 = Or3(v0L, v0M, v0U);
235 v1 = Or3(v1L, v1M, v1U);
236 v2 = Or3(v2L, v2M, v2U);
237}
238
239// 16-bit lanes x8
240template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
241 HWY_IF_LANES_PER_BLOCK(T, N, 8)>
242HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
243 V& v0, V& v1, V& v2) {
244 const RebindToUnsigned<decltype(d)> du;
245 V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
246 V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
247 V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
248 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
249 // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
250 // but each element of the array contains two byte indices for a lane.
251 constexpr uint16_t Z = 0x8080;
252 alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
253 Z, Z, Z, Z};
254 alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
255 0x0908, 0x0F0E, Z, Z};
256 alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
257 Z, Z, 0x0504, 0x0B0A};
258 alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
259 Z, Z, Z, Z};
260 alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
261 0x0B0A, Z, Z, Z};
262 alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
263 Z, 0x0100, 0x0706, 0x0D0C};
264 alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
265 Z, Z, Z, Z};
266 alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
267 0x0D0C, Z, Z, Z};
268 alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
269 Z, 0x0302, 0x0908, 0x0F0E};
270 const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
271 const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
272 const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
273 const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
274 const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
275 const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
276 const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
277 const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
278 const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
279 v0 = Or3(v0L, v0M, v0U);
280 v1 = Or3(v1L, v1M, v1U);
281 v2 = Or3(v2L, v2M, v2U);
282}
283
284template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
285HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
286 V& v0, V& v1, V& v2) {
287 V A; // v0[1] v2[0] v1[0] v0[0]
288 V B; // v1[2] v0[2] v2[1] v1[1]
289 V C; // v2[3] v1[3] v0[3] v2[2]
290 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
291
292 const V vxx_02_03_xx = OddEven(C, B);
293 v0 = detail::Shuffle1230(A, vxx_02_03_xx);
294
295 // Shuffle2301 takes the upper/lower halves of the output from one input, so
296 // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
297 // OddEven because it may have higher throughput than Shuffle.
298 const V vxx_xx_10_11 = OddEven(A, B);
299 const V v12_13_xx_xx = OddEven(B, C);
300 v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);
301
302 const V vxx_20_21_xx = OddEven(B, A);
303 v2 = detail::Shuffle3012(vxx_20_21_xx, C);
304}
305
306template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
307HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
308 V& v0, V& v1, V& v2) {
309 V A; // v1[0] v0[0]
310 V B; // v0[1] v2[0]
311 V C; // v2[1] v1[1]
312 detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
313 v0 = OddEven(B, A);
314 v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
315 v2 = OddEven(C, B);
316}
317
318template <typename T, class V>
319HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
320 V& v0, V& v1, V& v2) {
321 v0 = LoadU(d, unaligned + 0);
322 v1 = LoadU(d, unaligned + 1);
323 v2 = LoadU(d, unaligned + 2);
324}
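// Usage sketch (illustrative): splitting packed RGB into planes, where `rgb`
// is a hypothetical pointer to 3 * Lanes(d) interleaved r,g,b elements:
//   Vec<decltype(d)> r, g, b;
//   LoadInterleaved3(d, rgb, r, g, b);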
325
326// ------------------------------ LoadInterleaved4
327
328namespace detail {
329
330// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
331template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
332HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
333 const T* HWY_RESTRICT unaligned, V& A, V& B,
334 V& C, V& D) {
335 A = LoadU(d, unaligned + 0 * N);
336 B = LoadU(d, unaligned + 1 * N);
337 C = LoadU(d, unaligned + 2 * N);
338 D = LoadU(d, unaligned + 3 * N);
339}
340
341} // namespace detail
342
343template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
344HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
345 V& v0, V& v1, V& v2, V& v3) {
346 const Repartition<uint64_t, decltype(d)> d64;
347 using V64 = VFromD<decltype(d64)>;
348 // 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
349 // Here int[i] means the four interleaved values of the i-th 4-tuple and
350 // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
351 V A; // int[13..10] int[3..0]
352 V B; // int[17..14] int[7..4]
353 V C; // int[1b..18] int[b..8]
354 V D; // int[1f..1c] int[f..c]
355 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
356
357 // For brevity, the comments only list the lower block (upper = lower + 0x10)
358 const V v5140 = InterleaveLower(d, A, B); // int[5,1,4,0]
359 const V vd9c8 = InterleaveLower(d, C, D); // int[d,9,c,8]
360 const V v7362 = InterleaveUpper(d, A, B); // int[7,3,6,2]
361 const V vfbea = InterleaveUpper(d, C, D); // int[f,b,e,a]
362
363 const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0]
364 const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8]
365 const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1]
366 const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
367
368 const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0]
369 const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8]
370 const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0]
371 const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8]
372
373 v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
374 v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
375 v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
376 v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
377}
378
379template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
380HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
381 V& v0, V& v1, V& v2, V& v3) {
382 // In the last step, we interleave by half of the block size, which is usually
383 // 8 bytes but half that for 8-bit x8 vectors.
384 using TW = hwy::UnsignedFromSize<sizeof(T) * N == 8 ? 4 : 8>;
385 const Repartition<TW, decltype(d)> dw;
386 using VW = VFromD<decltype(dw)>;
387
388 // (Comments are for 256-bit vectors.)
389 // 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
390 V A; // v3210[9]v3210[8] v3210[1]v3210[0]
391 V B; // v3210[b]v3210[a] v3210[3]v3210[2]
392 V C; // v3210[d]v3210[c] v3210[5]v3210[4]
393 V D; // v3210[f]v3210[e] v3210[7]v3210[6]
394 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
395
396 const V va820 = InterleaveLower(d, A, B); // v3210[a,8] v3210[2,0]
397 const V vec64 = InterleaveLower(d, C, D); // v3210[e,c] v3210[6,4]
398 const V vb931 = InterleaveUpper(d, A, B); // v3210[b,9] v3210[3,1]
399 const V vfd75 = InterleaveUpper(d, C, D); // v3210[f,d] v3210[7,5]
400
401 const VW v10_b830 = // v10[b..8] v10[3..0]
402 BitCast(dw, InterleaveLower(d, va820, vb931));
403 const VW v10_fc74 = // v10[f..c] v10[7..4]
404 BitCast(dw, InterleaveLower(d, vec64, vfd75));
405 const VW v32_b830 = // v32[b..8] v32[3..0]
406 BitCast(dw, InterleaveUpper(d, va820, vb931));
407 const VW v32_fc74 = // v32[f..c] v32[7..4]
408 BitCast(dw, InterleaveUpper(d, vec64, vfd75));
409
410 v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
411 v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
412 v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
413 v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
414}
415
416template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
417HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
418 V& v0, V& v1, V& v2, V& v3) {
419 V A; // v3210[4] v3210[0]
420 V B; // v3210[5] v3210[1]
421 V C; // v3210[6] v3210[2]
422 V D; // v3210[7] v3210[3]
423 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
424 const V v10_ev = InterleaveLower(d, A, C); // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
425 const V v10_od = InterleaveLower(d, B, D); // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
426 const V v32_ev = InterleaveUpper(d, A, C); // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
427 const V v32_od = InterleaveUpper(d, B, D); // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
428
429 v0 = InterleaveLower(d, v10_ev, v10_od);
430 v1 = InterleaveUpper(d, v10_ev, v10_od);
431 v2 = InterleaveLower(d, v32_ev, v32_od);
432 v3 = InterleaveUpper(d, v32_ev, v32_od);
433}
434
435template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
436HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
437 V& v0, V& v1, V& v2, V& v3) {
438 V A, B, C, D;
439 detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
440 v0 = InterleaveLower(d, A, C);
441 v1 = InterleaveUpper(d, A, C);
442 v2 = InterleaveLower(d, B, D);
443 v3 = InterleaveUpper(d, B, D);
444}
445
446// Any T x1
447template <typename T, class V>
448HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
449 V& v0, V& v1, V& v2, V& v3) {
450 v0 = LoadU(d, unaligned + 0);
451 v1 = LoadU(d, unaligned + 1);
452 v2 = LoadU(d, unaligned + 2);
453 v3 = LoadU(d, unaligned + 3);
454}
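// Usage sketch (illustrative): splitting packed RGBA into planes, where `rgba`
// is a hypothetical pointer to 4 * Lanes(d) interleaved r,g,b,a elements:
//   Vec<decltype(d)> r, g, b, a;
//   LoadInterleaved4(d, rgba, r, g, b, a);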
455
456// ------------------------------ StoreInterleaved2
457
458namespace detail {
459
460// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
461template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
462HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
463 T* HWY_RESTRICT unaligned) {
464 StoreU(A, d, unaligned + 0 * N);
465 StoreU(B, d, unaligned + 1 * N);
466}
467
468} // namespace detail
469
470// >= 128 bit vector
471template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
472HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
473 T* HWY_RESTRICT unaligned) {
474 const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0]
475 const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[N/2] v0[N/2]
476 detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
477}
478
479// 64 bits
480template <typename T>
481HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
482 Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
483 // Use full vectors to reduce the number of stores.
484 const Full128<T> d_full;
485 const Vec128<T> v0{part0.raw};
486 const Vec128<T> v1{part1.raw};
487 const auto v10 = InterleaveLower(d_full, v0, v1);
488 StoreU(v10, d_full, unaligned);
489}
490
491// <= 32 bits
492template <typename T, size_t N, HWY_IF_LE32(T, N)>
493HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
494 const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
495 T* HWY_RESTRICT unaligned) {
496 // Use full vectors to reduce the number of stores.
497 const Full128<T> d_full;
498 const Vec128<T> v0{part0.raw};
499 const Vec128<T> v1{part1.raw};
500 const auto v10 = InterleaveLower(d_full, v0, v1);
501 alignas(16) T buf[16 / sizeof(T)];
502 StoreU(v10, d_full, buf);
503 CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
504}
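// Usage sketch (illustrative), the inverse of LoadInterleaved2: writes
// 2 * Lanes(d) alternating lanes of v0 and v1 to a hypothetical `interleaved`:
//   StoreInterleaved2(even, odd, d, interleaved);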
505
506// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
507// TableLookupBytes)
508
509namespace detail {
510
511// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
512template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
513HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
514 Simd<T, N, 0> d,
515 T* HWY_RESTRICT unaligned) {
516 StoreU(A, d, unaligned + 0 * N);
517 StoreU(B, d, unaligned + 1 * N);
518 StoreU(C, d, unaligned + 2 * N);
519}
520
521} // namespace detail
522
523// >= 128-bit vector, 8-bit lanes
524template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
525 HWY_IF_GE128(T, N)>
526HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
527 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
528 const RebindToUnsigned<decltype(d)> du;
529 const auto k5 = Set(du, 5);
530 const auto k6 = Set(du, 6);
531
532 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
533 // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
534 // to their place, with 0x80 so lanes to be filled from other vectors are 0
535 // to enable blending by ORing together.
536 alignas(16) static constexpr uint8_t tbl_v0[16] = {
537 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
538 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
539 alignas(16) static constexpr uint8_t tbl_v1[16] = {
540 0x80, 0, 0x80, 0x80, 1, 0x80, //
541 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
542 // The interleaved vectors will be named A, B, C; temporaries with suffix
543 // 0..2 indicate which input vector's lanes they hold.
544 const auto shuf_A0 = LoadDup128(du, tbl_v0);
545 const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
546 const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
547 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
548 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
549 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
550 const V A = BitCast(d, A0 | A1 | A2);
551
552 // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
553 const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
554 const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
555 const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
556 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
557 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
558 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
559 const V B = BitCast(d, B0 | B1 | B2);
560
561 // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
562 const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
563 const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
564 const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
565 const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
566 const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
567 const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
568 const V C = BitCast(d, C0 | C1 | C2);
569
570 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
571}
572
573// >= 128-bit vector, 16-bit lanes
574template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
575 HWY_IF_GE128(T, N)>
576HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
577 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
578 const Repartition<uint8_t, decltype(d)> du8;
579 const auto k2 = Set(du8, 2 * sizeof(T));
580 const auto k3 = Set(du8, 3 * sizeof(T));
581
582 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
583 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
584 // filled from other vectors are 0 for blending. Note that these are byte
585 // indices for 16-bit lanes.
586 alignas(16) static constexpr uint8_t tbl_v1[16] = {
587 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
588 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
589 alignas(16) static constexpr uint8_t tbl_v2[16] = {
590 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
591 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
592
593 // The interleaved vectors will be named A, B, C; temporaries with suffix
594 // 0..2 indicate which input vector's lanes they hold.
595 const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
596 // .2..1..0
597 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
598 const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
599
600 const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
601 const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
602 const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
603 const V A = BitCast(d, A0 | A1 | A2);
604
605 // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
606 const auto shuf_B0 = shuf_A1 + k3; // 5..4..3.
607 const auto shuf_B1 = shuf_A2 + k3; // ..4..3..
608 const auto shuf_B2 = shuf_A0 + k2; // .4..3..2
609 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
610 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
611 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
612 const V B = BitCast(d, B0 | B1 | B2);
613
614 // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
615 const auto shuf_C0 = shuf_B1 + k3; // ..7..6..
616 const auto shuf_C1 = shuf_B2 + k3; // .7..6..5
617 const auto shuf_C2 = shuf_B0 + k2; // 7..6..5.
618 const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
619 const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
620 const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
621 const V C = BitCast(d, C0 | C1 | C2);
622
623 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
624}
625
626// >= 128-bit vector, 32-bit lanes
627template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
628 HWY_IF_GE128(T, N)>
629HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
630 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
631 const RepartitionToWide<decltype(d)> dw;
632
633 const V v10_v00 = InterleaveLower(d, v0, v1);
634 const V v01_v20 = OddEven(v0, v2);
635 // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
636 const V A = BitCast(
637 d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
638
639 const V v1_321 = ShiftRightLanes<1>(d, v1);
640 const V v0_32 = ShiftRightLanes<2>(d, v0);
641 const V v21_v11 = OddEven(v2, v1_321);
642 const V v12_v02 = OddEven(v1_321, v0_32);
643 // B: v1[2],v0[2], v2[1],v1[1]
644 const V B = BitCast(
645 d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
646
647 // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
648 const V v23_v13 = OddEven(v2, v1_321);
649 const V v03_v22 = OddEven(v0, v2);
650 // C: v2[3],v1[3],v0[3], v2[2]
651 const V C = BitCast(
652 d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
653
654 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
655}
656
657// >= 128-bit vector, 64-bit lanes
658template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
659 HWY_IF_GE128(T, N)>
660HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
661 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
662 const V A = InterleaveLower(d, v0, v1);
663 const V B = OddEven(v0, v2);
664 const V C = InterleaveUpper(d, v1, v2);
665 detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
666}
667
668// 64-bit vector, 8-bit lanes
669template <typename T, HWY_IF_LANE_SIZE(T, 1)>
670HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
671 const Vec64<T> part2, Full64<T> d,
672 T* HWY_RESTRICT unaligned) {
673 constexpr size_t N = 16 / sizeof(T);
674 // Use full vectors for the shuffles and first result.
675 const Full128<uint8_t> du;
676 const Full128<T> d_full;
677 const auto k5 = Set(du, 5);
678 const auto k6 = Set(du, 6);
679
680 const Vec128<T> v0{part0.raw};
681 const Vec128<T> v1{part1.raw};
682 const Vec128<T> v2{part2.raw};
683
684 // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
685 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
686 // filled from other vectors are 0 for blending.
687 alignas(16) static constexpr uint8_t tbl_v0[16] = {
688 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
689 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
690 alignas(16) static constexpr uint8_t tbl_v1[16] = {
691 0x80, 0, 0x80, 0x80, 1, 0x80, //
692 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
693 // The interleaved vectors will be named A, B, C; temporaries with suffix
694 // 0..2 indicate which input vector's lanes they hold.
695 const auto shuf_A0 = Load(du, tbl_v0);
696 const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
697 const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
698 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
699 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
700 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
701 const auto A = BitCast(d_full, A0 | A1 | A2);
702 StoreU(A, d_full, unaligned + 0 * N);
703
704 // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
705 const auto shuf_B0 = shuf_A2 + k6; // ..7..6..
706 const auto shuf_B1 = shuf_A0 + k5; // .7..6..5
707 const auto shuf_B2 = shuf_A1 + k5; // 7..6..5.
708 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
709 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
710 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
711 const Vec64<T> B{(B0 | B1 | B2).raw};
712 StoreU(B, d, unaligned + 1 * N);
713}
714
715// 64-bit vector, 16-bit lanes
716template <typename T, HWY_IF_LANE_SIZE(T, 2)>
717HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
718 const Vec64<T> part2, Full64<T> dh,
719 T* HWY_RESTRICT unaligned) {
720 const Full128<T> d;
721 const Full128<uint8_t> du8;
722 constexpr size_t N = 16 / sizeof(T);
723 const auto k2 = Set(du8, 2 * sizeof(T));
724 const auto k3 = Set(du8, 3 * sizeof(T));
725
726 const Vec128<T> v0{part0.raw};
727 const Vec128<T> v1{part1.raw};
728 const Vec128<T> v2{part2.raw};
729
730 // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
731 // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
732 // to their place, with 0x80 so lanes to be filled from other vectors are 0
733 // to enable blending by ORing together.
734 alignas(16) static constexpr uint8_t tbl_v1[16] = {
735 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
736 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
737 alignas(16) static constexpr uint8_t tbl_v2[16] = {
738 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
739 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
740
741 // The interleaved vectors will be named A, B; temporaries with suffix
742 // 0..2 indicate which input vector's lanes they hold.
743 const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
744 // .2..1..0
745 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
746 const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0..
747
748 const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
749 const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
750 const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
751 const Vec128<T> A = BitCast(d, A0 | A1 | A2);
752 StoreU(A, d, unaligned + 0 * N);
753
754 // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
755 const auto shuf_B0 = shuf_A1 + k3; // ..3.
756 const auto shuf_B1 = shuf_A2 + k3; // .3..
757 const auto shuf_B2 = shuf_A0 + k2; // 3..2
758 const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
759 const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
760 const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
761 const Vec128<T> B = BitCast(d, B0 | B1 | B2);
762 StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
763}
764
765// 64-bit vector, 32-bit lanes
766template <typename T, HWY_IF_LANE_SIZE(T, 4)>
767HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
768 const Vec64<T> v2, Full64<T> d,
769 T* HWY_RESTRICT unaligned) {
770 // (same code as 128-bit vector, 64-bit lanes)
771 constexpr size_t N = 2;
772 const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
773 const Vec64<T> v01_v20 = OddEven(v0, v2);
774 const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
775 StoreU(v10_v00, d, unaligned + 0 * N);
776 StoreU(v01_v20, d, unaligned + 1 * N);
777 StoreU(v21_v11, d, unaligned + 2 * N);
778}
779
780// 64-bit lanes are handled by the N=1 case below.
781
782// <= 32-bit vector, 8-bit lanes
783template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
784HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
785 const Vec128<T, N> part1,
786 const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
787 T* HWY_RESTRICT unaligned) {
788 // Use full vectors for the shuffles and result.
789 const Full128<uint8_t> du;
790 const Full128<T> d_full;
791
792 const Vec128<T> v0{part0.raw};
793 const Vec128<T> v1{part1.raw};
794 const Vec128<T> v2{part2.raw};
795
796 // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
797 // so lanes to be filled from other vectors are 0 to enable blending by ORing
798 // together.
799 alignas(16) static constexpr uint8_t tbl_v0[16] = {
800 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
801 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
802 // The interleaved vector will be named A; temporaries with suffix
803 // 0..2 indicate which input vector's lanes they hold.
804 const auto shuf_A0 = Load(du, tbl_v0);
805 const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
806 const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
807 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
808 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
809 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
810 const Vec128<T> A = BitCast(d_full, A0 | A1 | A2);
811 alignas(16) T buf[16 / sizeof(T)];
812 StoreU(A, d_full, buf);
813 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
814}
815
816// 32-bit vector, 16-bit lanes
817template <typename T, HWY_IF_LANE_SIZE(T, 2)>
818HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
819 const Vec128<T, 2> part1,
820 const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
821 T* HWY_RESTRICT unaligned) {
822 constexpr size_t N = 4 / sizeof(T);
823 // Use full vectors for the shuffles and result.
824 const Full128<uint8_t> du8;
825 const Full128<T> d_full;
826
827 const Vec128<T> v0{part0.raw};
828 const Vec128<T> v1{part1.raw};
829 const Vec128<T> v2{part2.raw};
830
831 // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
832 // so lanes to be filled from other vectors are 0 to enable blending by ORing
833 // together.
834 alignas(16) static constexpr uint8_t tbl_v2[16] = {
835 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
836 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
837 // The interleaved vector will be named A; temporaries with suffix
838 // 0..2 indicate which input vector's lanes they hold.
839 const auto shuf_A2 = // ..1..0..
840 Load(du8, tbl_v2);
841 const auto shuf_A1 = // ...1..0.
842 CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
843 const auto shuf_A0 = // ....1..0
844 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
845 const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
846 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
847 const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
848 const auto A = BitCast(d_full, A0 | A1 | A2);
849 alignas(16) T buf[16 / sizeof(T)];
850 StoreU(A, d_full, buf);
851 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
852}
853
854// Single-element vector, any lane size: just store directly
855template <typename T>
856HWY_API void StoreInterleaved3(const Vec128<T, 1> v0, const Vec128<T, 1> v1,
857 const Vec128<T, 1> v2, Simd<T, 1, 0> d,
858 T* HWY_RESTRICT unaligned) {
859 StoreU(v0, d, unaligned + 0);
860 StoreU(v1, d, unaligned + 1);
861 StoreU(v2, d, unaligned + 2);
862}
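// Usage sketch (illustrative), the inverse of LoadInterleaved3: writes
// 3 * Lanes(d) elements r0,g0,b0,r1,g1,b1,... to a hypothetical `rgb` buffer:
//   StoreInterleaved3(r, g, b, d, rgb);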
863
864// ------------------------------ StoreInterleaved4
865
866namespace detail {
867
868// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
869template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
870HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D,
871 Simd<T, N, 0> d,
872 T* HWY_RESTRICT unaligned) {
873 StoreU(A, d, unaligned + 0 * N);
874 StoreU(B, d, unaligned + 1 * N);
875 StoreU(C, d, unaligned + 2 * N);
876 StoreU(D, d, unaligned + 3 * N);
877}
878
879} // namespace detail
880
881// >= 128-bit vector, 8..32-bit lanes
882template <typename T, size_t N, class V, HWY_IF_NOT_LANE_SIZE(T, 8),
883 HWY_IF_GE128(T, N)>
884HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
885 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
886 const RepartitionToWide<decltype(d)> dw;
887 const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
888 const auto v32L = ZipLower(dw, v2, v3);
889 const auto v10U = ZipUpper(dw, v0, v1);
890 const auto v32U = ZipUpper(dw, v2, v3);
891 // The interleaved vectors are A, B, C, D.
892 const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
893 const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
894 const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
895 const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
896 detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
897}
898
899// >= 128-bit vector, 64-bit lanes
900template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
901 HWY_IF_GE128(T, N)>
902HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
903 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
904 // The interleaved vectors are A, B, C, D.
905 const auto A = InterleaveLower(d, v0, v1); // v1[0] v0[0]
906 const auto B = InterleaveLower(d, v2, v3);
907 const auto C = InterleaveUpper(d, v0, v1);
908 const auto D = InterleaveUpper(d, v2, v3);
909 detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
910}
911
912// 64-bit vector, 8..32-bit lanes
913template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
914HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
915 const Vec64<T> part2, const Vec64<T> part3,
916 Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
917 constexpr size_t N = 16 / sizeof(T);
918 // Use full vectors to reduce the number of stores.
919 const Full128<T> d_full;
920 const RepartitionToWide<decltype(d_full)> dw;
921 const Vec128<T> v0{part0.raw};
922 const Vec128<T> v1{part1.raw};
923 const Vec128<T> v2{part2.raw};
924 const Vec128<T> v3{part3.raw};
925 const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0]
926 const auto v32 = ZipLower(dw, v2, v3);
927 const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
928 const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
929 StoreU(A, d_full, unaligned + 0 * N);
930 StoreU(B, d_full, unaligned + 1 * N);
931}
932
933// 64-bit vector, 64-bit lane
934template <typename T, HWY_IF_LANE_SIZE(T, 8)>
935HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
936 const Vec64<T> part2, const Vec64<T> part3,
937 Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
938 constexpr size_t N = 16 / sizeof(T);
939 // Use full vectors to reduce the number of stores.
940 const Full128<T> d_full;
941 const Vec128<T> v0{part0.raw};
942 const Vec128<T> v1{part1.raw};
943 const Vec128<T> v2{part2.raw};
944 const Vec128<T> v3{part3.raw};
945 const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0]
946 const auto B = InterleaveLower(d_full, v2, v3);
947 StoreU(A, d_full, unaligned + 0 * N);
948 StoreU(B, d_full, unaligned + 1 * N);
949}
950
951// <= 32-bit vectors
952template <typename T, size_t N, HWY_IF_LE32(T, N)>
953HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
954 const Vec128<T, N> part1,
955 const Vec128<T, N> part2,
956 const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
957 T* HWY_RESTRICT unaligned) {
958 // Use full vectors to reduce the number of stores.
959 const Full128<T> d_full;
960 const RepartitionToWide<decltype(d_full)> dw;
961 const Vec128<T> v0{part0.raw};
962 const Vec128<T> v1{part1.raw};
963 const Vec128<T> v2{part2.raw};
964 const Vec128<T> v3{part3.raw};
965 const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
966 const auto v32 = ZipLower(dw, v2, v3);
967 const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
968 alignas(16) T buf[16 / sizeof(T)];
969 StoreU(v3210, d_full, buf);
970 CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
971}
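// Usage sketch (illustrative), the inverse of LoadInterleaved4: writes
// 4 * Lanes(d) elements r0,g0,b0,a0,r1,... to a hypothetical `rgba` buffer:
//   StoreInterleaved4(r, g, b, a, d, rgba);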
972
973#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
974
975// ------------------------------ AESRound
976
977// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
978#if HWY_TARGET != HWY_SCALAR
979
980// Define for white-box testing, even if native instructions are available.
981namespace detail {
982
983// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
984// Vector Permute Instructions" and the accompanying assembly language
985// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
986// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
987//
988// A brute-force 256 byte table lookup can also be made constant-time, and
989// possibly competitive on NEON, but this is more performance-portable
990// especially for x86 and large vectors.
991template <class V> // u8
992HWY_INLINE V SubBytes(V state) {
993 const DFromV<V> du;
994 const auto mask = Set(du, 0xF);
995
996 // Change polynomial basis to GF(2^4)
997 {
998 alignas(16) static constexpr uint8_t basisL[16] = {
999 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
1000 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
1001 alignas(16) static constexpr uint8_t basisU[16] = {
1002 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
1003 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
1004 const auto sL = And(state, mask);
1005 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
1006 const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
1007 const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
1008 state = Xor(gf4L, gf4U);
1009 }
1010
1011 // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
1012 // cause TableLookupBytesOr0 to return 0.
1013 alignas(16) static constexpr uint8_t kZetaInv[16] = {
1014 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
1015 alignas(16) static constexpr uint8_t kInv[16] = {
1016 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
1017 const auto tbl = LoadDup128(du, kInv);
1018 const auto sL = And(state, mask); // L=low nibble, U=upper
1019 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
1020 const auto sX = Xor(sU, sL);
1021 const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
1022 const auto invU = TableLookupBytes(tbl, sU);
1023 const auto invX = TableLookupBytes(tbl, sX);
1024 const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
1025 const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
1026
1027 // Linear skew (cannot bake 0x63 bias into the table because out* indices
1028 // may have the infinity flag set).
1029 alignas(16) static constexpr uint8_t kAffineL[16] = {
1030 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
1031 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
1032 alignas(16) static constexpr uint8_t kAffineU[16] = {
1033 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
1034 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
1035 const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
1036 const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
1037 return Xor(Xor(affL, affU), Set(du, 0x63));
1038}
1039
1040} // namespace detail
1041
1042#endif // HWY_TARGET != HWY_SCALAR
1043
1044// "Include guard": skip if native AES instructions are available.
1045#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
1046#ifdef HWY_NATIVE_AES
1047#undef HWY_NATIVE_AES
1048#else
1049#define HWY_NATIVE_AES
1050#endif
1051
1052// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
1053#if HWY_TARGET != HWY_SCALAR
1054
1055namespace detail {
1056
1057template <class V> // u8
1058HWY_API V ShiftRows(const V state) {
1059 const DFromV<V> du;
1060 alignas(16) static constexpr uint8_t kShiftRow[16] = {
1061 0, 5, 10, 15, // transposed: state is column major
1062 4, 9, 14, 3, //
1063 8, 13, 2, 7, //
1064 12, 1, 6, 11};
1065 const auto shift_row = LoadDup128(du, kShiftRow);
1066 return TableLookupBytes(state, shift_row);
1067}
1068
1069template <class V> // u8
1070HWY_API V MixColumns(const V state) {
1071 const DFromV<V> du;
1072 // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
1073 // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
1074 // 1 2 3 1 // d are on diagonal, no permutation needed.
1075 // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
1076 // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
1077 alignas(16) static constexpr uint8_t k2301[16] = {
1078 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
1079 alignas(16) static constexpr uint8_t k1230[16] = {
1080 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
1081 const RebindToSigned<decltype(du)> di; // can only do signed comparisons
1082 const auto msb = Lt(BitCast(di, state), Zero(di));
1083 const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
1084 const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
1085 const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
1086 const auto d_s2301 = Xor(d, s2301);
1087 const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
1088 const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
1089 return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
1090}
1091
1092} // namespace detail
1093
1094template <class V> // u8
1095HWY_API V AESRound(V state, const V round_key) {
1096 // Intel docs swap the first two steps, but it does not matter because
1097 // ShiftRows is a permutation and SubBytes is independent of lane index.
1098 state = detail::SubBytes(state);
1099 state = detail::ShiftRows(state);
1100 state = detail::MixColumns(state);
1101 state = Xor(state, round_key); // AddRoundKey
1102 return state;
1103}
1104
1105template <class V> // u8
1106HWY_API V AESLastRound(V state, const V round_key) {
1107 // Like AESRound, but without MixColumns.
1108 state = detail::SubBytes(state);
1109 state = detail::ShiftRows(state);
1110 state = Xor(state, round_key); // AddRoundKey
1111 return state;
1112}
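// Usage sketch (illustrative): AES-128 encryption of one block per 128 bits,
// given hypothetical pre-expanded round keys `rk[0..10]` as u8 vectors (key
// expansion is not provided by this header):
//   state = Xor(state, rk[0]);                                 // AddRoundKey
//   for (int r = 1; r < 10; ++r) state = AESRound(state, rk[r]);
//   state = AESLastRound(state, rk[10]);                       // no MixColumns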
1113
1114// Constant-time implementation inspired by
1115// https://www.bearssl.org/constanttime.html, but about half the cost because we
1116// use 64x64 multiplies and 128-bit XORs.
1117template <class V>
1118HWY_API V CLMulLower(V a, V b) {
1119 const DFromV<V> d;
1120 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
1121 const auto k1 = Set(d, 0x1111111111111111ULL);
1122 const auto k2 = Set(d, 0x2222222222222222ULL);
1123 const auto k4 = Set(d, 0x4444444444444444ULL);
1124 const auto k8 = Set(d, 0x8888888888888888ULL);
1125 const auto a0 = And(a, k1);
1126 const auto a1 = And(a, k2);
1127 const auto a2 = And(a, k4);
1128 const auto a3 = And(a, k8);
1129 const auto b0 = And(b, k1);
1130 const auto b1 = And(b, k2);
1131 const auto b2 = And(b, k4);
1132 const auto b3 = And(b, k8);
1133
1134 auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
1135 auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
1136 auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
1137 auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
1138 m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
1139 m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
1140 m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
1141 m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
1142 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
1143}
1144
1145template <class V>
1146HWY_API V CLMulUpper(V a, V b) {
1147 const DFromV<V> d;
1148 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
1149 const auto k1 = Set(d, 0x1111111111111111ULL);
1150 const auto k2 = Set(d, 0x2222222222222222ULL);
1151 const auto k4 = Set(d, 0x4444444444444444ULL);
1152 const auto k8 = Set(d, 0x8888888888888888ULL);
1153 const auto a0 = And(a, k1);
1154 const auto a1 = And(a, k2);
1155 const auto a2 = And(a, k4);
1156 const auto a3 = And(a, k8);
1157 const auto b0 = And(b, k1);
1158 const auto b1 = And(b, k2);
1159 const auto b2 = And(b, k4);
1160 const auto b3 = And(b, k8);
1161
1162 auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
1163 auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
1164 auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
1165 auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
1166 m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
1167 m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
1168 m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
1169 m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
1170 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
1171}
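// Usage sketch (illustrative), e.g. for GHASH/CRC: per 128-bit block, computes
// the 128-bit carryless product of one pair of u64 lanes of `a` and `b`:
//   const auto lo = CLMulLower(a, b);  // lower (even-indexed) u64 lanes
//   const auto hi = CLMulUpper(a, b);  // upper (odd-indexed) u64 lanes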
1172
1173#endif // HWY_TARGET != HWY_SCALAR
1174#endif // HWY_NATIVE_AES
1175
1176// "Include guard": skip if native POPCNT-related instructions are available.
1177#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
1178#ifdef HWY_NATIVE_POPCNT
1179#undef HWY_NATIVE_POPCNT
1180#else
1181#define HWY_NATIVE_POPCNT
1182#endif
1183
1184#undef HWY_MIN_POW2_FOR_128
1185#if HWY_TARGET == HWY_RVV
1186#define HWY_MIN_POW2_FOR_128 1
1187#else
1188// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
1189// guarantee 128 bits anyway.
1190#define HWY_MIN_POW2_FOR_128 0
1191#endif
1192
1193// This algorithm requires vectors to be at least 16 bytes, which is the case
1194// for LMUL >= 2. If not, use the fallback below.
1195template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
1196 HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
1197HWY_API V PopulationCount(V v) {
1198 static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
1199 const D d;
1200 HWY_ALIGN constexpr uint8_t kLookup[16] = {
1201 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1202 };
1203 const auto lo = And(v, Set(d, 0xF));
1204 const auto hi = ShiftRight<4>(v);
1205 const auto lookup = LoadDup128(d, kLookup);
1206 return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
1207}
1208
1209// RVV has a specialization that avoids the Set().
1210#if HWY_TARGET != HWY_RVV
1211// Slower fallback for capped vectors.
1212template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1), HWY_IF_LT128_D(D)>
1213HWY_API V PopulationCount(V v) {
1214 static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
1215 const D d;
1216 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
1217 v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
1218 v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
1219 return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
1220}
1221#endif // HWY_TARGET != HWY_RVV
1222
1223template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
1224HWY_API V PopulationCount(V v) {
1225 static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
1226 const D d;
1227 const Repartition<uint8_t, decltype(d)> d8;
1228 const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
1229 return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
1230}
1231
1232template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
1233HWY_API V PopulationCount(V v) {
1234 static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
1235 const D d;
1236 Repartition<uint16_t, decltype(d)> d16;
1237 auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
1238 return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
1239}
1240
1241#if HWY_HAVE_INTEGER64
1242template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
1243HWY_API V PopulationCount(V v) {
1244 static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
1245 const D d;
1246 Repartition<uint32_t, decltype(d)> d32;
1247 auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
1248 return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
1249}
1250#endif
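// Usage sketch (illustrative): total number of set bits in a u64 vector `v`,
// with a hypothetical descriptor `d64` (e.g. ScalableTag<uint64_t>):
//   const auto per_lane = PopulationCount(v);
//   const uint64_t total = GetLane(SumOfLanes(d64, per_lane));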
1251
1252#endif // HWY_NATIVE_POPCNT
1253
1254// NOLINTNEXTLINE(google-readability-namespace-comments)
1255} // namespace HWY_NAMESPACE
1256} // namespace hwy