Grok  9.7.5
wasm_128-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 128-bit WASM vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <wasm_simd128.h>
22 
23 #include "hwy/base.h"
24 #include "hwy/ops/shared-inl.h"
25 
26 #ifdef HWY_WASM_OLD_NAMES
27 #define wasm_i8x16_shuffle wasm_v8x16_shuffle
28 #define wasm_i16x8_shuffle wasm_v16x8_shuffle
29 #define wasm_i32x4_shuffle wasm_v32x4_shuffle
30 #define wasm_i64x2_shuffle wasm_v64x2_shuffle
31 #define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
32 #define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
33 #define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
34 #define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
35 #define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
36 #define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
37 #define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
38 #define wasm_u8x16_add_sat wasm_u8x16_add_saturate
39 #define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
40 #define wasm_u16x8_add_sat wasm_u16x8_add_saturate
41 #define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
42 #define wasm_i8x16_add_sat wasm_i8x16_add_saturate
43 #define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
44 #define wasm_i16x8_add_sat wasm_i16x8_add_saturate
45 #define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
46 #endif
47 
48 HWY_BEFORE_NAMESPACE();
49 namespace hwy {
50 namespace HWY_NAMESPACE {
51 
52 template <typename T>
53 using Full128 = Simd<T, 16 / sizeof(T), 0>;
54 
55 template <typename T>
56 using Full64 = Simd<T, 8 / sizeof(T), 0>;
57 
58 namespace detail {
59 
60 template <typename T>
61 struct Raw128 {
62  using type = __v128_u;
63 };
64 template <>
65 struct Raw128<float> {
66  using type = __f32x4;
67 };
68 
69 } // namespace detail
70 
71 template <typename T, size_t N = 16 / sizeof(T)>
72 class Vec128 {
73  using Raw = typename detail::Raw128<T>::type;
74 
75  public:
76  // Compound assignment. Only usable if there is a corresponding non-member
77  // binary operator overload. For example, only f32 and f64 support division.
78  HWY_INLINE Vec128& operator*=(const Vec128 other) {
79  return *this = (*this * other);
80  }
81  HWY_INLINE Vec128& operator/=(const Vec128 other) {
82  return *this = (*this / other);
83  }
84  HWY_INLINE Vec128& operator+=(const Vec128 other) {
85  return *this = (*this + other);
86  }
87  HWY_INLINE Vec128& operator-=(const Vec128 other) {
88  return *this = (*this - other);
89  }
90  HWY_INLINE Vec128& operator&=(const Vec128 other) {
91  return *this = (*this & other);
92  }
93  HWY_INLINE Vec128& operator|=(const Vec128 other) {
94  return *this = (*this | other);
95  }
96  HWY_INLINE Vec128& operator^=(const Vec128 other) {
97  return *this = (*this ^ other);
98  }
99 
100  Raw raw;
101 };
102 
103 template <typename T>
104 using Vec64 = Vec128<T, 8 / sizeof(T)>;
105 
106 // FF..FF or 0.
107 template <typename T, size_t N = 16 / sizeof(T)>
108 struct Mask128 {
109  typename detail::Raw128<T>::type raw;
110 };
111 
112 namespace detail {
113 
114 // Deduce Simd<T, N, 0> from Vec128<T, N>
115 struct DeduceD {
116  template <typename T, size_t N>
117  Simd<T, N, 0> operator()(Vec128<T, N>) const {
118  return Simd<T, N, 0>();
119  }
120 };
121 
122 } // namespace detail
123 
124 template <class V>
125 using DFromV = decltype(detail::DeduceD()(V()));
126 
127 template <class V>
128 using TFromV = TFromD<DFromV<V>>;
129 
130 // ------------------------------ BitCast
131 
132 namespace detail {
133 
134 HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
135 HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
136  return static_cast<__v128_u>(v);
137 }
138 HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
139  return static_cast<__v128_u>(v);
140 }
141 
142 template <typename T, size_t N>
143 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
144  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
145 }
146 
147 // Cannot rely on function overloading because return types differ.
148 template <typename T>
149 struct BitCastFromInteger128 {
150  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
151 };
152 template <>
153 struct BitCastFromInteger128<float> {
154  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
155 };
156 
157 template <typename T, size_t N>
158 HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
159  Vec128<uint8_t, N * sizeof(T)> v) {
160  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
161 }
162 
163 } // namespace detail
164 
165 template <typename T, size_t N, typename FromT>
166 HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
167  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
168  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
169 }
170 
171 // ------------------------------ Zero
172 
173 // Returns an all-zero vector/part.
174 template <typename T, size_t N, HWY_IF_LE128(T, N)>
175 HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
176  return Vec128<T, N>{wasm_i32x4_splat(0)};
177 }
178 template <size_t N, HWY_IF_LE128(float, N)>
179 HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
180  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
181 }
182 
183 template <class D>
184 using VFromD = decltype(Zero(D()));
185 
186 // ------------------------------ Set
187 
188 // Returns a vector/part with all lanes set to "t".
189 template <size_t N, HWY_IF_LE128(uint8_t, N)>
190 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
191  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
192 }
193 template <size_t N, HWY_IF_LE128(uint16_t, N)>
194 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
195  const uint16_t t) {
196  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
197 }
198 template <size_t N, HWY_IF_LE128(uint32_t, N)>
199 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
200  const uint32_t t) {
201  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
202 }
203 template <size_t N, HWY_IF_LE128(uint64_t, N)>
204 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
205  const uint64_t t) {
206  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
207 }
208 
209 template <size_t N, HWY_IF_LE128(int8_t, N)>
210 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
211  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
212 }
213 template <size_t N, HWY_IF_LE128(int16_t, N)>
214 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
215  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
216 }
217 template <size_t N, HWY_IF_LE128(int32_t, N)>
218 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
219  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
220 }
221 template <size_t N, HWY_IF_LE128(int64_t, N)>
222 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
223  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
224 }
225 
226 template <size_t N, HWY_IF_LE128(float, N)>
227 HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
228  return Vec128<float, N>{wasm_f32x4_splat(t)};
229 }
230 
231 HWY_DIAGNOSTICS(push)
232 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
233 
234 // Returns a vector with uninitialized elements.
235 template <typename T, size_t N, HWY_IF_LE128(T, N)>
236 HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
237  return Zero(d);
238 }
239 
240 HWY_DIAGNOSTICS(pop)
241 
242 // Returns a vector with lane i=[0, N) set to "first" + i.
243 template <typename T, size_t N, typename T2>
244 Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
245  HWY_ALIGN T lanes[16 / sizeof(T)];
246  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
247  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
248  }
249  return Load(d, lanes);
250 }
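// e.g. Iota(Full128<int32_t>(), 0) yields lanes {0, 1, 2, 3};
// Iota(Full128<float>(), 10.0f) yields {10.0f, 11.0f, 12.0f, 13.0f}.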
251 
252 // ================================================== ARITHMETIC
253 
254 // ------------------------------ Addition
255 
256 // Unsigned
257 template <size_t N>
259  const Vec128<uint8_t, N> b) {
260  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
261 }
262 template <size_t N>
264  const Vec128<uint16_t, N> b) {
265  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
266 }
267 template <size_t N>
269  const Vec128<uint32_t, N> b) {
270  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
271 }
272 template <size_t N>
274  const Vec128<uint64_t, N> b) {
275  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
276 }
277 
278 // Signed
279 template <size_t N>
281  const Vec128<int8_t, N> b) {
282  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
283 }
284 template <size_t N>
286  const Vec128<int16_t, N> b) {
287  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
288 }
289 template <size_t N>
291  const Vec128<int32_t, N> b) {
292  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
293 }
294 template <size_t N>
296  const Vec128<int64_t, N> b) {
297  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
298 }
299 
300 // Float
301 template <size_t N>
303  const Vec128<float, N> b) {
304  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
305 }
306 
307 // ------------------------------ Subtraction
308 
309 // Unsigned
310 template <size_t N>
312  const Vec128<uint8_t, N> b) {
313  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
314 }
315 template <size_t N>
318  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
319 }
320 template <size_t N>
322  const Vec128<uint32_t, N> b) {
323  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
324 }
325 template <size_t N>
327  const Vec128<uint64_t, N> b) {
328  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
329 }
330 
331 // Signed
332 template <size_t N>
334  const Vec128<int8_t, N> b) {
335  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
336 }
337 template <size_t N>
339  const Vec128<int16_t, N> b) {
340  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
341 }
342 template <size_t N>
344  const Vec128<int32_t, N> b) {
345  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
346 }
347 template <size_t N>
349  const Vec128<int64_t, N> b) {
350  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
351 }
352 
353 // Float
354 template <size_t N>
356  const Vec128<float, N> b) {
357  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
358 }
359 
360 // ------------------------------ SaturatedAdd
361 
362 // Returns a + b clamped to the destination range.
363 
364 // Unsigned
365 template <size_t N>
367  const Vec128<uint8_t, N> b) {
368  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
369 }
370 template <size_t N>
372  const Vec128<uint16_t, N> b) {
373  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
374 }
375 
376 // Signed
377 template <size_t N>
379  const Vec128<int8_t, N> b) {
380  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
381 }
382 template <size_t N>
384  const Vec128<int16_t, N> b) {
385  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
386 }
387 
388 // ------------------------------ SaturatedSub
389 
390 // Returns a - b clamped to the destination range.
391 
392 // Unsigned
393 template <size_t N>
395  const Vec128<uint8_t, N> b) {
396  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
397 }
398 template <size_t N>
400  const Vec128<uint16_t, N> b) {
401  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
402 }
403 
404 // Signed
405 template <size_t N>
407  const Vec128<int8_t, N> b) {
408  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
409 }
410 template <size_t N>
412  const Vec128<int16_t, N> b) {
413  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
414 }
415 
416 // ------------------------------ Average
417 
418 // Returns (a + b + 1) / 2
419 
420 // Unsigned
421 template <size_t N>
423  const Vec128<uint8_t, N> b) {
424  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
425 }
426 template <size_t N>
428  const Vec128<uint16_t, N> b) {
429  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
430 }
431 
432 // ------------------------------ Absolute value
433 
434 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
435 template <size_t N>
437  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
438 }
439 template <size_t N>
441  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
442 }
443 template <size_t N>
445  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
446 }
447 template <size_t N>
449  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
450 }
451 
452 template <size_t N>
454  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
455 }
456 
457 // ------------------------------ Shift lanes by constant #bits
458 
459 // Unsigned
460 template <int kBits, size_t N>
462  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
463 }
464 template <int kBits, size_t N>
466  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
467 }
468 template <int kBits, size_t N>
470  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
471 }
472 template <int kBits, size_t N>
474  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
475 }
476 template <int kBits, size_t N>
478  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
479 }
480 template <int kBits, size_t N>
482  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
483 }
484 
485 // Signed
486 template <int kBits, size_t N>
488  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
489 }
490 template <int kBits, size_t N>
492  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
493 }
494 template <int kBits, size_t N>
496  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
497 }
498 template <int kBits, size_t N>
500  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
501 }
502 template <int kBits, size_t N>
504  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
505 }
506 template <int kBits, size_t N>
508  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
509 }
510 
511 // 8-bit
512 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
514  const DFromV<decltype(v)> d8;
515  // Use raw instead of BitCast to support N=1.
516  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
517  return kBits == 1
518  ? (v + v)
519  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
520 }
521 
522 template <int kBits, size_t N>
524  const DFromV<decltype(v)> d8;
525  // Use raw instead of BitCast to support N=1.
526  const Vec128<uint8_t, N> shifted{
527  ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
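// The 16-bit shift also moves bits of the upper byte into the top of the
// lower byte; the mask below clears them so each byte shifts independently.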
528  return shifted & Set(d8, 0xFF >> kBits);
529 }
530 
531 template <int kBits, size_t N>
533  const DFromV<decltype(v)> di;
534  const RebindToUnsigned<decltype(di)> du;
535  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
536  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
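// shifted_sign marks where the original sign bit lands after the logical
// shift; (x ^ sign) - sign then sign-extends it into the cleared upper bits.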
537  return (shifted ^ shifted_sign) - shifted_sign;
538 }
539 
540 // ------------------------------ RotateRight (ShiftRight, Or)
541 template <int kBits, typename T, size_t N>
543  constexpr size_t kSizeInBits = sizeof(T) * 8;
544  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
545  if (kBits == 0) return v;
546  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
547 }
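// e.g. with 32-bit lanes, RotateRight<8>(Set(d, 0x12345678u)) gives
// 0x78123456 in every lane.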
548 
549 // ------------------------------ Shift lanes by same variable #bits
550 
551 // After https://reviews.llvm.org/D108415 shift argument became unsigned.
552 HWY_DIAGNOSTICS(push)
553 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
554 
555 // Unsigned
556 template <size_t N>
558  const int bits) {
559  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
560 }
561 template <size_t N>
563  const int bits) {
564  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
565 }
566 template <size_t N>
568  const int bits) {
569  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
570 }
571 template <size_t N>
573  const int bits) {
574  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
575 }
576 template <size_t N>
578  const int bits) {
579  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
580 }
581 template <size_t N>
583  const int bits) {
584  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
585 }
586 
587 // Signed
588 template <size_t N>
590  const int bits) {
591  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
592 }
593 template <size_t N>
595  const int bits) {
596  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
597 }
598 template <size_t N>
600  const int bits) {
601  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
602 }
603 template <size_t N>
605  const int bits) {
606  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
607 }
608 template <size_t N>
610  const int bits) {
611  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
612 }
613 template <size_t N>
615  const int bits) {
616  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
617 }
618 
619 // 8-bit
620 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
622  const DFromV<decltype(v)> d8;
623  // Use raw instead of BitCast to support N=1.
624  const Vec128<T, N> shifted{
625  ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
626  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
627 }
628 
629 template <size_t N>
631  const int bits) {
632  const DFromV<decltype(v)> d8;
633  // Use raw instead of BitCast to support N=1.
634  const Vec128<uint8_t, N> shifted{
635  ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
636  return shifted & Set(d8, 0xFF >> bits);
637 }
638 
639 template <size_t N>
641  const DFromV<decltype(v)> di;
642  const RebindToUnsigned<decltype(di)> du;
643  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
644  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
645  return (shifted ^ shifted_sign) - shifted_sign;
646 }
647 
648 // ignore Wsign-conversion
649 HWY_DIAGNOSTICS(pop)
650 
651 // ------------------------------ Minimum
652 
653 // Unsigned
654 template <size_t N>
656  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
657 }
658 template <size_t N>
660  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
661 }
662 template <size_t N>
664  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
665 }
666 template <size_t N>
667 HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
668  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
669  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
670  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
671  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
672  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
673  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
674  return Vec128<uint64_t, N>{wasm_v128_load(min)};
675 }
676 
677 // Signed
678 template <size_t N>
680  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
681 }
682 template <size_t N>
684  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
685 }
686 template <size_t N>
688  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
689 }
690 template <size_t N>
691 HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
692  alignas(16) int64_t min[2];
693  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
694  wasm_i64x2_extract_lane(b.raw, 0));
695  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
696  wasm_i64x2_extract_lane(b.raw, 1));
697  return Vec128<int64_t, N>{wasm_v128_load(min)};
698 }
699 
700 // Float
701 template <size_t N>
703  return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
704 }
705 
706 // ------------------------------ Maximum
707 
708 // Unsigned
709 template <size_t N>
711  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
712 }
713 template <size_t N>
715  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
716 }
717 template <size_t N>
719  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
720 }
721 template <size_t N>
722 HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
723  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
724  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
725  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
726  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
727  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
728  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
729  return Vec128<uint64_t, N>{wasm_v128_load(max)};
730 }
731 
732 // Signed
733 template <size_t N>
735  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
736 }
737 template <size_t N>
739  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
740 }
741 template <size_t N>
743  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
744 }
745 template <size_t N>
746 HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
747  alignas(16) int64_t max[2];
748  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
749  wasm_i64x2_extract_lane(b.raw, 0));
750  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
751  wasm_i64x2_extract_lane(b.raw, 1));
752  return Vec128<int64_t, N>{wasm_v128_load(max)};
753 }
754 
755 // Float
756 template <size_t N>
758  return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
759 }
760 
761 // ------------------------------ Integer multiplication
762 
763 // Unsigned
764 template <size_t N>
766  const Vec128<uint16_t, N> b) {
767  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
768 }
769 template <size_t N>
771  const Vec128<uint32_t, N> b) {
772  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
773 }
774 
775 // Signed
776 template <size_t N>
778  const Vec128<int16_t, N> b) {
779  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
780 }
781 template <size_t N>
783  const Vec128<int32_t, N> b) {
784  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
785 }
786 
787 // Returns the upper 16 bits of a * b in each lane.
788 template <size_t N>
790  const Vec128<uint16_t, N> b) {
791  // TODO(eustas): replace, when implemented in WASM.
792  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
793  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
794  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
795  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
796  const auto l = wasm_i32x4_mul(al, bl);
797  const auto h = wasm_i32x4_mul(ah, bh);
798  // TODO(eustas): shift-right + narrow?
799  return Vec128<uint16_t, N>{
800  wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
801 }
802 template <size_t N>
804  const Vec128<int16_t, N> b) {
805  // TODO(eustas): replace, when implemented in WASM.
806  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
807  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
808  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
809  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
810  const auto l = wasm_i32x4_mul(al, bl);
811  const auto h = wasm_i32x4_mul(ah, bh);
812  // TODO(eustas): shift-right + narrow?
813  return Vec128<int16_t, N>{
814  wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
815 }
816 
817 template <size_t N>
818 HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
819  Vec128<int16_t, N> b) {
820  const DFromV<decltype(a)> d;
821  const RebindToUnsigned<decltype(d)> du;
822 
823  const Vec128<uint16_t, N> lo = BitCast(du, Mul(a, b));
824  const Vec128<int16_t, N> hi = MulHigh(a, b);
825  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
826  // carry that into the result. Instead isolate the top two bits because only
827  // they can influence the result.
828  const Vec128<uint16_t, N> lo_top2 = ShiftRight<14>(lo);
829  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
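// Equivalently, rounding = (lo_top2 + 1) >> 1, since (lo + 0x4000) >> 15 is
// 0, 1, 1 or 2 for lo_top2 = 00, 01, 10, 11 respectively.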
830  const Vec128<uint16_t, N> rounding = ShiftRight<1>(Add(lo_top2, Set(du, 1)));
831  return Add(Add(hi, hi), BitCast(d, rounding));
832 }
833 
834 // Multiplies even lanes (0, 2 ..) and returns the double-width result.
835 template <size_t N>
836 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
837  const Vec128<int32_t, N> b) {
838  // TODO(eustas): replace, when implemented in WASM.
839  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
840  const auto ae = wasm_v128_and(a.raw, kEvenMask);
841  const auto be = wasm_v128_and(b.raw, kEvenMask);
842  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
843 }
844 template <size_t N>
845 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
846  const Vec128<uint32_t, N> b) {
847  // TODO(eustas): replace, when implemented in WASM.
848  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
849  const auto ae = wasm_v128_and(a.raw, kEvenMask);
850  const auto be = wasm_v128_and(b.raw, kEvenMask);
851  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
852 }
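// e.g. MulEven({1, 2, 3, 4}, {5, 6, 7, 8}) with int32_t lanes yields the two
// int64_t lanes {1 * 5, 3 * 7} = {5, 21}.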
853 
854 // ------------------------------ Negate
855 
856 template <typename T, size_t N, HWY_IF_FLOAT(T)>
857 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
858  return Xor(v, SignBit(DFromV<decltype(v)>()));
859 }
860 
861 template <size_t N>
863  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
864 }
865 template <size_t N>
867  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
868 }
869 template <size_t N>
871  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
872 }
873 template <size_t N>
875  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
876 }
877 
878 // ------------------------------ Floating-point mul / div
879 
880 template <size_t N>
881 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
882  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
883 }
884 
885 template <size_t N>
886 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
887  const Vec128<float, N> b) {
888  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
889 }
890 
891 // Approximate reciprocal
892 template <size_t N>
893 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
894  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
895  return one / v;
896 }
897 
898 // Absolute value of difference.
899 template <size_t N>
900 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
901  const Vec128<float, N> b) {
902  return Abs(a - b);
903 }
904 
905 // ------------------------------ Floating-point multiply-add variants
906 
907 // Returns mul * x + add
908 template <size_t N>
909 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
910  const Vec128<float, N> x,
911  const Vec128<float, N> add) {
912  // TODO(eustas): replace, when implemented in WASM.
913  // TODO(eustas): is it wasm_f32x4_qfma?
914  return mul * x + add;
915 }
916 
917 // Returns add - mul * x
918 template <size_t N>
919 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
920  const Vec128<float, N> x,
921  const Vec128<float, N> add) {
922  // TODO(eustas): replace, when implemented in WASM.
923  return add - mul * x;
924 }
925 
926 // Returns mul * x - sub
927 template <size_t N>
928 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
929  const Vec128<float, N> x,
930  const Vec128<float, N> sub) {
931  // TODO(eustas): replace, when implemented in WASM.
932  // TODO(eustas): is it wasm_f32x4_qfms?
933  return mul * x - sub;
934 }
935 
936 // Returns -mul * x - sub
937 template <size_t N>
938 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
939  const Vec128<float, N> x,
940  const Vec128<float, N> sub) {
941  // TODO(eustas): replace, when implemented in WASM.
942  return Neg(mul) * x - sub;
943 }
944 
945 // ------------------------------ Floating-point square root
946 
947 // Full precision square root
948 template <size_t N>
949 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
950  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
951 }
952 
953 // Approximate reciprocal square root
954 template <size_t N>
955 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
956  // TODO(eustas): find a cheaper way to calculate this.
957  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
958  return one / Sqrt(v);
959 }
960 
961 // ------------------------------ Floating-point rounding
962 
963 // Toward nearest integer, ties to even
964 template <size_t N>
965 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
966  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
967 }
968 
969 // Toward zero, aka truncate
970 template <size_t N>
971 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
972  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
973 }
974 
975 // Toward +infinity, aka ceiling
976 template <size_t N>
977 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
978  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
979 }
980 
981 // Toward -infinity, aka floor
982 template <size_t N>
983 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
984  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
985 }
986 
987 // ================================================== COMPARE
988 
989 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
990 
991 template <typename TFrom, typename TTo, size_t N>
992 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
993  Mask128<TFrom, N> m) {
994  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
995  return Mask128<TTo, N>{m.raw};
996 }
997 
998 template <typename T, size_t N>
999 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1000  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1001  return (v & bit) == bit;
1002 }
1003 
1004 // ------------------------------ Equality
1005 
1006 // Unsigned
1007 template <size_t N>
1009  const Vec128<uint8_t, N> b) {
1010  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
1011 }
1012 template <size_t N>
1014  const Vec128<uint16_t, N> b) {
1015  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
1016 }
1017 template <size_t N>
1019  const Vec128<uint32_t, N> b) {
1020  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
1021 }
1022 template <size_t N>
1024  const Vec128<uint64_t, N> b) {
1025  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
1026 }
1027 
1028 // Signed
1029 template <size_t N>
1031  const Vec128<int8_t, N> b) {
1032  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
1033 }
1034 template <size_t N>
1036  Vec128<int16_t, N> b) {
1037  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
1038 }
1039 template <size_t N>
1041  const Vec128<int32_t, N> b) {
1042  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
1043 }
1044 template <size_t N>
1046  const Vec128<int64_t, N> b) {
1047  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
1048 }
1049 
1050 // Float
1051 template <size_t N>
1053  const Vec128<float, N> b) {
1054  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
1055 }
1056 
1057 // ------------------------------ Inequality
1058 
1059 // Unsigned
1060 template <size_t N>
1062  const Vec128<uint8_t, N> b) {
1063  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
1064 }
1065 template <size_t N>
1067  const Vec128<uint16_t, N> b) {
1068  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
1069 }
1070 template <size_t N>
1072  const Vec128<uint32_t, N> b) {
1073  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
1074 }
1075 template <size_t N>
1077  const Vec128<uint64_t, N> b) {
1078  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
1079 }
1080 
1081 // Signed
1082 template <size_t N>
1084  const Vec128<int8_t, N> b) {
1085  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
1086 }
1087 template <size_t N>
1089  const Vec128<int16_t, N> b) {
1090  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
1091 }
1092 template <size_t N>
1094  const Vec128<int32_t, N> b) {
1095  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
1096 }
1097 template <size_t N>
1099  const Vec128<int64_t, N> b) {
1100  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
1101 }
1102 
1103 // Float
1104 template <size_t N>
1106  const Vec128<float, N> b) {
1107  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
1108 }
1109 
1110 // ------------------------------ Strict inequality
1111 
1112 template <size_t N>
1114  const Vec128<int8_t, N> b) {
1115  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
1116 }
1117 template <size_t N>
1119  const Vec128<int16_t, N> b) {
1120  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
1121 }
1122 template <size_t N>
1124  const Vec128<int32_t, N> b) {
1125  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
1126 }
1127 template <size_t N>
1129  const Vec128<int64_t, N> b) {
1130  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
1131 }
1132 
1133 template <size_t N>
1135  const Vec128<uint8_t, N> b) {
1136  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
1137 }
1138 template <size_t N>
1140  const Vec128<uint16_t, N> b) {
1141  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
1142 }
1143 template <size_t N>
1145  const Vec128<uint32_t, N> b) {
1146  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
1147 }
1148 template <size_t N>
1150  const Vec128<uint64_t, N> b) {
1151  const DFromV<decltype(a)> d;
1152  const Repartition<uint32_t, decltype(d)> d32;
1153  const auto a32 = BitCast(d32, a);
1154  const auto b32 = BitCast(d32, b);
1155  // If the upper halves are not equal, this is the answer.
1156  const auto m_gt = a32 > b32;
1157 
1158  // Otherwise, the lower half decides.
1159  const auto m_eq = a32 == b32;
1160  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
1161  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
1162 
1163  const auto gt = Or(lo_gt, m_gt);
1164  // Copy result in upper 32 bits to lower 32 bits.
1165  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
1166 }
1167 
1168 template <size_t N>
1170  const Vec128<float, N> b) {
1171  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
1172 }
1173 
1174 template <typename T, size_t N>
1175 HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
1176  return operator>(b, a);
1177 }
1178 
1179 // ------------------------------ Weak inequality
1180 
1181 // Float <= >=
1182 template <size_t N>
1183 HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
1184  const Vec128<float, N> b) {
1185  return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
1186 }
1187 template <size_t N>
1188 HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1189  const Vec128<float, N> b) {
1190  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
1191 }
1192 
1193 // ------------------------------ FirstN (Iota, Lt)
1194 
1195 template <typename T, size_t N>
1196 HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
1197  const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
1198  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1199 }
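// e.g. FirstN(d, 2) is a mask whose lanes 0 and 1 are true and all later
// lanes are false.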
1200 
1201 // ================================================== LOGICAL
1202 
1203 // ------------------------------ Not
1204 
1205 template <typename T, size_t N>
1206 HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
1207  return Vec128<T, N>{wasm_v128_not(v.raw)};
1208 }
1209 
1210 // ------------------------------ And
1211 
1212 template <typename T, size_t N>
1213 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
1214  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
1215 }
1216 
1217 // ------------------------------ AndNot
1218 
1219 // Returns ~not_mask & mask.
1220 template <typename T, size_t N>
1221 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
1222  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
1223 }
1224 
1225 // ------------------------------ Or
1226 
1227 template <typename T, size_t N>
1228 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
1229  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
1230 }
1231 
1232 // ------------------------------ Xor
1233 
1234 template <typename T, size_t N>
1235 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
1236  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
1237 }
1238 
1239 // ------------------------------ OrAnd
1240 
1241 template <typename T, size_t N>
1242 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
1243  return Or(o, And(a1, a2));
1244 }
1245 
1246 // ------------------------------ IfVecThenElse
1247 
1248 template <typename T, size_t N>
1249 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
1250  Vec128<T, N> no) {
1251  return IfThenElse(MaskFromVec(mask), yes, no);
1252 }
1253 
1254 // ------------------------------ Operator overloads (internal-only if float)
1255 
1256 template <typename T, size_t N>
1257 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
1258  return And(a, b);
1259 }
1260 
1261 template <typename T, size_t N>
1262 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
1263  return Or(a, b);
1264 }
1265 
1266 template <typename T, size_t N>
1267 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
1268  return Xor(a, b);
1269 }
1270 
1271 // ------------------------------ CopySign
1272 
1273 template <typename T, size_t N>
1274 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
1275  const Vec128<T, N> sign) {
1276  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1277  const auto msb = SignBit(DFromV<decltype(magn)>());
1278  return Or(AndNot(msb, magn), And(msb, sign));
1279 }
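// e.g. CopySign(Set(d, 5.0f), Set(d, -1.0f)) yields -5.0f in every lane.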
1280 
1281 template <typename T, size_t N>
1282 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
1283  const Vec128<T, N> sign) {
1284  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1285  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
1286 }
1287 
1288 // ------------------------------ BroadcastSignBit (compare)
1289 
1290 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1291 HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
1292  return ShiftRight<sizeof(T) * 8 - 1>(v);
1293 }
1294 template <size_t N>
1295 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
1296  const DFromV<decltype(v)> d;
1297  return VecFromMask(d, v < Zero(d));
1298 }
1299 
1300 // ------------------------------ Mask
1301 
1302 // Mask and Vec are the same (true = FF..FF).
1303 template <typename T, size_t N>
1304 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1305  return Mask128<T, N>{v.raw};
1306 }
1307 
1308 template <typename T, size_t N>
1309 HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
1310  return Vec128<T, N>{v.raw};
1311 }
1312 
1313 // mask ? yes : no
1314 template <typename T, size_t N>
1315 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1316  Vec128<T, N> no) {
1317  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
1318 }
1319 
1320 // mask ? yes : 0
1321 template <typename T, size_t N>
1322 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1323  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
1324 }
1325 
1326 // mask ? 0 : no
1327 template <typename T, size_t N>
1328 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1329  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
1330 }
1331 
1332 template <typename T, size_t N>
1333 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1334  Vec128<T, N> no) {
1335  static_assert(IsSigned<T>(), "Only works for signed/float");
1336  const DFromV<decltype(v)> d;
1337  const RebindToSigned<decltype(d)> di;
1338 
1339  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
1340  return IfThenElse(MaskFromVec(v), yes, no);
1341 }
1342 
1343 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1344 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
1345  const DFromV<decltype(v)> d;
1346  const auto zero = Zero(d);
1347  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
1348 }
1349 
1350 // ------------------------------ Mask logical
1351 
1352 template <typename T, size_t N>
1353 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1354  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
1355 }
1356 
1357 template <typename T, size_t N>
1358 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1359  const Simd<T, N, 0> d;
1360  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1361 }
1362 
1363 template <typename T, size_t N>
1364 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1365  const Simd<T, N, 0> d;
1366  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1367 }
1368 
1369 template <typename T, size_t N>
1370 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1371  const Simd<T, N, 0> d;
1372  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1373 }
1374 
1375 template <typename T, size_t N>
1376 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1377  const Simd<T, N, 0> d;
1378  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1379 }
1380 
1381 // ------------------------------ Shl (BroadcastSignBit, IfThenElse)
1382 
1383 // The x86 multiply-by-Pow2() trick will not work because WASM saturates
1384 // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
1385 // scalar count operand, emulating per-lane shifts would require extract_lane
1386 // for each lane plus a shuffle that we can only hope is mapped to a native
1387 // instruction. Using non-vector shifts would incur a store-load forwarding
1388 // stall when loading the result vector. We instead test bits of the shift
1389 // count to "predicate" a shift of the entire vector by a constant.
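// For example, with 16-bit lanes and a count of 5 (binary 0101), only the
// ShiftLeft<4> and ShiftLeft<1> steps below are taken, shifting by 5 total.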
1390 
1391 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1392 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
1393  const DFromV<decltype(v)> d;
1394  Mask128<T, N> mask;
1395  // Need a signed type for BroadcastSignBit.
1396  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1397  // Move the highest valid bit of the shift count into the sign bit.
1398  test = ShiftLeft<12>(test);
1399 
1400  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1401  test = ShiftLeft<1>(test); // next bit (descending order)
1402  v = IfThenElse(mask, ShiftLeft<8>(v), v);
1403 
1404  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1405  test = ShiftLeft<1>(test); // next bit (descending order)
1406  v = IfThenElse(mask, ShiftLeft<4>(v), v);
1407 
1408  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1409  test = ShiftLeft<1>(test); // next bit (descending order)
1410  v = IfThenElse(mask, ShiftLeft<2>(v), v);
1411 
1412  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1413  return IfThenElse(mask, ShiftLeft<1>(v), v);
1414 }
1415 
1416 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1417 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
1418  const DFromV<decltype(v)> d;
1419  Mask128<T, N> mask;
1420  // Need a signed type for BroadcastSignBit.
1421  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1422  // Move the highest valid bit of the shift count into the sign bit.
1423  test = ShiftLeft<27>(test);
1424 
1425  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1426  test = ShiftLeft<1>(test); // next bit (descending order)
1427  v = IfThenElse(mask, ShiftLeft<16>(v), v);
1428 
1429  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1430  test = ShiftLeft<1>(test); // next bit (descending order)
1431  v = IfThenElse(mask, ShiftLeft<8>(v), v);
1432 
1433  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1434  test = ShiftLeft<1>(test); // next bit (descending order)
1435  v = IfThenElse(mask, ShiftLeft<4>(v), v);
1436 
1437  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1438  test = ShiftLeft<1>(test); // next bit (descending order)
1439  v = IfThenElse(mask, ShiftLeft<2>(v), v);
1440 
1441  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1442  return IfThenElse(mask, ShiftLeft<1>(v), v);
1443 }
1444 
1445 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1446 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
1447  const DFromV<decltype(v)> d;
1448  alignas(16) T lanes[2];
1449  alignas(16) T bits_lanes[2];
1450  Store(v, d, lanes);
1451  Store(bits, d, bits_lanes);
1452  lanes[0] <<= bits_lanes[0];
1453  lanes[1] <<= bits_lanes[1];
1454  return Load(d, lanes);
1455 }
1456 
1457 // ------------------------------ Shr (BroadcastSignBit, IfThenElse)
1458 
1459 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1460 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
1461  const DFromV<decltype(v)> d;
1462  Mask128<T, N> mask;
1463  // Need a signed type for BroadcastSignBit.
1464  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1465  // Move the highest valid bit of the shift count into the sign bit.
1466  test = ShiftLeft<12>(test);
1467 
1468  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1469  test = ShiftLeft<1>(test); // next bit (descending order)
1470  v = IfThenElse(mask, ShiftRight<8>(v), v);
1471 
1472  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1473  test = ShiftLeft<1>(test); // next bit (descending order)
1474  v = IfThenElse(mask, ShiftRight<4>(v), v);
1475 
1476  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1477  test = ShiftLeft<1>(test); // next bit (descending order)
1478  v = IfThenElse(mask, ShiftRight<2>(v), v);
1479 
1480  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1481  return IfThenElse(mask, ShiftRight<1>(v), v);
1482 }
1483 
1484 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1485 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
1486  const DFromV<decltype(v)> d;
1487  Mask128<T, N> mask;
1488  // Need a signed type for BroadcastSignBit.
1489  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1490  // Move the highest valid bit of the shift count into the sign bit.
1491  test = ShiftLeft<27>(test);
1492 
1493  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1494  test = ShiftLeft<1>(test); // next bit (descending order)
1495  v = IfThenElse(mask, ShiftRight<16>(v), v);
1496 
1497  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1498  test = ShiftLeft<1>(test); // next bit (descending order)
1499  v = IfThenElse(mask, ShiftRight<8>(v), v);
1500 
1501  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1502  test = ShiftLeft<1>(test); // next bit (descending order)
1503  v = IfThenElse(mask, ShiftRight<4>(v), v);
1504 
1505  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1506  test = ShiftLeft<1>(test); // next bit (descending order)
1507  v = IfThenElse(mask, ShiftRight<2>(v), v);
1508 
1509  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1510  return IfThenElse(mask, ShiftRight<1>(v), v);
1511 }
1512 
1513 // ================================================== MEMORY
1514 
1515 // ------------------------------ Load
1516 
1517 template <typename T>
1518 HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1519  return Vec128<T>{wasm_v128_load(aligned)};
1520 }
1521 
1522 template <typename T, size_t N>
1523 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
1524  const T* HWY_RESTRICT aligned) {
1525  return IfThenElseZero(m, Load(d, aligned));
1526 }
1527 
1528 // Partial load.
1529 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1530 HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
1531  Vec128<T, N> v;
1532  CopyBytes<sizeof(T) * N>(p, &v);
1533  return v;
1534 }
1535 
1536 // LoadU == Load.
1537 template <typename T, size_t N>
1538 HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1539  return Load(d, p);
1540 }
1541 
1542 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1543 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1544 HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1545  return Load(d, p);
1546 }
1547 
1548 // ------------------------------ Store
1549 
1550 template <typename T>
1551 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
1552  wasm_v128_store(aligned, v.raw);
1553 }
1554 
1555 // Partial store.
1556 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1557 HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
1558  CopyBytes<sizeof(T) * N>(&v, p);
1559 }
1560 
1561 HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
1562  float* HWY_RESTRICT p) {
1563  *p = wasm_f32x4_extract_lane(v.raw, 0);
1564 }
1565 
1566 // StoreU == Store.
1567 template <typename T, size_t N>
1568 HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
1569  Store(v, d, p);
1570 }
1571 
1572 template <typename T, size_t N>
1573 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
1574  T* HWY_RESTRICT p) {
1575  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
1576 }
1577 
1578 // ------------------------------ Non-temporal stores
1579 
1580 // Same as aligned stores on non-x86.
1581 
1582 template <typename T, size_t N>
1583 HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
1584  T* HWY_RESTRICT aligned) {
1585  wasm_v128_store(aligned, v.raw);
1586 }
1587 
1588 // ------------------------------ Scatter (Store)
1589 
1590 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
1591 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
1592  T* HWY_RESTRICT base,
1593  const Vec128<Offset, N> offset) {
1594  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1595 
1596  alignas(16) T lanes[N];
1597  Store(v, d, lanes);
1598 
1599  alignas(16) Offset offset_lanes[N];
1600  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
1601 
1602  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
1603  for (size_t i = 0; i < N; ++i) {
1604  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1605  }
1606 }
1607 
1608 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
1609 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
1610  const Vec128<Index, N> index) {
1611  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1612 
1613  alignas(16) T lanes[N];
1614  Store(v, d, lanes);
1615 
1616  alignas(16) Index index_lanes[N];
1617  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
1618 
1619  for (size_t i = 0; i < N; ++i) {
1620  base[index_lanes[i]] = lanes[i];
1621  }
1622 }
1623 
1624 // ------------------------------ Gather (Load/Store)
1625 
1626 template <typename T, size_t N, typename Offset>
1627 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
1628  const T* HWY_RESTRICT base,
1629  const Vec128<Offset, N> offset) {
1630  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1631 
1632  alignas(16) Offset offset_lanes[N];
1633  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
1634 
1635  alignas(16) T lanes[N];
1636  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1637  for (size_t i = 0; i < N; ++i) {
1638  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1639  }
1640  return Load(d, lanes);
1641 }
1642 
1643 template <typename T, size_t N, typename Index>
1644 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
1645  const T* HWY_RESTRICT base,
1646  const Vec128<Index, N> index) {
1647  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1648 
1649  alignas(16) Index index_lanes[N];
1650  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
1651 
1652  alignas(16) T lanes[N];
1653  for (size_t i = 0; i < N; ++i) {
1654  lanes[i] = base[index_lanes[i]];
1655  }
1656  return Load(d, lanes);
1657 }
1658 
1659 // ================================================== SWIZZLE
1660 
1661 // ------------------------------ Extract lane
1662 
1663 // Gets the single value stored in a vector/part.
1664 template <size_t N>
1665 HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
1666  return static_cast<uint8_t>(wasm_i8x16_extract_lane(v.raw, 0));
1667 }
1668 template <size_t N>
1669 HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
1670  return static_cast<int8_t>(wasm_i8x16_extract_lane(v.raw, 0));
1671 }
1672 template <size_t N>
1673 HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
1674  return static_cast<uint16_t>(wasm_i16x8_extract_lane(v.raw, 0));
1675 }
1676 template <size_t N>
1677 HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
1678  return static_cast<int16_t>(wasm_i16x8_extract_lane(v.raw, 0));
1679 }
1680 template <size_t N>
1681 HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
1682  return static_cast<uint32_t>(wasm_i32x4_extract_lane(v.raw, 0));
1683 }
1684 template <size_t N>
1685 HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
1686  return static_cast<int32_t>(wasm_i32x4_extract_lane(v.raw, 0));
1687 }
1688 template <size_t N>
1689 HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
1690  return static_cast<uint64_t>(wasm_i64x2_extract_lane(v.raw, 0));
1691 }
1692 template <size_t N>
1693 HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
1694  return static_cast<int64_t>(wasm_i64x2_extract_lane(v.raw, 0));
1695 }
1696 
1697 template <size_t N>
1698 HWY_API float GetLane(const Vec128<float, N> v) {
1699  return wasm_f32x4_extract_lane(v.raw, 0);
1700 }
1701 
1702 // ------------------------------ LowerHalf
1703 
1704 template <typename T, size_t N>
1705 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
1706  Vec128<T, N> v) {
1707  return Vec128<T, N / 2>{v.raw};
1708 }
1709 
1710 template <typename T, size_t N>
1711 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1712  return LowerHalf(Simd<T, N / 2, 0>(), v);
1713 }
1714 
1715 // ------------------------------ ShiftLeftBytes
1716 
1717 // 0x01..0F, kBytes = 1 => 0x02..0F00
1718 template <int kBytes, typename T, size_t N>
1719 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
1720  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1721  const __i8x16 zero = wasm_i8x16_splat(0);
1722  switch (kBytes) {
1723  case 0:
1724  return v;
1725 
1726  case 1:
1727  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
1728  6, 7, 8, 9, 10, 11, 12, 13, 14)};
1729 
1730  case 2:
1731  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
1732  5, 6, 7, 8, 9, 10, 11, 12, 13)};
1733 
1734  case 3:
1735  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
1736  3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
1737 
1738  case 4:
1739  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
1740  2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
1741 
1742  case 5:
1743  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
1744  1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
1745 
1746  case 6:
1747  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1748  16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
1749 
1750  case 7:
1751  return Vec128<T, N>{wasm_i8x16_shuffle(
1752  v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
1753 
1754  case 8:
1755  return Vec128<T, N>{wasm_i8x16_shuffle(
1756  v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
1757 
1758  case 9:
1759  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1760  16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
1761  6)};
1762 
1763  case 10:
1764  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1765  16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
1766  5)};
1767 
1768  case 11:
1769  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1770  16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
1771  4)};
1772 
1773  case 12:
1774  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1775  16, 16, 16, 16, 16, 16, 16, 0, 1,
1776  2, 3)};
1777 
1778  case 13:
1779  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1780  16, 16, 16, 16, 16, 16, 16, 16, 0,
1781  1, 2)};
1782 
1783  case 14:
1784  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1785  16, 16, 16, 16, 16, 16, 16, 16, 16,
1786  0, 1)};
1787 
1788  case 15:
1789  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1790  16, 16, 16, 16, 16, 16, 16, 16, 16,
1791  16, 0)};
1792  }
1793  return Vec128<T, N>{zero};
1794 }
1795 
1796 template <int kBytes, typename T, size_t N>
1797 HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
1798  return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
1799 }
1800 
1801 // ------------------------------ ShiftLeftLanes
1802 
1803 template <int kLanes, typename T, size_t N>
1804 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
1805  const Repartition<uint8_t, decltype(d)> d8;
1806  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
1807 }
1808 
1809 template <int kLanes, typename T, size_t N>
1810 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
1811  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
1812 }
1813 
1814 // ------------------------------ ShiftRightBytes
1815 namespace detail {
1816 
1817 // Helper function allows zeroing invalid lanes in caller.
1818 template <int kBytes, typename T, size_t N>
1819 HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
1820  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1821  const __i8x16 zero = wasm_i8x16_splat(0);
1822 
1823  switch (kBytes) {
1824  case 0:
1825  return v.raw;
1826 
1827  case 1:
1828  return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1829  12, 13, 14, 15, 16);
1830 
1831  case 2:
1832  return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1833  13, 14, 15, 16, 16);
1834 
1835  case 3:
1836  return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1837  13, 14, 15, 16, 16, 16);
1838 
1839  case 4:
1840  return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1841  14, 15, 16, 16, 16, 16);
1842 
1843  case 5:
1844  return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1845  15, 16, 16, 16, 16, 16);
1846 
1847  case 6:
1848  return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1849  16, 16, 16, 16, 16, 16);
1850 
1851  case 7:
1852  return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1853  16, 16, 16, 16, 16, 16, 16);
1854 
1855  case 8:
1856  return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1857  16, 16, 16, 16, 16, 16, 16);
1858 
1859  case 9:
1860  return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
1861  16, 16, 16, 16, 16, 16, 16);
1862 
1863  case 10:
1864  return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
1865  16, 16, 16, 16, 16, 16, 16);
1866 
1867  case 11:
1868  return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
1869  16, 16, 16, 16, 16, 16, 16);
1870 
1871  case 12:
1872  return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
1873  16, 16, 16, 16, 16, 16, 16);
1874 
1875  case 13:
1876  return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
1877  16, 16, 16, 16, 16, 16, 16);
1878 
1879  case 14:
1880  return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
1881  16, 16, 16, 16, 16, 16, 16);
1882 
1883  case 15:
1884  return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
1885  16, 16, 16, 16, 16, 16, 16);
1886  case 16:
1887  return zero;
1888  }
1889 }
1890 
1891 } // namespace detail
1892 
1893 // 0x01..0F, kBytes = 1 => 0x0001..0E
1894 template <int kBytes, typename T, size_t N>
1895 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
1896  // For partial vectors, clear upper lanes so we shift in zeros.
1897  if (N != 16 / sizeof(T)) {
1898  const Vec128<T> vfull{v.raw};
1899  v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
1900  }
1901  return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
1902 }
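// Editor's sketch (not part of the original file): the comment above,
// "0x01..0F, kBytes = 1 => 0x0001..0E", in code form. Assumes this is
// compiled inside HWY_NAMESPACE like the rest of this header; the function
// name is hypothetical.
HWY_MAYBE_UNUSED static void ExampleShiftRightBytes() {
  const Full128<uint8_t> d;
  const Vec128<uint8_t> v = Iota(d, 1);                // 0x01, 0x02, .., 0x10
  const Vec128<uint8_t> r = ShiftRightBytes<1>(d, v);  // 0x02, .., 0x10, 0x00
  (void)r;  // bytes move toward lane 0; zeros are shifted in at the top
}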
1903 
1904 // ------------------------------ ShiftRightLanes
1905 template <int kLanes, typename T, size_t N>
1906 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
1907  const Repartition<uint8_t, decltype(d)> d8;
1908  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
1909 }
1910 
1911 // ------------------------------ UpperHalf (ShiftRightBytes)
1912 
1913 // Full input: copy hi into lo (smaller instruction encoding than shifts).
1914 template <typename T>
1915 HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
1916  return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
1917 }
1918 HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
1919  return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
1920 }
1921 
1922 // Partial
1923 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1924 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
1925  Vec128<T, N> v) {
1926  const DFromV<decltype(v)> d;
1927  const RebindToUnsigned<decltype(d)> du;
1928  const auto vu = BitCast(du, v);
1929  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
1930  return Vec128<T, (N + 1) / 2>{upper.raw};
1931 }
1932 
1933 // ------------------------------ CombineShiftRightBytes
1934 
1935 template <int kBytes, typename T, class V = Vec128<T>>
1936 HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
1937  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1938  switch (kBytes) {
1939  case 0:
1940  return lo;
1941 
1942  case 1:
1943  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1944  11, 12, 13, 14, 15, 16)};
1945 
1946  case 2:
1947  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1948  11, 12, 13, 14, 15, 16, 17)};
1949 
1950  case 3:
1951  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1952  12, 13, 14, 15, 16, 17, 18)};
1953 
1954  case 4:
1955  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1956  13, 14, 15, 16, 17, 18, 19)};
1957 
1958  case 5:
1959  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1960  14, 15, 16, 17, 18, 19, 20)};
1961 
1962  case 6:
1963  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
1964  14, 15, 16, 17, 18, 19, 20, 21)};
1965 
1966  case 7:
1967  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
1968  15, 16, 17, 18, 19, 20, 21, 22)};
1969 
1970  case 8:
1971  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
1972  16, 17, 18, 19, 20, 21, 22, 23)};
1973 
1974  case 9:
1975  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
1976  17, 18, 19, 20, 21, 22, 23, 24)};
1977 
1978  case 10:
1979  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
1980  17, 18, 19, 20, 21, 22, 23, 24, 25)};
1981 
1982  case 11:
1983  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
1984  18, 19, 20, 21, 22, 23, 24, 25, 26)};
1985 
1986  case 12:
1987  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
1988  19, 20, 21, 22, 23, 24, 25, 26, 27)};
1989 
1990  case 13:
1991  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
1992  20, 21, 22, 23, 24, 25, 26, 27, 28)};
1993 
1994  case 14:
1995  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
1996  21, 22, 23, 24, 25, 26, 27, 28, 29)};
1997 
1998  case 15:
1999  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2000  22, 23, 24, 25, 26, 27, 28, 29, 30)};
2001  }
2002  return hi;
2003 }
2004 
2005 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
2006  class V = Vec128<T, N>>
2007 HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
2008  constexpr size_t kSize = N * sizeof(T);
2009  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
2010  const Repartition<uint8_t, decltype(d)> d8;
2011  const Full128<uint8_t> d_full8;
2012  using V8 = VFromD<decltype(d_full8)>;
2013  const V8 hi8{BitCast(d8, hi).raw};
2014  // Move into most-significant bytes
2015  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
2016  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
2017  return V{BitCast(Full128<T>(), r).raw};
2018 }
2019 
2020 // ------------------------------ Broadcast/splat any lane
2021 
2022 template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2023 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2024  static_assert(0 <= kLane && kLane < N, "Invalid lane");
2025  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
2026  kLane, kLane, kLane, kLane, kLane)};
2027 }
2028 
2029 template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2030 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2031  static_assert(0 <= kLane && kLane < N, "Invalid lane");
2032  return Vec128<T, N>{
2033  wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
2034 }
2035 
2036 template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2037 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
2038  static_assert(0 <= kLane && kLane < N, "Invalid lane");
2039  return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
2040 }
2041 
2042 // ------------------------------ TableLookupBytes
2043 
2044 // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
2045 // lane indices in [0, 16).
2046 template <typename T, size_t N, typename TI, size_t NI>
2047 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
2048  const Vec128<TI, NI> from) {
2049 // Not yet available in all engines, see
2050 // https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
2051 // V8 implementation of this had a bug, fixed on 2021-04-03:
2052 // https://chromium-review.googlesource.com/c/v8/v8/+/2822951
2053 #if 0
2054  return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
2055 #else
2056  alignas(16) uint8_t control[16];
2057  alignas(16) uint8_t input[16];
2058  alignas(16) uint8_t output[16];
2059  wasm_v128_store(control, from.raw);
2060  wasm_v128_store(input, bytes.raw);
2061  for (size_t i = 0; i < 16; ++i) {
2062  output[i] = control[i] < 16 ? input[control[i]] : 0;
2063  }
2064  return Vec128<TI, NI>{wasm_v128_load(output)};
2065 #endif
2066 }
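// Editor's sketch (not part of the original file): byte-level reversal via
// TableLookupBytes, per the comment above (indices in [0, 16)). Assumes
// HWY_NAMESPACE context; the function name and kReverse table are
// hypothetical.
HWY_MAYBE_UNUSED static void ExampleTableLookupBytes() {
  const Full128<uint8_t> d;
  alignas(16) static constexpr uint8_t kReverse[16] = {
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  const auto bytes = Iota(d, 0);                        // 0, 1, .., 15
  const auto from = Load(d, kReverse);                  // byte indices
  const auto reversed = TableLookupBytes(bytes, from);  // 15, 14, .., 0
  (void)reversed;
}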
2067 
2068 template <typename T, size_t N, typename TI, size_t NI>
2069 HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
2070  const Vec128<TI, NI> from) {
2071  const Simd<TI, NI, 0> d;
2072  // Mask size must match vector type, so cast everything to this type.
2073  Repartition<int8_t, decltype(d)> di8;
2074  Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
2075  const auto msb = BitCast(di8, from) < Zero(di8);
2076  const auto lookup =
2077  TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
2078  return BitCast(d, IfThenZeroElse(msb, lookup));
2079 }
2080 
2081 // ------------------------------ Hard-coded shuffles
2082 
2083 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
2084 // Shuffle0321 rotates one lane to the right (the previous least-significant
2085 // lane is now most-significant). These could also be implemented via
2086 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
2087 
2088 // Swap 32-bit halves in 64-bit halves.
2089 template <typename T, size_t N>
2090 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
2091  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2092  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2093  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
2094 }
2095 
2096 // Swap 64-bit halves
2097 template <typename T>
2098 HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
2099  static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
2100  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2101 }
2102 template <typename T>
2103 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
2104  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2105  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
2106 }
2107 
2108 // Rotate right 32 bits
2109 template <typename T>
2110 HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
2111  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2112  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
2113 }
2114 
2115 // Rotate left 32 bits
2116 template <typename T>
2117 HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
2118  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2119  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
2120 }
2121 
2122 // Reverse
2123 template <typename T>
2124 HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
2125  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
2126  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
2127 }
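// Editor's sketch (not part of the original file): the shuffle notation in
// code, for Vec128<uint32_t> with lanes written 3,2,1,0 (lane 0 is the
// least-significant). Assumes HWY_NAMESPACE context; the function name is
// hypothetical.
HWY_MAYBE_UNUSED static void ExampleShuffles() {
  const Full128<uint32_t> d;
  const auto v = Iota(d, 0);             // lane i holds i
  const auto swapped = Shuffle2301(v);   // lanes 2,3,0,1: swaps 32-bit pairs
  const auto rotated = Shuffle0321(v);   // lanes 0,3,2,1: rotate right by one
  const auto reversed = Shuffle0123(v);  // lanes 0,1,2,3: full reversal
  (void)swapped;
  (void)rotated;
  (void)reversed;
}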
2128 
2129 // ------------------------------ TableLookupLanes
2130 
2131 // Returned by SetTableIndices for use by TableLookupLanes.
2132 template <typename T, size_t N>
2133 struct Indices128 {
2134  __v128_u raw;
2135 };
2136 
2137 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2138 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
2139  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2140 #if HWY_IS_DEBUG_BUILD
2141  const Rebind<TI, decltype(d)> di;
2142  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2143  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
2144 #endif
2145 
2146  const Repartition<uint8_t, decltype(d)> d8;
2147  using V8 = VFromD<decltype(d8)>;
2148  const Repartition<uint16_t, decltype(d)> d16;
2149 
2150  // Broadcast each lane index to all bytes of T and shift to bytes
2151  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
2152  if (sizeof(T) == 4) {
2153  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2154  0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2155  const V8 lane_indices =
2156  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2157  const V8 byte_indices =
2158  BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
2159  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2160  0, 1, 2, 3, 0, 1, 2, 3};
2161  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2162  } else {
2163  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2164  0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2165  const V8 lane_indices =
2166  TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
2167  const V8 byte_indices =
2168  BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
2169  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2170  0, 1, 2, 3, 4, 5, 6, 7};
2171  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
2172  }
2173 }
2174 
2175 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
2176 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
2177  const Rebind<TI, decltype(d)> di;
2178  return IndicesFromVec(d, LoadU(di, idx));
2179 }
2180 
2181 template <typename T, size_t N>
2182 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2183  using TI = MakeSigned<T>;
2184  const DFromV<decltype(v)> d;
2185  const Rebind<TI, decltype(d)> di;
2186  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
2187 }
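// Editor's sketch (not part of the original file): typical pairing of
// SetTableIndices and TableLookupLanes for a 32-bit permutation. Assumes
// HWY_NAMESPACE context; the function name and kIdx values are hypothetical.
HWY_MAYBE_UNUSED static void ExampleTableLookupLanes() {
  const Full128<uint32_t> d;
  alignas(16) static constexpr int32_t kIdx[4] = {3, 0, 2, 1};
  const auto v = Iota(d, 100);                     // lanes 100, 101, 102, 103
  const auto idx = SetTableIndices(d, kIdx);       // result lane i = v[kIdx[i]]
  const auto permuted = TableLookupLanes(v, idx);  // lanes 103, 100, 102, 101
  (void)permuted;
}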
2188 
2189 // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
2190 
2191 // Single lane: no change
2192 template <typename T>
2193 HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
2194  return v;
2195 }
2196 
2197 // Two lanes: shuffle
2198 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2199 HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
2200  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
2201 }
2202 
2203 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2204 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2205  return Shuffle01(v);
2206 }
2207 
2208 // Four lanes: shuffle
2209 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2210 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2211  return Shuffle0123(v);
2212 }
2213 
2214 // 16-bit
2215 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2216 HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
2217  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2218  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
2219 }
2220 
2221 // ------------------------------ Reverse2
2222 
2223 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2224 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
2225  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2226  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2227 }
2228 
2229 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2230 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2231  return Shuffle2301(v);
2232 }
2233 
2234 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2235 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2236  return Shuffle01(v);
2237 }
2238 
2239 // ------------------------------ Reverse4
2240 
2241 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2242 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
2243  return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
2244  1, 0, 7, 6, 5, 4)});
2245 }
2246 
2247 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2248 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
2249  return Shuffle0123(v);
2250 }
2251 
2252 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2253 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
2254  HWY_ASSERT(0); // don't have 8 u64 lanes
2255 }
2256 
2257 // ------------------------------ Reverse8
2258 
2259 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2260 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
2261  return Reverse(d, v);
2262 }
2263 
2264 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2265 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
2266  HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
2267 }
2268 
2269 // ------------------------------ InterleaveLower
2270 
2271 template <size_t N>
2272 HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
2273  Vec128<uint8_t, N> b) {
2274  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
2275  a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2276 }
2277 template <size_t N>
2278 HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
2279  Vec128<uint16_t, N> b) {
2280  return Vec128<uint16_t, N>{
2281  wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2282 }
2283 template <size_t N>
2284 HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
2285  Vec128<uint32_t, N> b) {
2286  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2287 }
2288 template <size_t N>
2289 HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
2290  Vec128<uint64_t, N> b) {
2291  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2292 }
2293 
2294 template <size_t N>
2295 HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
2296  Vec128<int8_t, N> b) {
2297  return Vec128<int8_t, N>{wasm_i8x16_shuffle(
2298  a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2299 }
2300 template <size_t N>
2301 HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
2302  Vec128<int16_t, N> b) {
2303  return Vec128<int16_t, N>{
2304  wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2305 }
2306 template <size_t N>
2307 HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
2308  Vec128<int32_t, N> b) {
2309  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2310 }
2311 template <size_t N>
2312 HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
2313  Vec128<int64_t, N> b) {
2314  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2315 }
2316 
2317 template <size_t N>
2318 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
2319  Vec128<float, N> b) {
2320  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2321 }
2322 
2323 // Additional overload for the optional tag.
2324 template <class V>
2325 HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2326  return InterleaveLower(a, b);
2327 }
2328 
2329 // ------------------------------ InterleaveUpper (UpperHalf)
2330 
2331 // All functions inside detail lack the required D parameter.
2332 namespace detail {
2333 
2334 template <size_t N>
2335 HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
2336  Vec128<uint8_t, N> b) {
2337  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2338  26, 11, 27, 12, 28, 13, 29, 14,
2339  30, 15, 31)};
2340 }
2341 template <size_t N>
2342 HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
2343  Vec128<uint16_t, N> b) {
2344  return Vec128<uint16_t, N>{
2345  wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2346 }
2347 template <size_t N>
2348 HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
2349  Vec128<uint32_t, N> b) {
2350  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2351 }
2352 template <size_t N>
2353 HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
2354  Vec128<uint64_t, N> b) {
2355  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2356 }
2357 
2358 template <size_t N>
2359 HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
2360  Vec128<int8_t, N> b) {
2361  return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2362  26, 11, 27, 12, 28, 13, 29, 14,
2363  30, 15, 31)};
2364 }
2365 template <size_t N>
2366 HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
2367  Vec128<int16_t, N> b) {
2368  return Vec128<int16_t, N>{
2369  wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2370 }
2371 template <size_t N>
2372 HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
2373  Vec128<int32_t, N> b) {
2374  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2375 }
2376 template <size_t N>
2377 HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
2378  Vec128<int64_t, N> b) {
2379  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2380 }
2381 
2382 template <size_t N>
2383 HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
2384  Vec128<float, N> b) {
2385  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2386 }
2387 
2388 } // namespace detail
2389 
2390 // Full
2391 template <typename T, class V = Vec128<T>>
2392 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
2393  return detail::InterleaveUpper(a, b);
2394 }
2395 
2396 // Partial
2397 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
2398 HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
2399  const Half<decltype(d)> d2;
2400  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
2401 }
2402 
2403 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2404 
2405 // Same as Interleave*, except that the return lanes are double-width integers;
2406 // this is necessary because the single-lane scalar cannot return two values.
2407 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2408 HWY_API VFromD<DW> ZipLower(V a, V b) {
2409  return BitCast(DW(), InterleaveLower(a, b));
2410 }
2411 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2412 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2413  return BitCast(dw, InterleaveLower(D(), a, b));
2414 }
2415 
2416 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2417 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2418  return BitCast(dw, InterleaveUpper(D(), a, b));
2419 }
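// Editor's sketch (not part of the original file): ZipLower on u8 inputs
// yields u16 lanes; on a little-endian target, lane i of the result is
// a[i] | (b[i] << 8) for the lower half of the inputs. Assumes
// HWY_NAMESPACE context; the function name is hypothetical.
HWY_MAYBE_UNUSED static void ExampleZipLower() {
  const Full128<uint8_t> d8;
  const auto a = Iota(d8, 0);     // 0, 1, 2, ..
  const auto b = Iota(d8, 0x80);  // 0x80, 0x81, ..
  const Vec128<uint16_t> zipped = ZipLower(a, b);  // 0x8000, 0x8101, ..
  (void)zipped;
}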
2420 
2421 // ================================================== COMBINE
2422 
2423 // ------------------------------ Combine (InterleaveLower)
2424 
2425 // N = N/2 + N/2 (upper half undefined)
2426 template <typename T, size_t N>
2427 HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
2428  Vec128<T, N / 2> lo_half) {
2429  const Half<decltype(d)> d2;
2430  const RebindToUnsigned<decltype(d2)> du2;
2431  // Treat half-width input as one lane, and expand to two lanes.
2432  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
2433  const VU lo{BitCast(du2, lo_half).raw};
2434  const VU hi{BitCast(du2, hi_half).raw};
2435  return BitCast(d, InterleaveLower(lo, hi));
2436 }
2437 
2438 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
2439 
2440 template <typename T, size_t N>
2441 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
2442  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
2443 }
2444 
2445 // ------------------------------ ConcatLowerLower
2446 
2447 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2448 template <typename T>
2449 HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
2450  const Vec128<T> lo) {
2451  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
2452 }
2453 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2454 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2455  const Vec128<T, N> lo) {
2456  const Half<decltype(d)> d2;
2457  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
2458 }
2459 
2460 // ------------------------------ ConcatUpperUpper
2461 
2462 template <typename T>
2463 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
2464  const Vec128<T> lo) {
2465  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
2466 }
2467 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2468 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2469  const Vec128<T, N> lo) {
2470  const Half<decltype(d)> d2;
2471  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
2472 }
2473 
2474 // ------------------------------ ConcatLowerUpper
2475 
2476 template <typename T>
2477 HWY_API Vec128<T> ConcatLowerUpper(const Full128<T> d, const Vec128<T> hi,
2478  const Vec128<T> lo) {
2479  return CombineShiftRightBytes<8>(d, hi, lo);
2480 }
2481 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2482 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
2483  const Vec128<T, N> lo) {
2484  const Half<decltype(d)> d2;
2485  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
2486 }
2487 
2488 // ------------------------------ ConcatUpperLower
2489 template <typename T, size_t N>
2490 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
2491  const Vec128<T, N> lo) {
2492  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2493 }
2494 
2495 // ------------------------------ ConcatOdd
2496 
2497 // 32-bit full
2498 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2499 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2500  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2501 }
2502 
2503 // 32-bit partial
2504 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2505 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> /* tag */, Vec128<T, 2> hi,
2506  Vec128<T, 2> lo) {
2507  return InterleaveUpper(Simd<T, 2, 0>(), lo, hi);
2508 }
2509 
2510 // 64-bit full - no partial because we need at least two inputs to have
2511 // even/odd.
2512 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2513 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2514  return InterleaveUpper(Full128<T>(), lo, hi);
2515 }
2516 
2517 // ------------------------------ ConcatEven (InterleaveLower)
2518 
2519 // 32-bit full
2520 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2521 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2522  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2523 }
2524 
2525 // 32-bit partial
2526 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2527 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> /* tag */, Vec128<T, 2> hi,
2528  Vec128<T, 2> lo) {
2529  return InterleaveLower(Simd<T, 2, 0>(), lo, hi);
2530 }
2531 
2532 // 64-bit full - no partial because we need at least two inputs to have
2533 // even/odd.
2534 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2535 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2536  return InterleaveLower(Full128<T>(), lo, hi);
2537 }
2538 
2539 // ------------------------------ DupEven (InterleaveLower)
2540 
2541 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2542 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
2543  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
2544 }
2545 
2546 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2547 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
2548  return InterleaveLower(DFromV<decltype(v)>(), v, v);
2549 }
2550 
2551 // ------------------------------ DupOdd (InterleaveUpper)
2552 
2553 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2554 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
2555  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
2556 }
2557 
2558 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2559 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
2560  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
2561 }
2562 
2563 // ------------------------------ OddEven
2564 
2565 namespace detail {
2566 
2567 template <typename T, size_t N>
2568 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
2569  const Vec128<T, N> b) {
2570  const DFromV<decltype(a)> d;
2571  const Repartition<uint8_t, decltype(d)> d8;
2572  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2573  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2574  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
2575 }
2576 template <typename T, size_t N>
2577 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
2578  const Vec128<T, N> b) {
2579  return Vec128<T, N>{
2580  wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2581 }
2582 template <typename T, size_t N>
2583 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
2584  const Vec128<T, N> b) {
2585  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2586 }
2587 template <typename T, size_t N>
2588 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
2589  const Vec128<T, N> b) {
2590  return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
2591 }
2592 
2593 } // namespace detail
2594 
2595 template <typename T, size_t N>
2596 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
2597  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
2598 }
2599 template <size_t N>
2600 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
2601  const Vec128<float, N> b) {
2602  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2603 }
2604 
2605 // ------------------------------ OddEvenBlocks
2606 template <typename T, size_t N>
2607 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
2608  return even;
2609 }
2610 
2611 // ------------------------------ SwapAdjacentBlocks
2612 
2613 template <typename T, size_t N>
2614 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
2615  return v;
2616 }
2617 
2618 // ------------------------------ ReverseBlocks
2619 
2620 // Single block: no change
2621 template <typename T>
2622 HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
2623  return v;
2624 }
2625 
2626 // ================================================== CONVERT
2627 
2628 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2629 
2630 // Unsigned: zero-extend.
2631 template <size_t N>
2632 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
2633  const Vec128<uint8_t, N> v) {
2634  return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
2635 }
2636 template <size_t N>
2637 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
2638  const Vec128<uint8_t, N> v) {
2639  return Vec128<uint32_t, N>{
2640  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2641 }
2642 template <size_t N>
2643 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
2644  const Vec128<uint8_t, N> v) {
2645  return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
2646 }
2647 template <size_t N>
2648 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
2649  const Vec128<uint8_t, N> v) {
2650  return Vec128<int32_t, N>{
2651  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2652 }
2653 template <size_t N>
2654 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
2655  const Vec128<uint16_t, N> v) {
2656  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
2657 }
2658 template <size_t N>
2659 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
2660  const Vec128<uint32_t, N> v) {
2661  return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
2662 }
2663 
2664 template <size_t N>
2665 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
2666  const Vec128<uint16_t, N> v) {
2667  return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
2668 }
2669 
2670 // Signed: replicate sign bit.
2671 template <size_t N>
2672 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
2673  const Vec128<int8_t, N> v) {
2674  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
2675 }
2676 template <size_t N>
2677 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
2678  const Vec128<int8_t, N> v) {
2679  return Vec128<int32_t, N>{
2680  wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
2681 }
2682 template <size_t N>
2683 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
2684  const Vec128<int16_t, N> v) {
2685  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
2686 }
2687 template <size_t N>
2688 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
2689  const Vec128<int32_t, N> v) {
2690  return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
2691 }
2692 
2693 template <size_t N>
2694 HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
2695  const Vec128<int32_t, N> v) {
2696  return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
2697 }
2698 
2699 template <size_t N>
2700 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
2701  const Vec128<float16_t, N> v) {
2702  const RebindToSigned<decltype(df32)> di32;
2703  const RebindToUnsigned<decltype(df32)> du32;
2704  // Expand to u32 so we can shift.
2705  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
2706  const auto sign = ShiftRight<15>(bits16);
2707  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2708  const auto mantissa = bits16 & Set(du32, 0x3FF);
2709  const auto subnormal =
2710  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2711  Set(df32, 1.0f / 16384 / 1024));
2712 
2713  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2714  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2715  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2716  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2717  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2718 }
2719 
2720 template <size_t N>
2721 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
2722  const Vec128<bfloat16_t, N> v) {
2723  const Rebind<uint16_t, decltype(df32)> du16;
2724  const RebindToSigned<decltype(df32)> di32;
2725  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2726 }
2727 
2728 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2729 
2730 template <size_t N>
2731 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
2732  const Vec128<int32_t, N> v) {
2733  return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
2734 }
2735 
2736 template <size_t N>
2737 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
2738  const Vec128<int32_t, N> v) {
2739  return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
2740 }
2741 
2742 template <size_t N>
2743 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
2744  const Vec128<int32_t, N> v) {
2745  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2746  return Vec128<uint8_t, N>{
2747  wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2748 }
2749 
2750 template <size_t N>
2751 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
2752  const Vec128<int16_t, N> v) {
2753  return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
2754 }
2755 
2756 template <size_t N>
2757 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
2758  const Vec128<int32_t, N> v) {
2759  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2760  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2761 }
2762 
2763 template <size_t N>
2764 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
2765  const Vec128<int16_t, N> v) {
2766  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
2767 }
2768 
2769 template <size_t N>
2770 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
2771  const Vec128<double, N> v) {
2772  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
2773 }
2774 
2775 template <size_t N>
2776 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
2777  const Vec128<float, N> v) {
2778  const RebindToUnsigned<decltype(df16)> du16;
2779  const Rebind<uint32_t, decltype(du16)> du;
2780  const RebindToSigned<decltype(du)> di;
2781  const auto bits32 = BitCast(du, v);
2782  const auto sign = ShiftRight<31>(bits32);
2783  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2784  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2785 
2786  const auto k15 = Set(di, 15);
2787  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2788  const auto is_tiny = exp < Set(di, -24);
2789 
2790  const auto is_subnormal = exp < Set(di, -14);
2791  const auto biased_exp16 =
2792  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2793  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2794  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2795  (mantissa32 >> (Set(du, 13) + sub_exp));
2796  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2797  ShiftRight<13>(mantissa32)); // <1024
2798 
2799  const auto sign16 = ShiftLeft<15>(sign);
2800  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2801  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2802  return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
2803 }
2804 
2805 template <size_t N>
2806 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
2807  const Vec128<float, N> v) {
2808  const Rebind<int32_t, decltype(dbf16)> di32;
2809  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2810  const Rebind<uint16_t, decltype(dbf16)> du16;
2811  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2812  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2813 }
2814 
2815 template <size_t N>
2816 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
2817  Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
2818  const RebindToUnsigned<decltype(dbf16)> du16;
2819  const Repartition<uint32_t, decltype(dbf16)> du32;
2820  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
2821  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2822 }
2823 
2824 // For already range-limited input [0, 255].
2825 template <size_t N>
2826 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
2827  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2828  return Vec128<uint8_t, N>{
2829  wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2830 }
2831 
2832 // ------------------------------ Convert i32 <=> f32 (Round)
2833 
2834 template <size_t N>
2835 HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
2836  const Vec128<int32_t, N> v) {
2837  return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
2838 }
2839 // Truncates (rounds toward zero).
2840 template <size_t N>
2841 HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
2842  const Vec128<float, N> v) {
2843  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
2844 }
2845 
2846 template <size_t N>
2847 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
2848  return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
2849 }
2850 
2851 // ================================================== MISC
2852 
2853 // ------------------------------ SumsOf8 (ShiftRight, Add)
2854 template <size_t N>
2855 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
2856  const DFromV<decltype(v)> du8;
2857  const RepartitionToWide<decltype(du8)> du16;
2858  const RepartitionToWide<decltype(du16)> du32;
2859  const RepartitionToWide<decltype(du32)> du64;
2860  using VU16 = VFromD<decltype(du16)>;
2861 
2862  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
2863  const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
2864  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
2865 
2866  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
2867  BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
2868  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
2869  Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
2870  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
2871  BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
2872  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
2873  Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
2874  return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
2875 }
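// Editor's note (not part of the original file): scalar model of SumsOf8 for
// reference; each u64 output lane holds the sum of 8 consecutive u8 inputs.
HWY_MAYBE_UNUSED static inline void SumsOf8Scalar(const uint8_t in[16],
                                                  uint64_t out[2]) {
  for (size_t group = 0; group < 2; ++group) {
    uint64_t sum = 0;
    for (size_t i = 0; i < 8; ++i) sum += in[group * 8 + i];
    out[group] = sum;  // matches lane `group` of the vector result above
  }
}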
2876 
2877 // ------------------------------ LoadMaskBits (TestBit)
2878 
2879 namespace detail {
2880 
2881 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2882 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
2883  const RebindToUnsigned<decltype(d)> du;
2884  // Easier than Set(), which would require an >8-bit type, which would not
2885  // compile for T=uint8_t, N=1.
2886  const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
2887 
2888  // Replicate bytes 8x such that each byte contains the bit that governs it.
2889  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2890  1, 1, 1, 1, 1, 1, 1, 1};
2891  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
2892 
2893  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2894  1, 2, 4, 8, 16, 32, 64, 128};
2895  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
2896 }
2897 
2898 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2899 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
2900  const RebindToUnsigned<decltype(d)> du;
2901  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2902  return RebindMask(
2903  d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
2904 }
2905 
2906 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2907 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
2908  const RebindToUnsigned<decltype(d)> du;
2909  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2910  return RebindMask(
2911  d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
2912 }
2913 
2914 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2915 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
2916  const RebindToUnsigned<decltype(d)> du;
2917  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
2918  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2919 }
2920 
2921 } // namespace detail
2922 
2923 // `p` points to at least 8 readable bytes, not all of which need be valid.
2924 template <typename T, size_t N, HWY_IF_LE128(T, N)>
2925 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
2926  const uint8_t* HWY_RESTRICT bits) {
2927  uint64_t mask_bits = 0;
2928  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
2929  return detail::LoadMaskBits(d, mask_bits);
2930 }
2931 
2932 // ------------------------------ Mask
2933 
2934 namespace detail {
2935 
2936 // Full
2937 template <typename T>
2938 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2939  const Mask128<T> mask) {
2940  alignas(16) uint64_t lanes[2];
2941  wasm_v128_store(lanes, mask.raw);
2942 
2943  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2944  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2945  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2946  return (hi + lo);
2947 }
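// Editor's note (not part of the original file): the kMagic multiply above is
// a standard horizontal-pack trick. Each input byte is 0x00 or 0xFF; the
// multiplication sums shifted copies of kMagic so that, after the >> 56,
// bit i of the result indicates whether byte i of the input was set. Scalar
// model (hypothetical helper, mirrors the expression used for `lo`):
HWY_MAYBE_UNUSED static inline uint64_t PackMaskBytes(uint64_t mask_bytes) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (mask_bytes * kMagic) >> 56;  // one bit per input byte
}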
2948 
2949 // 64-bit
2950 template <typename T>
2951 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2952  const Mask128<T, 8> mask) {
2953  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2954  return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
2955  kMagic) >>
2956  56;
2957 }
2958 
2959 // 32-bit or less: need masking
2960 template <typename T, size_t N, HWY_IF_LE32(T, N)>
2961 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2962  const Mask128<T, N> mask) {
2963  uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
2964  // Clear potentially undefined bytes.
2965  bytes &= (1ULL << (N * 8)) - 1;
2966  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2967  return (bytes * kMagic) >> 56;
2968 }
2969 
2970 template <typename T, size_t N>
2971 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
2972  const Mask128<T, N> mask) {
2973  // Remove useless lower half of each u16 while preserving the sign bit.
2974  const __i16x8 zero = wasm_i16x8_splat(0);
2975  const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
2976  return BitsFromMask(hwy::SizeTag<1>(), mask8);
2977 }
2978 
2979 template <typename T, size_t N>
2980 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
2981  const Mask128<T, N> mask) {
2982  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
2983  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2984  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2985  alignas(16) uint32_t lanes[4];
2986  wasm_v128_store(lanes, sliced_mask);
2987  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2988 }
2989 
2990 template <typename T, size_t N>
2991 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
2992  const Mask128<T, N> mask) {
2993  const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
2994  const __i64x2 slice = wasm_i64x2_make(1, 2);
2995  const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
2996  alignas(16) uint64_t lanes[2];
2997  wasm_v128_store(lanes, sliced_mask);
2998  return lanes[0] | lanes[1];
2999 }
3000 
3001 // Returns the lowest N bits for the BitsFromMask result.
3002 template <typename T, size_t N>
3003 constexpr uint64_t OnlyActive(uint64_t bits) {
3004  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
3005 }
3006 
3007 // Returns 0xFF for bytes with index >= N, otherwise 0.
3008 template <size_t N>
3009 constexpr __i8x16 BytesAbove() {
3010  return
3011  (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3012  : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3013  : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3014  : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3015  : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3016  : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3017  : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3018  : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3019  : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3020  : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3021  -1, -1, -1, -1, -1)
3022  : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3023  -1, -1, -1, -1)
3024  : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3025  -1, -1, -1, -1)
3026  : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3027  -1, -1, -1)
3028  : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3029  -1, -1, -1)
3030  : (N == 11)
3031  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3032  : (N == 13)
3033  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3034  : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3035 }
3036 
3037 template <typename T, size_t N>
3038 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
3039  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
3040 }
3041 
3042 template <typename T>
3043 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
3044  return PopCount(BitsFromMask(tag, m));
3045 }
3046 
3047 template <typename T>
3048 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
3049  return PopCount(BitsFromMask(tag, m));
3050 }
3051 
3052 template <typename T>
3053 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
3054  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3055  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
3056  alignas(16) uint64_t lanes[2];
3057  wasm_v128_store(lanes, shifted_bits);
3058  return PopCount(lanes[0] | lanes[1]);
3059 }
3060 
3061 template <typename T>
3062 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
3063  alignas(16) int64_t lanes[2];
3064  wasm_v128_store(lanes, m.raw);
3065  return static_cast<size_t>(-(lanes[0] + lanes[1]));
3066 }
3067 
3068 } // namespace detail
3069 
3070 // `p` points to at least 8 writable bytes.
3071 template <typename T, size_t N>
3072 HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
3073  const Mask128<T, N> mask, uint8_t* bits) {
3074  const uint64_t mask_bits = detail::BitsFromMask(mask);
3075  const size_t kNumBytes = (N + 7) / 8;
3076  CopyBytes<kNumBytes>(&mask_bits, bits);
3077  return kNumBytes;
3078 }
3079 
3080 template <typename T, size_t N>
3081 HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
3082  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
3083 }
3084 
3085 // Partial vector
3086 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3087 HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
3088  // Ensure all undefined bytes are 0.
3089  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3090  return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
3091 }
3092 
3093 // Full vector
3094 template <typename T>
3095 HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
3096 #if 0
3097  // Casting followed by wasm_i8x16_any_true results in wasm error:
3098  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
3099  const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
3100  return !wasm_i8x16_any_true(v8.raw);
3101 #else
3102  (void)d;
3103  return (wasm_i64x2_extract_lane(m.raw, 0) |
3104  wasm_i64x2_extract_lane(m.raw, 1)) == 0;
3105 #endif
3106 }
3107 
3108 // Full vector
3109 namespace detail {
3110 template <typename T>
3111 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
3112  return wasm_i8x16_all_true(m.raw);
3113 }
3114 template <typename T>
3115 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
3116  return wasm_i16x8_all_true(m.raw);
3117 }
3118 template <typename T>
3119 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
3120  return wasm_i32x4_all_true(m.raw);
3121 }
3122 template <typename T>
3123 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
3124  return wasm_i64x2_all_true(m.raw);
3125 }
3126 
3127 } // namespace detail
3128 
3129 template <typename T, size_t N>
3130 HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
3131  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
3132 }
3133 
3134 // Partial vectors
3135 
3136 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3137 HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
3138  // Ensure all undefined bytes are 0.
3139  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3140  return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
3141 }
3142 
3143 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3144 HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
3145  // Ensure all undefined bytes are FF.
3146  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3147  return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
3148 }
3149 
3150 template <typename T, size_t N>
3151 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
3152  const Mask128<T, N> mask) {
3153  const uint64_t bits = detail::BitsFromMask(mask);
3154  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
3155 }
3156 
3157 // ------------------------------ Compress
3158 
3159 namespace detail {
3160 
3161 template <typename T, size_t N>
3162 HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
3163  HWY_DASSERT(mask_bits < 256);
3164  const Simd<T, N, 0> d;
3165  const Rebind<uint8_t, decltype(d)> d8;
3166  const Simd<uint16_t, N, 0> du;
3167 
3168  // We need byte indices for TableLookupBytes (one vector's worth for each of
3169  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
3170  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
3171  // with the doubling baked into the table. Unpacking nibbles is likely more
3172  // costly than the higher cache footprint from storing bytes.
3173  alignas(16) constexpr uint8_t table[256 * 8] = {
3174  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3175  2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3176  4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
3177  2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3178  6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
3179  2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
3180  4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
3181  2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3182  8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
3183  2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
3184  4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
3185  2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
3186  6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
3187  2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
3188  4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
3189  2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3190  10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
3191  2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
3192  4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
3193  2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
3194  6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
3195  2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
3196  4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
3197  2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
3198  8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
3199  2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
3200  4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
3201  2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
3202  6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
3203  2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
3204  4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
3205  2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3206  12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
3207  2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
3208  4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
3209  2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
3210  6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
3211  2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
3212  4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
3213  2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
3214  8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
3215  2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
3216  4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
3217  2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
3218  6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
3219  2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
3220  4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
3221  2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
3222  10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
3223  2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
3224  4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
3225  2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
3226  6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
3227  2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
3228  4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
3229  2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
3230  8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
3231  2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
3232  4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
3233  2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
3234  6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
3235  2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
3236  4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
3237  2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
3238  14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
3239  2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
3240  4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
3241  2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
3242  6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
3243  2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
3244  4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
3245  2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
3246  8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
3247  2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
3248  4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
3249  2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
3250  6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
3251  2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
3252  4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
3253  2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
3254  10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
3255  2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
3256  4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
3257  2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
3258  6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
3259  2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
3260  4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
3261  2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
3262  8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
3263  2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
3264  4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
3265  2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
3266  6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
3267  2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
3268  4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
3269  2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
3270  12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
3271  2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
3272  4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
3273  2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
3274  6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
3275  2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
3276  4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
3277  2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
3278  8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
3279  2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
3280  4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
3281  2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
3282  6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
3283  2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
3284  4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
3285  2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
3286  10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
3287  2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
3288  4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
3289  2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
3290  6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
3291  2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
3292  4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
3293  2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
3294  8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
3295  2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
3296  4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
3297  2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
3298  6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
3299  2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
3300  4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
3301  2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3302 
3303  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
3304  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
3305  return BitCast(d, pairs + Set(du, 0x0100));
3306 }
3307 
3308 template <typename T, size_t N>
3309 HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
3310  HWY_DASSERT(mask_bits < 16);
3311 
3312  // There are only 4 lanes, so we can afford to load the index vector directly.
3313  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
3314  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3315  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3316  4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
3317  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3318  8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
3319  0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
3320  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
3321  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
3322  12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
3323  0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
3324  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
3325  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
3326  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
3327  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
3328  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
3329  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3330  const Simd<T, N, 0> d;
3331  const Repartition<uint8_t, decltype(d)> d8;
3332  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
3333 }
3334 
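// Worked example (illustrative note, not part of the original header): for
// 32-bit lanes and mask_bits = 0b0101 (lanes 0 and 2 active), row 5 of
// packed_array above is 0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15 - byte
// indices that move lanes 0 and 2 to the front while the remaining lanes
// keep their original order.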
3335 template <typename T, size_t N>
3336 HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
3337  HWY_DASSERT(mask_bits < 4);
3338 
3339  // There are only 2 lanes, so we can afford to load the index vector directly.
3340  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
3341  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3342  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3343  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3344  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3345 
3346  const Simd<T, N, 0> d;
3347  const Repartition<uint8_t, decltype(d)> d8;
3348  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
3349 }
3350 
3351 // Helper functions called by both Compress and CompressStore - avoids a
3352 // redundant BitsFromMask in the latter.
3353 
3354 template <typename T, size_t N>
3355 HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
3356  const uint64_t mask_bits) {
3357  const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
3358  const DFromV<decltype(v)> d;
3359  const RebindToSigned<decltype(d)> di;
3360  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
3361 }
3362 
3363 template <typename T, size_t N>
3364 HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
3365  const uint64_t mask_bits) {
3366  const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
3367  const DFromV<decltype(v)> d;
3368  const RebindToSigned<decltype(d)> di;
3369  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
3370 }
3371 
3372 template <typename T, size_t N>
3373 HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
3374  const uint64_t mask_bits) {
3375  const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
3376  const DFromV<decltype(v)> d;
3377  const RebindToSigned<decltype(d)> di;
3378  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
3379 }
3380 
3381 } // namespace detail
3382 
3383 template <typename T>
3384 struct CompressIsPartition {
3385  enum { value = 1 };
3386 };
3387 
3388 template <typename T, size_t N>
3389 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
3390  const uint64_t mask_bits = detail::BitsFromMask(mask);
3391  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3392 }
3393 
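// Illustrative usage sketch (hypothetical helper, not part of this header;
// assumes the surrounding hwy::HWY_NAMESPACE as when included via highway.h).
// Lanes whose mask bit is set move to the front; because CompressIsPartition
// is 1, the rejected lanes follow in their original order instead of being
// zeroed.
HWY_INLINE Vec128<int32_t> ExampleCompressPositive(Vec128<int32_t> v) {
  const Full128<int32_t> d;
  const auto keep = Zero(d) < v;  // mask of strictly positive lanes
  return Compress(v, keep);
}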
3394 // ------------------------------ CompressBits
3395 
3396 template <typename T, size_t N>
3397 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
3398  const uint8_t* HWY_RESTRICT bits) {
3399  uint64_t mask_bits = 0;
3400  constexpr size_t kNumBytes = (N + 7) / 8;
3401  CopyBytes<kNumBytes>(bits, &mask_bits);
3402  if (N < 8) {
3403  mask_bits &= (1ull << N) - 1;
3404  }
3405 
3406  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3407 }
3408 
3409 // ------------------------------ CompressStore
3410 template <typename T, size_t N>
3411 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
3412  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
3413  const uint64_t mask_bits = detail::BitsFromMask(mask);
3414  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3415  StoreU(c, d, unaligned);
3416  return PopCount(mask_bits);
3417 }
3418 
3419 // ------------------------------ CompressBlendedStore
3420 template <typename T, size_t N>
3421 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
3422  Simd<T, N, 0> d,
3423  T* HWY_RESTRICT unaligned) {
3424  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
3425  using TU = TFromD<decltype(du)>;
3426  const uint64_t mask_bits = detail::BitsFromMask(m);
3427  const size_t count = PopCount(mask_bits);
3428  const Vec128<TU, N> compressed =
3429  detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
3430  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
3431  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
3432  return count;
3433 }
3434 
3435 // ------------------------------ CompressBitsStore
3436 
3437 template <typename T, size_t N>
3438 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
3439  const uint8_t* HWY_RESTRICT bits,
3440  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
3441  uint64_t mask_bits = 0;
3442  constexpr size_t kNumBytes = (N + 7) / 8;
3443  CopyBytes<kNumBytes>(bits, &mask_bits);
3444  if (N < 8) {
3445  mask_bits &= (1ull << N) - 1;
3446  }
3447 
3448  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3449  StoreU(c, d, unaligned);
3450  return PopCount(mask_bits);
3451 }
3452 
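// Illustrative usage sketch (hypothetical helper, not part of this header):
// stream compaction of an array whose length is assumed to be a multiple of
// Lanes(d). CompressBlendedStore writes only the selected lanes, so `out`
// needs no slack space; CompressStore instead stores a full vector and would
// require padding after the destination.
HWY_INLINE size_t ExampleCopyPositive(const int32_t* HWY_RESTRICT in,
                                      size_t count,
                                      int32_t* HWY_RESTRICT out) {
  const Full128<int32_t> d;
  size_t written = 0;
  for (size_t i = 0; i < count; i += Lanes(d)) {
    const Vec128<int32_t> v = LoadU(d, in + i);
    written += CompressBlendedStore(v, Zero(d) < v, d, out + written);
  }
  return written;
}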
3453 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
3454 // TableLookupBytes)
3455 
3456 // 128 bits
3457 HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
3458  const Vec128<uint8_t> c, Full128<uint8_t> d,
3459  uint8_t* HWY_RESTRICT unaligned) {
3460  const auto k5 = Set(d, 5);
3461  const auto k6 = Set(d, 6);
3462 
3463  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
3464  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3465  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3466  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3467  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3468  alignas(16) static constexpr uint8_t tbl_g0[16] = {
3469  0x80, 0, 0x80, 0x80, 1, 0x80, //
3470  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3471  const auto shuf_r0 = Load(d, tbl_r0);
3472  const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
3473  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
3474  const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
3475  const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
3476  const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
3477  const auto int0 = r0 | g0 | b0;
3478  StoreU(int0, d, unaligned + 0 * 16);
3479 
3480  // Second vector: g10,r10, bgr[9:6], b5,g5
3481  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
3482  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
3483  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
3484  const auto r1 = TableLookupBytes(a, shuf_r1);
3485  const auto g1 = TableLookupBytes(b, shuf_g1);
3486  const auto b1 = TableLookupBytes(c, shuf_b1);
3487  const auto int1 = r1 | g1 | b1;
3488  StoreU(int1, d, unaligned + 1 * 16);
3489 
3490  // Third vector: bgr[15:11], b10
3491  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
3492  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
3493  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
3494  const auto r2 = TableLookupBytes(a, shuf_r2);
3495  const auto g2 = TableLookupBytes(b, shuf_g2);
3496  const auto b2 = TableLookupBytes(c, shuf_b2);
3497  const auto int2 = r2 | g2 | b2;
3498  StoreU(int2, d, unaligned + 2 * 16);
3499 }
3500 
3501 // 64 bits
3502 HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
3503  const Vec128<uint8_t, 8> b,
3504  const Vec128<uint8_t, 8> c, Full64<uint8_t> d,
3505  uint8_t* HWY_RESTRICT unaligned) {
3506  // Use full vectors for the shuffles and first result.
3507  const Full128<uint8_t> d_full;
3508  const auto k5 = Set(d_full, 5);
3509  const auto k6 = Set(d_full, 6);
3510 
3511  const Vec128<uint8_t> full_a{a.raw};
3512  const Vec128<uint8_t> full_b{b.raw};
3513  const Vec128<uint8_t> full_c{c.raw};
3514 
3515  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
3516  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3517  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3518  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3519  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3520  alignas(16) static constexpr uint8_t tbl_g0[16] = {
3521  0x80, 0, 0x80, 0x80, 1, 0x80, //
3522  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3523  const auto shuf_r0 = Load(d_full, tbl_r0);
3524  const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
3525  const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
3526  const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
3527  const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
3528  const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
3529  const auto int0 = r0 | g0 | b0;
3530  StoreU(int0, d_full, unaligned + 0 * 16);
3531 
3532  // Second (HALF) vector: bgr[7:6], b5,g5
3533  const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
3534  const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
3535  const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
3536  const auto r1 = TableLookupBytes(full_a, shuf_r1);
3537  const auto g1 = TableLookupBytes(full_b, shuf_g1);
3538  const auto b1 = TableLookupBytes(full_c, shuf_b1);
3539  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
3540  StoreU(int1, d, unaligned + 1 * 16);
3541 }
3542 
3543 // <= 32 bits
3544 template <size_t N, HWY_IF_LE32(uint8_t, N)>
3545 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
3546  const Vec128<uint8_t, N> b,
3547  const Vec128<uint8_t, N> c,
3548  Simd<uint8_t, N, 0> /*tag*/,
3549  uint8_t* HWY_RESTRICT unaligned) {
3550  // Use full vectors for the shuffles and result.
3551  const Full128<uint8_t> d_full;
3552 
3553  const Vec128<uint8_t> full_a{a.raw};
3554  const Vec128<uint8_t> full_b{b.raw};
3555  const Vec128<uint8_t> full_c{c.raw};
3556 
3557  // Shuffle (a,b,c) vector bytes to bgr[3:0].
3558  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3559  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3560  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
3561  0x80, 0x80, 0x80, 0x80};
3562  const auto shuf_r0 = Load(d_full, tbl_r0);
3563  const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
3564  const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
3565  const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
3566  const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
3567  const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
3568  const auto int0 = r0 | g0 | b0;
3569  alignas(16) uint8_t buf[16];
3570  StoreU(int0, d_full, buf);
3571  CopyBytes<N * 3>(buf, unaligned);
3572 }
3573 
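// Illustrative usage sketch (hypothetical helper, not part of this header):
// interleaving one register of planar R, G, B bytes into RGBRGB...; the
// full-vector overload above writes 3 * 16 = 48 bytes.
HWY_INLINE void ExampleInterleaveRgb(const uint8_t* HWY_RESTRICT r,
                                     const uint8_t* HWY_RESTRICT g,
                                     const uint8_t* HWY_RESTRICT b,
                                     uint8_t* HWY_RESTRICT rgb) {
  const Full128<uint8_t> d;
  StoreInterleaved3(LoadU(d, r), LoadU(d, g), LoadU(d, b), d, rgb);
}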
3574 // ------------------------------ StoreInterleaved4
3575 
3576 // 128 bits
3577 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
3578  const Vec128<uint8_t> v1,
3579  const Vec128<uint8_t> v2,
3580  const Vec128<uint8_t> v3, Full128<uint8_t> d8,
3581  uint8_t* HWY_RESTRICT unaligned) {
3582  const RepartitionToWide<decltype(d8)> d16;
3583  const RepartitionToWide<decltype(d16)> d32;
3584  // let a,b,c,d denote v0..3.
3585  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
3586  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
3587  const auto ba8 = ZipUpper(d16, v0, v1);
3588  const auto dc8 = ZipUpper(d16, v2, v3);
3589  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
3590  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
3591  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
3592  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
3593  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
3594  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
3595  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
3596  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
3597 }
3598 
3599 // 64 bits
3600 HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
3601  const Vec128<uint8_t, 8> in1,
3602  const Vec128<uint8_t, 8> in2,
3603  const Vec128<uint8_t, 8> in3,
3604  Full64<uint8_t> /* tag */,
3605  uint8_t* HWY_RESTRICT unaligned) {
3606  // Use full vectors to reduce the number of stores.
3607  const Full128<uint8_t> d_full8;
3608  const RepartitionToWide<decltype(d_full8)> d16;
3609  const RepartitionToWide<decltype(d16)> d32;
3610  const Vec128<uint8_t> v0{in0.raw};
3611  const Vec128<uint8_t> v1{in1.raw};
3612  const Vec128<uint8_t> v2{in2.raw};
3613  const Vec128<uint8_t> v3{in3.raw};
3614  // let a,b,c,d denote v0..3.
3615  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
3616  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
3617  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
3618  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
3619  StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
3620  StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
3621 }
3622 
3623 // <= 32 bits
3624 template <size_t N, HWY_IF_LE32(uint8_t, N)>
3625 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
3626  const Vec128<uint8_t, N> in1,
3627  const Vec128<uint8_t, N> in2,
3628  const Vec128<uint8_t, N> in3,
3629  Simd<uint8_t, N, 0> /*tag*/,
3630  uint8_t* HWY_RESTRICT unaligned) {
3631  // Use full vectors to reduce the number of stores.
3632  const Full128<uint8_t> d_full8;
3633  const RepartitionToWide<decltype(d_full8)> d16;
3634  const RepartitionToWide<decltype(d16)> d32;
3635  const Vec128<uint8_t> v0{in0.raw};
3636  const Vec128<uint8_t> v1{in1.raw};
3637  const Vec128<uint8_t> v2{in2.raw};
3638  const Vec128<uint8_t> v3{in3.raw};
3639  // let a,b,c,d denote v0..3.
3640  const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0
3641  const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0
3642  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
3643  alignas(16) uint8_t buf[16];
3644  StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
3645  CopyBytes<4 * N>(buf, unaligned);
3646 }
3647 
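// Illustrative usage sketch (hypothetical helper, not part of this header):
// packing planar R, G, B, A bytes into RGBA quads; the full-vector overload
// above writes 4 * 16 = 64 bytes.
HWY_INLINE void ExampleInterleaveRgba(const uint8_t* HWY_RESTRICT r,
                                      const uint8_t* HWY_RESTRICT g,
                                      const uint8_t* HWY_RESTRICT b,
                                      const uint8_t* HWY_RESTRICT a,
                                      uint8_t* HWY_RESTRICT rgba) {
  const Full128<uint8_t> d;
  StoreInterleaved4(LoadU(d, r), LoadU(d, g), LoadU(d, b), LoadU(d, a), d,
                    rgba);
}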
3648 // ------------------------------ MulEven/Odd (Load)
3649 
3650 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
3651  const Vec128<uint64_t> b) {
3652  alignas(16) uint64_t mul[2];
3653  mul[0] =
3654  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
3655  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
3656  return Load(Full128<uint64_t>(), mul);
3657 }
3658 
3659 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
3660  const Vec128<uint64_t> b) {
3661  alignas(16) uint64_t mul[2];
3662  mul[0] =
3663  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
3664  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
3665  return Load(Full128<uint64_t>(), mul);
3666 }
3667 
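// Illustrative usage sketch (hypothetical helper, not part of this header):
// a full 64x64 -> 128-bit product. MulEven multiplies lane 0 of each input
// and returns {low, high}; MulOdd does the same for lane 1.
HWY_INLINE void ExampleMul64To128(uint64_t x, uint64_t y,
                                  uint64_t* HWY_RESTRICT lo_hi) {
  const Full128<uint64_t> d;
  // lo_hi[0] receives the low 64 bits of x * y, lo_hi[1] the high 64 bits.
  StoreU(MulEven(Set(d, x), Set(d, y)), d, lo_hi);
}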
3668 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3669 
3670 template <size_t N>
3671 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
3672  Vec128<bfloat16_t, 2 * N> a,
3673  Vec128<bfloat16_t, 2 * N> b,
3674  const Vec128<float, N> sum0,
3675  Vec128<float, N>& sum1) {
3676  const Repartition<uint16_t, decltype(df32)> du16;
3677  const RebindToUnsigned<decltype(df32)> du32;
3678  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
3679  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
3680  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3681  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
3682  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3683  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3684  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3685 }
3686 
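// Illustrative usage sketch (hypothetical helper, not part of this header;
// written as user code would after including highway.h): accumulating a bf16
// dot product in f32. The per-lane placement of the two partial sums is
// unspecified, but their total equals the dot product of a and b.
HWY_INLINE float ExampleBf16Dot(Vec128<bfloat16_t> a, Vec128<bfloat16_t> b) {
  const Full128<float> df32;
  Vec128<float> sum1 = Zero(df32);
  const Vec128<float> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  return GetLane(SumOfLanes(df32, sum0 + sum1));
}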
3687 // ------------------------------ Reductions
3688 
3689 namespace detail {
3690 
3691 // N=1 for any T: no-op
3692 template <typename T>
3693 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
3694  const Vec128<T, 1> v) {
3695  return v;
3696 }
3697 template <typename T>
3698 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
3699  const Vec128<T, 1> v) {
3700  return v;
3701 }
3702 template <typename T>
3703 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
3704  const Vec128<T, 1> v) {
3705  return v;
3706 }
3707 
3708 // u32/i32/f32:
3709 
3710 // N=2
3711 template <typename T>
3712 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
3713  const Vec128<T, 2> v10) {
3714  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
3715 }
3716 template <typename T>
3717 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
3718  const Vec128<T, 2> v10) {
3719  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
3720 }
3721 template <typename T>
3722 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
3723  const Vec128<T, 2> v10) {
3724  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
3725 }
3726 
3727 // N=4 (full)
3728 template <typename T>
3729 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
3730  const Vec128<T> v3210) {
3731  const Vec128<T> v1032 = Shuffle1032(v3210);
3732  const Vec128<T> v31_20_31_20 = v3210 + v1032;
3733  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
3734  return v20_31_20_31 + v31_20_31_20;
3735 }
3736 template <typename T>
3737 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
3738  const Vec128<T> v3210) {
3739  const Vec128<T> v1032 = Shuffle1032(v3210);
3740  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
3741  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
3742  return Min(v20_31_20_31, v31_20_31_20);
3743 }
3744 template <typename T>
3745 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
3746  const Vec128<T> v3210) {
3747  const Vec128<T> v1032 = Shuffle1032(v3210);
3748  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
3749  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
3750  return Max(v20_31_20_31, v31_20_31_20);
3751 }
3752 
3753 // u64/i64/f64:
3754 
3755 // N=2 (full)
3756 template <typename T>
3757 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
3758  const Vec128<T> v10) {
3759  const Vec128<T> v01 = Shuffle01(v10);
3760  return v10 + v01;
3761 }
3762 template <typename T>
3763 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
3764  const Vec128<T> v10) {
3765  const Vec128<T> v01 = Shuffle01(v10);
3766  return Min(v10, v01);
3767 }
3768 template <typename T>
3769 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
3770  const Vec128<T> v10) {
3771  const Vec128<T> v01 = Shuffle01(v10);
3772  return Max(v10, v01);
3773 }
3774 
3775 // u16/i16
3776 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
3777 HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
3778  const DFromV<decltype(v)> d;
3779  const Repartition<int32_t, decltype(d)> d32;
3780  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
3781  const auto odd = ShiftRight<16>(BitCast(d32, v));
3782  const auto min = MinOfLanes(d32, Min(even, odd));
3783  // Also broadcast into odd lanes.
3784  return BitCast(d, Or(min, ShiftLeft<16>(min)));
3785 }
3786 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
3787 HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
3788  const DFromV<decltype(v)> d;
3789  const Repartition<int32_t, decltype(d)> d32;
3790  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
3791  const auto odd = ShiftRight<16>(BitCast(d32, v));
3792  const auto max = MaxOfLanes(d32, Max(even, odd));
3793  // Also broadcast into odd lanes.
3794  return BitCast(d, Or(max, ShiftLeft<16>(max)));
3795 }
3796 
3797 } // namespace detail
3798 
3799 // Supported for u/i/f 32/64 (and u16/i16 for Min/MaxOfLanes). Returns the
3799 // same value in each lane.
3800 template <typename T, size_t N>
3801 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3802  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
3803 }
3804 template <typename T, size_t N>
3805 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3806  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
3807 }
3808 template <typename T, size_t N>
3809 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3810  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
3811 }
3812 
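// Illustrative usage sketch (hypothetical helper, not part of this header):
// the reductions broadcast their result to every lane, so GetLane extracts
// the scalar. Here: the sum of four floats loaded from memory.
HWY_INLINE float ExampleSum4(const float* HWY_RESTRICT p) {
  const Full128<float> d;
  return GetLane(SumOfLanes(d, LoadU(d, p)));
}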
3813 // ------------------------------ Lt128
3814 
3815 namespace detail {
3816 
3817 template <size_t kLanes, typename T, size_t N>
3818 Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
3819  return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
3820 }
3821 
3822 } // namespace detail
3823 
3824 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3825 HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
3826  Vec128<T, N> b) {
3827  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
3828  // Truth table of Eq and Lt for Hi and Lo u64.
3829  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
3830  // =H =L cH cL | out = cH | (=H & cL)
3831  // 0 0 0 0 | 0
3832  // 0 0 0 1 | 0
3833  // 0 0 1 0 | 1
3834  // 0 0 1 1 | 1
3835  // 0 1 0 0 | 0
3836  // 0 1 0 1 | 0
3837  // 0 1 1 0 | 1
3838  // 1 0 0 0 | 0
3839  // 1 0 0 1 | 1
3840  // 1 1 0 0 | 0
3841  const Mask128<T, N> eqHL = Eq(a, b);
3842  const Mask128<T, N> ltHL = Lt(a, b);
3843  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
3844  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
3845  // comparison result leftwards requires only 4.
3846  const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
3847  const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
3848  const Vec128<T, N> vecHx = VecFromMask(d, outHx);
3849  return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
3850 }
3851 
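// Illustrative usage sketch (hypothetical helper, not part of this header):
// comparing two unsigned 128-bit integers stored as {low, high} u64 lanes.
// Lt128 sets both lanes of the result mask, so AllTrue reports the outcome.
HWY_INLINE bool ExampleU128Less(const uint64_t* HWY_RESTRICT a_lo_hi,
                                const uint64_t* HWY_RESTRICT b_lo_hi) {
  const Full128<uint64_t> d;
  return AllTrue(d, Lt128(d, LoadU(d, a_lo_hi), LoadU(d, b_lo_hi)));
}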
3852 // ------------------------------ Min128, Max128 (Lt128)
3853 
3854 // Without a native OddEven, it seems infeasible to go faster than Lt128.
3855 template <class D>
3856 HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
3857  return IfThenElse(Lt128(d, a, b), a, b);
3858 }
3859 
3860 template <class D>
3861 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
3862  return IfThenElse(Lt128(d, a, b), b, a);
3863 }
3864 
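// Illustrative usage sketch (hypothetical helper, not part of this header):
// Min128/Max128 select whole 128-bit values rather than acting per lane,
// e.g. to sort a pair of 128-bit keys.
HWY_INLINE void ExampleSortU128Pair(Vec128<uint64_t>& a, Vec128<uint64_t>& b) {
  const Full128<uint64_t> d;
  const Vec128<uint64_t> lo = Min128(d, a, b);
  b = Max128(d, a, b);
  a = lo;
}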
3865 // ================================================== Operator wrapper
3866 
3867 template <class V>
3868 HWY_API V Add(V a, V b) {
3869  return a + b;
3870 }
3871 template <class V>
3872 HWY_API V Sub(V a, V b) {
3873  return a - b;
3874 }
3875 
3876 template <class V>
3877 HWY_API V Mul(V a, V b) {
3878  return a * b;
3879 }
3880 template <class V>
3881 HWY_API V Div(V a, V b) {
3882  return a / b;
3883 }
3884 
3885 template <class V>
3886 V Shl(V a, V b) {
3887  return a << b;
3888 }
3889 template <class V>
3890 V Shr(V a, V b) {
3891  return a >> b;
3892 }
3893 
3894 template <class V>
3895 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
3896  return a == b;
3897 }
3898 template <class V>
3899 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
3900  return a != b;
3901 }
3902 template <class V>
3903 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
3904  return a < b;
3905 }
3906 
3907 template <class V>
3908 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
3909  return a > b;
3910 }
3911 template <class V>
3912 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
3913  return a >= b;
3914 }
3915 
3916 template <class V>
3917 HWY_API auto Le(V a, V b) -> decltype(a == b) {
3918  return a <= b;
3919 }
3920 
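// Illustrative usage sketch (hypothetical helper, not part of this header):
// the named wrappers let generic code spell out operations where infix
// operators are awkward, e.g. when composing them in templates.
template <class V>
HWY_INLINE V ExampleMulAddViaWrappers(V a, V x, V y) {
  return Add(Mul(a, x), y);  // same as a * x + y for float vectors
}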
3921 // NOLINTNEXTLINE(google-readability-namespace-comments)
3922 } // namespace HWY_NAMESPACE
3923 } // namespace hwy
3924 HWY_AFTER_NAMESPACE();