wasm_128-inl.h
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // 128-bit WASM vectors and operations.
16 // External include guard in highway.h - see comment there.
17 
18 #include <stddef.h>
19 #include <stdint.h>
20 #include <wasm_simd128.h>
21 
22 #include "hwy/base.h"
23 #include "hwy/ops/shared-inl.h"
24 
25 #ifdef HWY_WASM_OLD_NAMES
26 #define wasm_i8x16_shuffle wasm_v8x16_shuffle
27 #define wasm_i16x8_shuffle wasm_v16x8_shuffle
28 #define wasm_i32x4_shuffle wasm_v32x4_shuffle
29 #define wasm_i64x2_shuffle wasm_v64x2_shuffle
30 #define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
31 #define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
32 #define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
33 #define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
34 #define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
35 #define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
36 #define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
37 #define wasm_u8x16_add_sat wasm_u8x16_add_saturate
38 #define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
39 #define wasm_u16x8_add_sat wasm_u16x8_add_saturate
40 #define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
41 #define wasm_i8x16_add_sat wasm_i8x16_add_saturate
42 #define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
43 #define wasm_i16x8_add_sat wasm_i16x8_add_saturate
44 #define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
45 #endif
46 
47 HWY_BEFORE_NAMESPACE();
48 namespace hwy {
49 namespace HWY_NAMESPACE {
50 
51 template <typename T>
52 using Full128 = Simd<T, 16 / sizeof(T)>;
53 
54 namespace detail {
55 
56 template <typename T>
57 struct Raw128 {
58  using type = __v128_u;
59 };
60 template <>
61 struct Raw128<float> {
62  using type = __f32x4;
63 };
64 
65 } // namespace detail
66 
67 template <typename T, size_t N = 16 / sizeof(T)>
68 class Vec128 {
69  using Raw = typename detail::Raw128<T>::type;
70 
71  public:
72  // Compound assignment. Only usable if there is a corresponding non-member
73  // binary operator overload. For example, only f32 and f64 support division.
74  HWY_INLINE Vec128& operator*=(const Vec128 other) {
75  return *this = (*this * other);
76  }
77  HWY_INLINE Vec128& operator/=(const Vec128 other) {
78  return *this = (*this / other);
79  }
80  HWY_INLINE Vec128& operator+=(const Vec128 other) {
81  return *this = (*this + other);
82  }
83  HWY_INLINE Vec128& operator-=(const Vec128 other) {
84  return *this = (*this - other);
85  }
86  HWY_INLINE Vec128& operator&=(const Vec128 other) {
87  return *this = (*this & other);
88  }
89  HWY_INLINE Vec128& operator|=(const Vec128 other) {
90  return *this = (*this | other);
91  }
92  HWY_INLINE Vec128& operator^=(const Vec128 other) {
93  return *this = (*this ^ other);
94  }
95 
96  Raw raw;
97 };
98 
99 // FF..FF or 0.
100 template <typename T, size_t N = 16 / sizeof(T)>
101 struct Mask128 {
102  typename detail::Raw128<T>::type raw;
103 };
104 
105 namespace detail {
106 
107 // Deduce Simd<T, N> from Vec128<T, N>
108 struct DeduceD {
109  template <typename T, size_t N>
110  Simd<T, N> operator()(Vec128<T, N>) const {
111  return Simd<T, N>();
112  }
113 };
114 
115 } // namespace detail
116 
117 template <class V>
118 using DFromV = decltype(detail::DeduceD()(V()));
119 
120 template <class V>
121 using TFromV = TFromD<DFromV<V>>;
122 
123 // ------------------------------ BitCast
124 
125 namespace detail {
126 
127 HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
128 HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
129  return static_cast<__v128_u>(v);
130 }
131 HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
132  return static_cast<__v128_u>(v);
133 }
134 
135 template <typename T, size_t N>
136 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
137  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
138 }
139 
140 // Cannot rely on function overloading because return types differ.
141 template <typename T>
142 struct BitCastFromInteger128 {
143  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
144 };
145 template <>
146 struct BitCastFromInteger128<float> {
147  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
148 };
149 
150 template <typename T, size_t N>
151 HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N> /* tag */,
152  Vec128<uint8_t, N * sizeof(T)> v) {
153  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
154 }
155 
156 } // namespace detail
157 
158 template <typename T, size_t N, typename FromT>
159 HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
160  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
161  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
162 }
163 
164 // ------------------------------ Zero
165 
166 // Returns an all-zero vector/part.
167 template <typename T, size_t N, HWY_IF_LE128(T, N)>
168 HWY_API Vec128<T, N> Zero(Simd<T, N> /* tag */) {
169  return Vec128<T, N>{wasm_i32x4_splat(0)};
170 }
171 template <size_t N, HWY_IF_LE128(float, N)>
172 HWY_API Vec128<float, N> Zero(Simd<float, N> /* tag */) {
173  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
174 }
175 
176 template <class D>
177 using VFromD = decltype(Zero(D()));
178 
179 // ------------------------------ Set
180 
181 // Returns a vector/part with all lanes set to "t".
182 template <size_t N, HWY_IF_LE128(uint8_t, N)>
183 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
184  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
185 }
186 template <size_t N, HWY_IF_LE128(uint16_t, N)>
187 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
188  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
189 }
190 template <size_t N, HWY_IF_LE128(uint32_t, N)>
191 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
192  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
193 }
194 template <size_t N, HWY_IF_LE128(uint64_t, N)>
195 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
196  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
197 }
198 
199 template <size_t N, HWY_IF_LE128(int8_t, N)>
200 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
201  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
202 }
203 template <size_t N, HWY_IF_LE128(int16_t, N)>
204 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
205  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
206 }
207 template <size_t N, HWY_IF_LE128(int32_t, N)>
208 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
209  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
210 }
211 template <size_t N, HWY_IF_LE128(int64_t, N)>
212 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
213  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
214 }
215 
216 template <size_t N, HWY_IF_LE128(float, N)>
217 HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
218  return Vec128<float, N>{wasm_f32x4_splat(t)};
219 }
220 
221 HWY_DIAGNOSTICS(push)
222 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
223 
224 // Returns a vector with uninitialized elements.
225 template <typename T, size_t N, HWY_IF_LE128(T, N)>
226 HWY_API Vec128<T, N> Undefined(Simd<T, N> d) {
227  return Zero(d);
228 }
229 
230 HWY_DIAGNOSTICS(pop)
231 
232 // Returns a vector with lane i=[0, N) set to "first" + i.
233 template <typename T, size_t N, typename T2>
234 Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
235  HWY_ALIGN T lanes[16 / sizeof(T)];
236  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
237  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
238  }
239  return Load(d, lanes);
240 }
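// Example (illustrative, using only Full128 and the Load above): for a full
// vector of int32_t, Iota(Full128<int32_t>(), 10) yields lanes
// {10, 11, 12, 13}, with lane 0 the least-significant.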
241 
242 // ================================================== ARITHMETIC
243 
244 // ------------------------------ Addition
245 
246 // Unsigned
247 template <size_t N>
248 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
249  const Vec128<uint8_t, N> b) {
250  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
251 }
252 template <size_t N>
253 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
254  const Vec128<uint16_t, N> b) {
255  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
256 }
257 template <size_t N>
258 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
259  const Vec128<uint32_t, N> b) {
260  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
261 }
262 
263 // Signed
264 template <size_t N>
265 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
266  const Vec128<int8_t, N> b) {
267  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
268 }
269 template <size_t N>
270 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
271  const Vec128<int16_t, N> b) {
272  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
273 }
274 template <size_t N>
275 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
276  const Vec128<int32_t, N> b) {
277  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
278 }
279 
280 // Float
281 template <size_t N>
282 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
283  const Vec128<float, N> b) {
284  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
285 }
286 
287 // ------------------------------ Subtraction
288 
289 // Unsigned
290 template <size_t N>
291 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
292  const Vec128<uint8_t, N> b) {
293  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
294 }
295 template <size_t N>
296 HWY_API Vec128<uint16_t, N> operator-(const Vec128<uint16_t, N> a,
297  const Vec128<uint16_t, N> b) {
298  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
299 }
300 template <size_t N>
301 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
302  const Vec128<uint32_t, N> b) {
303  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
304 }
305 
306 // Signed
307 template <size_t N>
308 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
309  const Vec128<int8_t, N> b) {
310  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
311 }
312 template <size_t N>
313 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
314  const Vec128<int16_t, N> b) {
315  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
316 }
317 template <size_t N>
318 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
319  const Vec128<int32_t, N> b) {
320  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
321 }
322 
323 // Float
324 template <size_t N>
325 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
326  const Vec128<float, N> b) {
327  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
328 }
329 
330 // ------------------------------ Saturating addition
331 
332 // Returns a + b clamped to the destination range.
333 
334 // Unsigned
335 template <size_t N>
336 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
337  const Vec128<uint8_t, N> b) {
338  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
339 }
340 template <size_t N>
341 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
342  const Vec128<uint16_t, N> b) {
343  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
344 }
345 
346 // Signed
347 template <size_t N>
348 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
349  const Vec128<int8_t, N> b) {
350  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
351 }
352 template <size_t N>
353 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
354  const Vec128<int16_t, N> b) {
355  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
356 }
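// Example: with uint8_t lanes, SaturatedAdd(Set(d, 250), Set(d, 10)) yields
// 255 in every lane (operator+ would wrap to 4); with int8_t lanes,
// SaturatedAdd(Set(d, 120), Set(d, 20)) yields 127.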
357 
358 // ------------------------------ Saturating subtraction
359 
360 // Returns a - b clamped to the destination range.
361 
362 // Unsigned
363 template <size_t N>
364 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
365  const Vec128<uint8_t, N> b) {
366  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
367 }
368 template <size_t N>
369 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
370  const Vec128<uint16_t, N> b) {
371  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
372 }
373 
374 // Signed
375 template <size_t N>
376 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
377  const Vec128<int8_t, N> b) {
378  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
379 }
380 template <size_t N>
381 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
382  const Vec128<int16_t, N> b) {
383  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
384 }
385 
386 // ------------------------------ Average
387 
388 // Returns (a + b + 1) / 2
389 
390 // Unsigned
391 template <size_t N>
392 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
393  const Vec128<uint8_t, N> b) {
394  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
395 }
396 template <size_t N>
397 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
398  const Vec128<uint16_t, N> b) {
399  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
400 }
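// Example: AverageRound(Set(d, 5), Set(d, 6)) yields (5 + 6 + 1) / 2 = 6 in
// every uint8_t lane; the +1 rounds the 5.5 midpoint upward.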
401 
402 // ------------------------------ Absolute value
403 
404 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
405 template <size_t N>
406 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
407  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
408 }
409 template <size_t N>
410 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
411  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
412 }
413 template <size_t N>
414 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
415  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
416 }
417 template <size_t N>
418 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
419  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
420 }
421 
422 template <size_t N>
423 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
424  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
425 }
426 
427 // ------------------------------ Shift lanes by constant #bits
428 
429 // Unsigned
430 template <int kBits, size_t N>
431 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
432  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
433 }
434 template <int kBits, size_t N>
435 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
436  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
437 }
438 template <int kBits, size_t N>
439 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
440  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
441 }
442 template <int kBits, size_t N>
443 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
444  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
445 }
446 
447 // Signed
448 template <int kBits, size_t N>
449 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
450  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
451 }
452 template <int kBits, size_t N>
453 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
454  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
455 }
456 template <int kBits, size_t N>
457 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
458  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
459 }
460 template <int kBits, size_t N>
461 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
462  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
463 }
464 
465 // 8-bit
466 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
467 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
468  const Simd<T, N> d8;
469  // Use raw instead of BitCast to support N=1.
470  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
471  return kBits == 1
472  ? (v + v)
473  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
474 }
475 
476 template <int kBits, size_t N>
477 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
478  const Simd<uint8_t, N> d8;
479  // Use raw instead of BitCast to support N=1.
480  const Vec128<uint8_t, N> shifted{
481  ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
482  return shifted & Set(d8, 0xFF >> kBits);
483 }
484 
485 template <int kBits, size_t N>
486 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
487  const Simd<int8_t, N> di;
488  const Simd<uint8_t, N> du;
489  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
490  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
491  return (shifted ^ shifted_sign) - shifted_sign;
492 }
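// The XOR/subtract pair above sign-extends the unsigned shift result. For
// example, with v = -16 (0xF0) and kBits = 2: the unsigned shift yields
// 0x3C, shifted_sign is 0x20, and (0x3C ^ 0x20) - 0x20 = -4 (0xFC), exactly
// an arithmetic shift of -16 by two bits.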
493 
494 // ------------------------------ Shift lanes by same variable #bits
495 
496 // Unsigned
497 template <size_t N>
498 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
499  const int bits) {
500  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
501 }
502 template <size_t N>
503 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
504  const int bits) {
505  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
506 }
507 template <size_t N>
508 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
509  const int bits) {
510  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
511 }
512 template <size_t N>
513 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
514  const int bits) {
515  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
516 }
517 
518 // Signed
519 template <size_t N>
520 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
521  const int bits) {
522  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
523 }
524 template <size_t N>
525 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
526  const int bits) {
527  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
528 }
529 template <size_t N>
530 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
531  const int bits) {
532  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
533 }
534 template <size_t N>
535 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
536  const int bits) {
537  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
538 }
539 
540 // 8-bit
541 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
542 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
543  const Simd<T, N> d8;
544  // Use raw instead of BitCast to support N=1.
545  const Vec128<T, N> shifted{
546  ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
547  return shifted & Set(d8, (0xFF << bits) & 0xFF);
548 }
549 
550 template <size_t N>
551 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
552  const int bits) {
553  const Simd<uint8_t, N> d8;
554  // Use raw instead of BitCast to support N=1.
555  const Vec128<uint8_t, N> shifted{
556  ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
557  return shifted & Set(d8, 0xFF >> bits);
558 }
559 
560 template <size_t N>
561 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
562  const Simd<int8_t, N> di;
563  const Simd<uint8_t, N> du;
564  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
565  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
566  return (shifted ^ shifted_sign) - shifted_sign;
567 }
568 
569 // ------------------------------ Minimum
570 
571 // Unsigned
572 template <size_t N>
573 HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
574  const Vec128<uint8_t, N> b) {
575  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
576 }
577 template <size_t N>
578 HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
579  const Vec128<uint16_t, N> b) {
580  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
581 }
582 template <size_t N>
583 HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
584  const Vec128<uint32_t, N> b) {
585  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
586 }
587 template <size_t N>
588 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
589  const Vec128<uint64_t, N> b) {
590  alignas(16) uint64_t min[2];
591  min[0] =
592  HWY_MIN(wasm_u64x2_extract_lane(a.raw, 0), wasm_u64x2_extract_lane(b.raw, 0));
593  min[1] =
594  HWY_MIN(wasm_u64x2_extract_lane(a.raw, 1), wasm_u64x2_extract_lane(b.raw, 1));
595  return Vec128<uint64_t, N>{wasm_v128_load(min)};
596 }
597 
598 // Signed
599 template <size_t N>
600 HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
601  const Vec128<int8_t, N> b) {
602  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
603 }
604 template <size_t N>
605 HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
606  const Vec128<int16_t, N> b) {
607  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
608 }
609 template <size_t N>
610 HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
611  const Vec128<int32_t, N> b) {
612  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
613 }
614 template <size_t N>
615 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
616  const Vec128<int64_t, N> b) {
617  alignas(16) int64_t min[2];
618  min[0] =
619  HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0));
620  min[1] =
621  HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1));
622  return Vec128<int64_t, N>{wasm_v128_load(min)};
623 }
624 
625 // Float
626 template <size_t N>
627 HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
628  const Vec128<float, N> b) {
629  return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
630 }
631 
632 // ------------------------------ Maximum
633 
634 // Unsigned
635 template <size_t N>
636 HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
637  const Vec128<uint8_t, N> b) {
638  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
639 }
640 template <size_t N>
641 HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
642  const Vec128<uint16_t, N> b) {
643  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
644 }
645 template <size_t N>
646 HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
647  const Vec128<uint32_t, N> b) {
648  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
649 }
650 template <size_t N>
651 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
652  const Vec128<uint64_t, N> b) {
653  alignas(16) uint64_t max[2];
654  max[0] =
655  HWY_MAX(wasm_u64x2_extract_lane(a.raw, 0), wasm_u64x2_extract_lane(b.raw, 0));
656  max[1] =
657  HWY_MAX(wasm_u64x2_extract_lane(a.raw, 1), wasm_u64x2_extract_lane(b.raw, 1));
658  return Vec128<uint64_t, N>{wasm_v128_load(max)};
659 }
660 
661 // Signed
662 template <size_t N>
663 HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
664  const Vec128<int8_t, N> b) {
665  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
666 }
667 template <size_t N>
668 HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
669  const Vec128<int16_t, N> b) {
670  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
671 }
672 template <size_t N>
673 HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
674  const Vec128<int32_t, N> b) {
675  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
676 }
677 template <size_t N>
678 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
679  const Vec128<int64_t, N> b) {
680  alignas(16) int64_t max[2];
681  max[0] =
682  HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0));
683  max[1] =
684  HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1));
685  return Vec128<int64_t, N>{wasm_v128_load(max)};
686 }
687 
688 // Float
689 template <size_t N>
690 HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
691  const Vec128<float, N> b) {
692  return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
693 }
694 
695 // ------------------------------ Integer multiplication
696 
697 // Unsigned
698 template <size_t N>
699 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
700  const Vec128<uint16_t, N> b) {
701  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
702 }
703 template <size_t N>
704 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
705  const Vec128<uint32_t, N> b) {
706  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
707 }
708 
709 // Signed
710 template <size_t N>
711 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
712  const Vec128<int16_t, N> b) {
713  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
714 }
715 template <size_t N>
716 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
717  const Vec128<int32_t, N> b) {
718  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
719 }
720 
721 // Returns the upper 16 bits of a * b in each lane.
722 template <size_t N>
723 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
724  const Vec128<uint16_t, N> b) {
725  // TODO(eustas): replace, when implemented in WASM.
726  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
727  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
728  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
729  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
730  const auto l = wasm_i32x4_mul(al, bl);
731  const auto h = wasm_i32x4_mul(ah, bh);
732  // TODO(eustas): shift-right + narrow?
733  return Vec128<uint16_t, N>{
734  wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
735 }
736 template <size_t N>
737 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
738  const Vec128<int16_t, N> b) {
739  // TODO(eustas): replace, when implemented in WASM.
740  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
741  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
742  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
743  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
744  const auto l = wasm_i32x4_mul(al, bl);
745  const auto h = wasm_i32x4_mul(ah, bh);
746  // TODO(eustas): shift-right + narrow?
747  return Vec128<int16_t, N>{
748  wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
749 }
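// Example: MulHigh(Set(d, int16_t{16384}), Set(d, int16_t{16384})) yields
// 4096 in every lane, because 16384 * 16384 = 2^28 and its upper 16 bits
// are 2^28 >> 16 = 4096.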
750 
751 // Multiplies even lanes (0, 2 ..) and returns the double-width result.
752 template <size_t N>
753 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
754  const Vec128<int32_t, N> b) {
755  // TODO(eustas): replace, when implemented in WASM.
756  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
757  const auto ae = wasm_v128_and(a.raw, kEvenMask);
758  const auto be = wasm_v128_and(b.raw, kEvenMask);
759  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
760 }
761 template <size_t N>
762 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
763  const Vec128<uint32_t, N> b) {
764  // TODO(eustas): replace, when implemented in WASM.
765  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
766  const auto ae = wasm_v128_and(a.raw, kEvenMask);
767  const auto be = wasm_v128_and(b.raw, kEvenMask);
768  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
769 }
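// Example: with uint32_t lanes {a0, a1, a2, a3} and {b0, b1, b2, b3},
// MulEven returns the two full 64-bit products {a0 * b0, a2 * b2}. Zeroing
// the odd lanes makes each 64-bit input lane equal to its even 32-bit lane,
// so wasm_i64x2_mul produces the widened product directly.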
770 
771 // ------------------------------ Negate
772 
773 template <typename T, size_t N, HWY_IF_FLOAT(T)>
774 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
775  return Xor(v, SignBit(Simd<T, N>()));
776 }
777 
778 template <size_t N>
779 HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
780  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
781 }
782 template <size_t N>
783 HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
784  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
785 }
786 template <size_t N>
787 HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
788  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
789 }
790 template <size_t N>
791 HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
792  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
793 }
794 
795 // ------------------------------ Floating-point mul / div
796 
797 template <size_t N>
798 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
799  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
800 }
801 
802 template <size_t N>
803 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
804  const Vec128<float, N> b) {
805  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
806 }
807 
808 // Approximate reciprocal
809 template <size_t N>
810 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
811  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
812  return one / v;
813 }
814 
815 // Absolute value of difference.
816 template <size_t N>
817 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
818  const Vec128<float, N> b) {
819  return Abs(a - b);
820 }
821 
822 // ------------------------------ Floating-point multiply-add variants
823 
824 // Returns mul * x + add
825 template <size_t N>
826 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
827  const Vec128<float, N> x,
828  const Vec128<float, N> add) {
829  // TODO(eustas): replace, when implemented in WASM.
830  // TODO(eustas): is it wasm_f32x4_qfma?
831  return mul * x + add;
832 }
833 
834 // Returns add - mul * x
835 template <size_t N>
836 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
837  const Vec128<float, N> x,
838  const Vec128<float, N> add) {
839  // TODO(eustas): replace, when implemented in WASM.
840  return add - mul * x;
841 }
842 
843 // Returns mul * x - sub
844 template <size_t N>
845 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
846  const Vec128<float, N> x,
847  const Vec128<float, N> sub) {
848  // TODO(eustas): replace, when implemented in WASM.
849  // TODO(eustas): is it wasm_f32x4_qfms?
850  return mul * x - sub;
851 }
852 
853 // Returns -mul * x - sub
854 template <size_t N>
855 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
856  const Vec128<float, N> x,
857  const Vec128<float, N> sub) {
858  // TODO(eustas): replace, when implemented in WASM.
859  return Neg(mul) * x - sub;
860 }
861 
862 // ------------------------------ Floating-point square root
863 
864 // Full precision square root
865 template <size_t N>
866 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
867  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
868 }
869 
870 // Approximate reciprocal square root
871 template <size_t N>
872 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
873  // TODO(eustas): find a cheaper way to calculate this.
874  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
875  return one / Sqrt(v);
876 }
877 
878 // ------------------------------ Floating-point rounding
879 
880 // Toward nearest integer, ties to even
881 template <size_t N>
882 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
883  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
884 }
885 
886 // Toward zero, aka truncate
887 template <size_t N>
888 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
889  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
890 }
891 
892 // Toward +infinity, aka ceiling
893 template <size_t N>
894 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
895  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
896 }
897 
898 // Toward -infinity, aka floor
899 template <size_t N>
900 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
901  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
902 }
903 
904 // ================================================== COMPARE
905 
906 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
907 
908 template <typename TFrom, typename TTo, size_t N>
909 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
910  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
911  return Mask128<TTo, N>{m.raw};
912 }
913 
914 template <typename T, size_t N>
915 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
916  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
917  return (v & bit) == bit;
918 }
919 
920 // ------------------------------ Equality
921 
922 // Unsigned
923 template <size_t N>
924 HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
925  const Vec128<uint8_t, N> b) {
926  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
927 }
928 template <size_t N>
929 HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
930  const Vec128<uint16_t, N> b) {
931  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
932 }
933 template <size_t N>
934 HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
935  const Vec128<uint32_t, N> b) {
936  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
937 }
938 
939 // Signed
940 template <size_t N>
941 HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
942  const Vec128<int8_t, N> b) {
943  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
944 }
945 template <size_t N>
946 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
947  Vec128<int16_t, N> b) {
948  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
949 }
950 template <size_t N>
951 HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
952  const Vec128<int32_t, N> b) {
953  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
954 }
955 
956 // Float
957 template <size_t N>
958 HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
959  const Vec128<float, N> b) {
960  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
961 }
962 
963 // ------------------------------ Inequality
964 
965 // Unsigned
966 template <size_t N>
967 HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
968  const Vec128<uint8_t, N> b) {
969  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
970 }
971 template <size_t N>
972 HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
973  const Vec128<uint16_t, N> b) {
974  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
975 }
976 template <size_t N>
977 HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
978  const Vec128<uint32_t, N> b) {
979  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
980 }
981 
982 // Signed
983 template <size_t N>
984 HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
985  const Vec128<int8_t, N> b) {
986  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
987 }
988 template <size_t N>
989 HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
990  Vec128<int16_t, N> b) {
991  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
992 }
993 template <size_t N>
994 HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
995  const Vec128<int32_t, N> b) {
996  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
997 }
998 
999 // Float
1000 template <size_t N>
1001 HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
1002  const Vec128<float, N> b) {
1003  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
1004 }
1005 
1006 // ------------------------------ Strict inequality
1007 
1008 // Signed/float >
1009 template <size_t N>
1010 HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
1011  const Vec128<int8_t, N> b) {
1012  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
1013 }
1014 template <size_t N>
1015 HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
1016  const Vec128<int16_t, N> b) {
1017  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
1018 }
1019 template <size_t N>
1020 HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
1021  const Vec128<int32_t, N> b) {
1022  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
1023 }
1024 template <size_t N>
1025 HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
1026  const Vec128<int64_t, N> b) {
1027  const Simd<int32_t, N * 2> d32;
1028  const auto a32 = BitCast(d32, a);
1029  const auto b32 = BitCast(d32, b);
1030  // If the upper half is less than or greater, this is the answer.
1031  const auto m_gt = VecFromMask(d32, a32 > b32);
1032 
1033  // Otherwise, the lower half decides.
1034  const auto m_eq = VecFromMask(d32, a32 == b32);
1035  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 2, 2, 0, 0);
1036  const auto lo_gt = And(m_eq, Vec128<int32_t, N * 2>{lo_in_hi});
1037 
1038  const auto gt = Or(lo_gt, m_gt);
1039  // Copy result in upper 32 bits to lower 32 bits.
1040  return Mask128<int64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 3, 3, 1, 1)};
1041 }
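// Example: for a = 1 (upper half 0) and b = -1 (upper half -1), the signed
// 32-bit compare of the upper halves already decides a > b, so the lower
// halves never enter into the result.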
1042 
1043 template <size_t N>
1044 HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
1045  const Vec128<float, N> b) {
1046  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
1047 }
1048 
1049 template <typename T, size_t N>
1050 HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
1051  return operator>(b, a);
1052 }
1053 
1054 // ------------------------------ Weak inequality
1055 
1056 // Float <= >=
1057 template <size_t N>
1058 HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
1059  const Vec128<float, N> b) {
1060  return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
1061 }
1062 template <size_t N>
1063 HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1064  const Vec128<float, N> b) {
1065  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
1066 }
1067 
1068 // ------------------------------ FirstN (Iota, Lt)
1069 
1070 template <typename T, size_t N>
1071 HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
1072  const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
1073  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1074 }
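// Example: FirstN(Simd<int32_t, 4>(), 2) sets only lanes 0 and 1, since
// Iota(di, 0) = {0, 1, 2, 3} and only the first two lanes are < 2.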
1075 
1076 // ================================================== LOGICAL
1077 
1078 // ------------------------------ Not
1079 
1080 template <typename T, size_t N>
1081 HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
1082  return Vec128<T, N>{wasm_v128_not(v.raw)};
1083 }
1084 
1085 // ------------------------------ And
1086 
1087 template <typename T, size_t N>
1088 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
1089  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
1090 }
1091 
1092 // ------------------------------ AndNot
1093 
1094 // Returns ~not_mask & mask.
1095 template <typename T, size_t N>
1096 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
1097  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
1098 }
1099 
1100 // ------------------------------ Or
1101 
1102 template <typename T, size_t N>
1103 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
1104  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
1105 }
1106 
1107 // ------------------------------ Xor
1108 
1109 template <typename T, size_t N>
1110 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
1111  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
1112 }
1113 
1114 // ------------------------------ Operator overloads (internal-only if float)
1115 
1116 template <typename T, size_t N>
1117 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
1118  return And(a, b);
1119 }
1120 
1121 template <typename T, size_t N>
1122 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
1123  return Or(a, b);
1124 }
1125 
1126 template <typename T, size_t N>
1127 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
1128  return Xor(a, b);
1129 }
1130 
1131 // ------------------------------ CopySign
1132 
1133 template <typename T, size_t N>
1134 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
1135  const Vec128<T, N> sign) {
1136  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1137  const auto msb = SignBit(Simd<T, N>());
1138  return Or(AndNot(msb, magn), And(msb, sign));
1139 }
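// Example: CopySign(Set(d, 1.5f), Set(d, -2.0f)) yields -1.5f in every
// lane: the magnitude comes from the first argument and the sign bit from
// the second.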
1140 
1141 template <typename T, size_t N>
1142 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
1143  const Vec128<T, N> sign) {
1144  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
1145  return Or(abs, And(SignBit(Simd<T, N>()), sign));
1146 }
1147 
1148 // ------------------------------ BroadcastSignBit (compare)
1149 
1150 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1151 HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
1152  return ShiftRight<sizeof(T) * 8 - 1>(v);
1153 }
1154 template <size_t N>
1155 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
1156  return VecFromMask(Simd<int8_t, N>(), v < Zero(Simd<int8_t, N>()));
1157 }
1158 
1159 // ------------------------------ Mask
1160 
1161 // Mask and Vec are the same (true = FF..FF).
1162 template <typename T, size_t N>
1163 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1164  return Mask128<T, N>{v.raw};
1165 }
1166 
1167 template <typename T, size_t N>
1168 HWY_API Vec128<T, N> VecFromMask(Simd<T, N> /* tag */, Mask128<T, N> v) {
1169  return Vec128<T, N>{v.raw};
1170 }
1171 
1172 // DEPRECATED
1173 template <typename T, size_t N>
1174 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1175  return Vec128<T, N>{v.raw};
1176 }
1177 
1178 // mask ? yes : no
1179 template <typename T, size_t N>
1180 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1181  Vec128<T, N> no) {
1182  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
1183 }
1184 
1185 // mask ? yes : 0
1186 template <typename T, size_t N>
1187 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1188  return yes & VecFromMask(Simd<T, N>(), mask);
1189 }
1190 
1191 // mask ? 0 : no
1192 template <typename T, size_t N>
1193 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1194  return AndNot(VecFromMask(Simd<T, N>(), mask), no);
1195 }
1196 
1197 template <typename T, size_t N, HWY_IF_FLOAT(T)>
1198 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
1199  const Simd<T, N> d;
1200  const auto zero = Zero(d);
1201  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
1202 }
1203 
1204 // ------------------------------ Mask logical
1205 
1206 template <typename T, size_t N>
1207 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1208  return MaskFromVec(Not(VecFromMask(Simd<T, N>(), m)));
1209 }
1210 
1211 template <typename T, size_t N>
1212 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1213  const Simd<T, N> d;
1214  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1215 }
1216 
1217 template <typename T, size_t N>
1218 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1219  const Simd<T, N> d;
1220  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1221 }
1222 
1223 template <typename T, size_t N>
1224 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1225  const Simd<T, N> d;
1226  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1227 }
1228 
1229 template <typename T, size_t N>
1230 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1231  const Simd<T, N> d;
1232  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1233 }
1234 
1235 // ------------------------------ Shl (BroadcastSignBit, IfThenElse)
1236 
1237 // The x86 multiply-by-Pow2() trick will not work because WASM saturates
1238 // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
1239 // scalar count operand, per-lane shift instructions would require extract_lane
1240 // for each lane, and hoping that shuffle is correctly mapped to a native
1241 // instruction. Using non-vector shifts would incur a store-load forwarding
1242 // stall when loading the result vector. We instead test bits of the shift
1243 // count to "predicate" a shift of the entire vector by a constant.
1244 
1245 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1246 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
1247  const Simd<T, N> d;
1248  Mask128<T, N> mask;
1249  // Need a signed type for BroadcastSignBit.
1250  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1251  // Move the highest valid bit of the shift count into the sign bit.
1252  test = ShiftLeft<12>(test);
1253 
1254  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1255  test = ShiftLeft<1>(test); // next bit (descending order)
1256  v = IfThenElse(mask, ShiftLeft<8>(v), v);
1257 
1258  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1259  test = ShiftLeft<1>(test); // next bit (descending order)
1260  v = IfThenElse(mask, ShiftLeft<4>(v), v);
1261 
1262  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1263  test = ShiftLeft<1>(test); // next bit (descending order)
1264  v = IfThenElse(mask, ShiftLeft<2>(v), v);
1265 
1266  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1267  return IfThenElse(mask, ShiftLeft<1>(v), v);
1268 }
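// Worked example for the 16-bit case: a shift count of 5 (binary 0101)
// first lands bit 3 of the count in the sign position. Bits 3 and 1 are
// clear, bits 2 and 0 are set, so the vector is shifted by 4 and then by 1,
// a total of 5.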
1269 
1270 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1271 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
1272  const Simd<T, N> d;
1273  Mask128<T, N> mask;
1274  // Need a signed type for BroadcastSignBit.
1275  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1276  // Move the highest valid bit of the shift count into the sign bit.
1277  test = ShiftLeft<27>(test);
1278 
1279  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1280  test = ShiftLeft<1>(test); // next bit (descending order)
1281  v = IfThenElse(mask, ShiftLeft<16>(v), v);
1282 
1283  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1284  test = ShiftLeft<1>(test); // next bit (descending order)
1285  v = IfThenElse(mask, ShiftLeft<8>(v), v);
1286 
1287  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1288  test = ShiftLeft<1>(test); // next bit (descending order)
1289  v = IfThenElse(mask, ShiftLeft<4>(v), v);
1290 
1291  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1292  test = ShiftLeft<1>(test); // next bit (descending order)
1293  v = IfThenElse(mask, ShiftLeft<2>(v), v);
1294 
1295  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1296  return IfThenElse(mask, ShiftLeft<1>(v), v);
1297 }
1298 
1299 // ------------------------------ Shr (BroadcastSignBit, IfThenElse)
1300 
1301 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1302 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
1303  const Simd<T, N> d;
1304  Mask128<T, N> mask;
1305  // Need a signed type for BroadcastSignBit.
1306  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1307  // Move the highest valid bit of the shift count into the sign bit.
1308  test = ShiftLeft<12>(test);
1309 
1310  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1311  test = ShiftLeft<1>(test); // next bit (descending order)
1312  v = IfThenElse(mask, ShiftRight<8>(v), v);
1313 
1314  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1315  test = ShiftLeft<1>(test); // next bit (descending order)
1316  v = IfThenElse(mask, ShiftRight<4>(v), v);
1317 
1318  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1319  test = ShiftLeft<1>(test); // next bit (descending order)
1320  v = IfThenElse(mask, ShiftRight<2>(v), v);
1321 
1322  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1323  return IfThenElse(mask, ShiftRight<1>(v), v);
1324 }
1325 
1326 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1327 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
1328  const Simd<T, N> d;
1329  Mask128<T, N> mask;
1330  // Need a signed type for BroadcastSignBit.
1331  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
1332  // Move the highest valid bit of the shift count into the sign bit.
1333  test = ShiftLeft<27>(test);
1334 
1335  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1336  test = ShiftLeft<1>(test); // next bit (descending order)
1337  v = IfThenElse(mask, ShiftRight<16>(v), v);
1338 
1339  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1340  test = ShiftLeft<1>(test); // next bit (descending order)
1341  v = IfThenElse(mask, ShiftRight<8>(v), v);
1342 
1343  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1344  test = ShiftLeft<1>(test); // next bit (descending order)
1345  v = IfThenElse(mask, ShiftRight<4>(v), v);
1346 
1347  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1348  test = ShiftLeft<1>(test); // next bit (descending order)
1349  v = IfThenElse(mask, ShiftRight<2>(v), v);
1350 
1351  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
1352  return IfThenElse(mask, ShiftRight<1>(v), v);
1353 }
1354 
1355 // ================================================== MEMORY
1356 
1357 // ------------------------------ Load
1358 
1359 template <typename T>
1360 HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1361  return Vec128<T>{wasm_v128_load(aligned)};
1362 }
1363 
1364 template <typename T, size_t N>
1365 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d,
1366  const T* HWY_RESTRICT aligned) {
1367  return IfThenElseZero(m, Load(d, aligned));
1368 }
1369 
1370 // Partial load.
1371 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1372 HWY_API Vec128<T, N> Load(Simd<T, N> /* tag */, const T* HWY_RESTRICT p) {
1373  Vec128<T, N> v;
1374  CopyBytes<sizeof(T) * N>(p, &v);
1375  return v;
1376 }
1377 
1378 // LoadU == Load.
1379 template <typename T, size_t N>
1380 HWY_API Vec128<T, N> LoadU(Simd<T, N> d, const T* HWY_RESTRICT p) {
1381  return Load(d, p);
1382 }
1383 
1384 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1385 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1386 HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* HWY_RESTRICT p) {
1387  return Load(d, p);
1388 }
1389 
1390 // ------------------------------ Store
1391 
1392 template <typename T>
1393 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
1394  wasm_v128_store(aligned, v.raw);
1395 }
1396 
1397 // Partial store.
1398 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1399 HWY_API void Store(Vec128<T, N> v, Simd<T, N> /* tag */, T* HWY_RESTRICT p) {
1400  CopyBytes<sizeof(T) * N>(&v, p);
1401 }
1402 
1403 HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1> /* tag */,
1404  float* HWY_RESTRICT p) {
1405  *p = wasm_f32x4_extract_lane(v.raw, 0);
1406 }
1407 
1408 // StoreU == Store.
1409 template <typename T, size_t N>
1410 HWY_API void StoreU(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) {
1411  Store(v, d, p);
1412 }
1413 
1414 // ------------------------------ Non-temporal stores
1415 
1416 // Same as aligned stores on non-x86.
1417 
1418 template <typename T, size_t N>
1419 HWY_API void Stream(Vec128<T, N> v, Simd<T, N> /* tag */,
1420  T* HWY_RESTRICT aligned) {
1421  wasm_v128_store(aligned, v.raw);
1422 }
1423 
1424 // ------------------------------ Scatter (Store)
1425 
1426 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
1427 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
1428  const Vec128<Offset, N> offset) {
1429  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1430 
1431  alignas(16) T lanes[N];
1432  Store(v, d, lanes);
1433 
1434  alignas(16) Offset offset_lanes[N];
1435  Store(offset, Simd<Offset, N>(), offset_lanes);
1436 
1437  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
1438  for (size_t i = 0; i < N; ++i) {
1439  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1440  }
1441 }
1442 
1443 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
1444 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
1445  const Vec128<Index, N> index) {
1446  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1447 
1448  alignas(16) T lanes[N];
1449  Store(v, d, lanes);
1450 
1451  alignas(16) Index index_lanes[N];
1452  Store(index, Simd<Index, N>(), index_lanes);
1453 
1454  for (size_t i = 0; i < N; ++i) {
1455  base[index_lanes[i]] = lanes[i];
1456  }
1457 }
1458 
1459 // ------------------------------ Gather (Load/Store)
1460 
1461 template <typename T, size_t N, typename Offset>
1462 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
1463  const T* HWY_RESTRICT base,
1464  const Vec128<Offset, N> offset) {
1465  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
1466 
1467  alignas(16) Offset offset_lanes[N];
1468  Store(offset, Simd<Offset, N>(), offset_lanes);
1469 
1470  alignas(16) T lanes[N];
1471  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1472  for (size_t i = 0; i < N; ++i) {
1473  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1474  }
1475  return Load(d, lanes);
1476 }
1477 
1478 template <typename T, size_t N, typename Index>
1479 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
1480  const Vec128<Index, N> index) {
1481  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
1482 
1483  alignas(16) Index index_lanes[N];
1484  Store(index, Simd<Index, N>(), index_lanes);
1485 
1486  alignas(16) T lanes[N];
1487  for (size_t i = 0; i < N; ++i) {
1488  lanes[i] = base[index_lanes[i]];
1489  }
1490  return Load(d, lanes);
1491 }
1492 
1493 // ================================================== SWIZZLE
1494 
1495 // ------------------------------ Extract lane
1496 
1497 // Gets the single value stored in a vector/part.
1498 template <size_t N>
1499 HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
1500  return wasm_i8x16_extract_lane(v.raw, 0);
1501 }
1502 template <size_t N>
1503 HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
1504  return wasm_i8x16_extract_lane(v.raw, 0);
1505 }
1506 template <size_t N>
1507 HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
1508  return wasm_i16x8_extract_lane(v.raw, 0);
1509 }
1510 template <size_t N>
1511 HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
1512  return wasm_i16x8_extract_lane(v.raw, 0);
1513 }
1514 template <size_t N>
1515 HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
1516  return wasm_i32x4_extract_lane(v.raw, 0);
1517 }
1518 template <size_t N>
1519 HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
1520  return wasm_i32x4_extract_lane(v.raw, 0);
1521 }
1522 template <size_t N>
1523 HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
1524  return wasm_i64x2_extract_lane(v.raw, 0);
1525 }
1526 template <size_t N>
1527 HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
1528  return wasm_i64x2_extract_lane(v.raw, 0);
1529 }
1530 
1531 template <size_t N>
1532 HWY_API float GetLane(const Vec128<float, N> v) {
1533  return wasm_f32x4_extract_lane(v.raw, 0);
1534 }
1535 
1536 // ------------------------------ LowerHalf
1537 
1538 template <typename T, size_t N>
1539 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2> /* tag */, Vec128<T, N> v) {
1540  return Vec128<T, N / 2>{v.raw};
1541 }
1542 
1543 template <typename T, size_t N>
1544 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1545  return LowerHalf(Simd<T, N / 2>(), v);
1546 }
1547 
1548 // ------------------------------ ShiftLeftBytes
1549 
1550 // 0x01..0F, kBytes = 1 => 0x02..0F00
1551 template <int kBytes, typename T, size_t N>
1552 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
1553  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1554  const __i8x16 zero = wasm_i8x16_splat(0);
1555  switch (kBytes) {
1556  case 0:
1557  return v;
1558 
1559  case 1:
1560  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
1561  6, 7, 8, 9, 10, 11, 12, 13, 14)};
1562 
1563  case 2:
1564  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
1565  5, 6, 7, 8, 9, 10, 11, 12, 13)};
1566 
1567  case 3:
1568  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
1569  3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
1570 
1571  case 4:
1572  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
1573  2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
1574 
1575  case 5:
1576  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
1577  1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
1578 
1579  case 6:
1580  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1581  16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
1582 
1583  case 7:
1584  return Vec128<T, N>{wasm_i8x16_shuffle(
1585  v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
1586 
1587  case 8:
1588  return Vec128<T, N>{wasm_i8x16_shuffle(
1589  v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
1590 
1591  case 9:
1592  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1593  16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
1594  6)};
1595 
1596  case 10:
1597  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1598  16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
1599  5)};
1600 
1601  case 11:
1602  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1603  16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
1604  4)};
1605 
1606  case 12:
1607  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1608  16, 16, 16, 16, 16, 16, 16, 0, 1,
1609  2, 3)};
1610 
1611  case 13:
1612  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1613  16, 16, 16, 16, 16, 16, 16, 16, 0,
1614  1, 2)};
1615 
1616  case 14:
1617  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1618  16, 16, 16, 16, 16, 16, 16, 16, 16,
1619  0, 1)};
1620 
1621  case 15:
1622  return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
1623  16, 16, 16, 16, 16, 16, 16, 16, 16,
1624  16, 0)};
1625  }
1626  return Vec128<T, N>{zero};
1627 }
1628 
1629 template <int kBytes, typename T, size_t N>
1630 HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
1631  return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
1632 }
1633 
1634 // ------------------------------ ShiftLeftLanes
1635 
1636 template <int kLanes, typename T, size_t N>
1637 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N> d, const Vec128<T, N> v) {
1638  const Repartition<uint8_t, decltype(d)> d8;
1639  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
1640 }
1641 
1642 template <int kLanes, typename T, size_t N>
1643 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
1644  return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
1645 }
1646 
1647 // ------------------------------ ShiftRightBytes
1648 namespace detail {
1649 
1650 // Helper function allows zeroing invalid lanes in caller.
1651 template <int kBytes, typename T, size_t N>
1652 HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
1653  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1654  const __i8x16 zero = wasm_i8x16_splat(0);
1655 
1656  switch (kBytes) {
1657  case 0:
1658  return v.raw;
1659 
1660  case 1:
1661  return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1662  12, 13, 14, 15, 16);
1663 
1664  case 2:
1665  return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1666  13, 14, 15, 16, 16);
1667 
1668  case 3:
1669  return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1670  13, 14, 15, 16, 16, 16);
1671 
1672  case 4:
1673  return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1674  14, 15, 16, 16, 16, 16);
1675 
1676  case 5:
1677  return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1678  15, 16, 16, 16, 16, 16);
1679 
1680  case 6:
1681  return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1682  16, 16, 16, 16, 16, 16);
1683 
1684  case 7:
1685  return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1686  16, 16, 16, 16, 16, 16, 16);
1687 
1688  case 8:
1689  return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1690  16, 16, 16, 16, 16, 16, 16);
1691 
1692  case 9:
1693  return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
1694  16, 16, 16, 16, 16, 16, 16);
1695 
1696  case 10:
1697  return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
1698  16, 16, 16, 16, 16, 16, 16);
1699 
1700  case 11:
1701  return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
1702  16, 16, 16, 16, 16, 16, 16);
1703 
1704  case 12:
1705  return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
1706  16, 16, 16, 16, 16, 16, 16);
1707 
1708  case 13:
1709  return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
1710  16, 16, 16, 16, 16, 16, 16);
1711 
1712  case 14:
1713  return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
1714  16, 16, 16, 16, 16, 16, 16);
1715 
1716  case 15:
1717  return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
1718  16, 16, 16, 16, 16, 16, 16);
1719  case 16:
1720  return zero;
1721  }
1722 }
1723 
1724 } // namespace detail
1725 
1726 // 0x01..0F, kBytes = 1 => 0x0001..0E
1727 template <int kBytes, typename T, size_t N>
1728 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
1729  // For partial vectors, clear upper lanes so we shift in zeros.
1730  if (N != 16 / sizeof(T)) {
1731  const Vec128<T> vfull{v.raw};
1732  v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
1733  }
1734  return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
1735 }
1736 
1737 // ------------------------------ ShiftRightLanes
1738 template <int kLanes, typename T, size_t N>
1739 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N> d, const Vec128<T, N> v) {
1740  const Repartition<uint8_t, decltype(d)> d8;
1741  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
1742 }
1743 
1744 // ------------------------------ UpperHalf (ShiftRightBytes)
1745 
1746 // Full input: copy hi into lo (smaller instruction encoding than shifts).
1747 template <typename T>
1748 HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Half<Full128<T>> /* tag */,
1749  const Vec128<T> v) {
1750  return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
1751 }
1752 HWY_API Vec128<float, 2> UpperHalf(Half<Full128<float>> /* tag */,
1753  const Vec128<float> v) {
1754  return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
1755 }
1756 
1757 // Partial
1758 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1759 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N>> /* tag */,
1760  Vec128<T, N> v) {
1761  const Simd<T, N> d;
1762  const auto vu = BitCast(RebindToUnsigned<decltype(d)>(), v);
1763  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(vu));
1764  return Vec128<T, (N + 1) / 2>{upper.raw};
1765 }
1766 
1767 // ------------------------------ CombineShiftRightBytes
1768 
1769 template <int kBytes, typename T, class V = Vec128<T>>
1770 HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
1771  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1772  switch (kBytes) {
1773  case 0:
1774  return lo;
1775 
1776  case 1:
1777  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1778  11, 12, 13, 14, 15, 16)};
1779 
1780  case 2:
1781  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1782  11, 12, 13, 14, 15, 16, 17)};
1783 
1784  case 3:
1785  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1786  12, 13, 14, 15, 16, 17, 18)};
1787 
1788  case 4:
1789  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1790  13, 14, 15, 16, 17, 18, 19)};
1791 
1792  case 5:
1793  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1794  14, 15, 16, 17, 18, 19, 20)};
1795 
1796  case 6:
1797  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
1798  14, 15, 16, 17, 18, 19, 20, 21)};
1799 
1800  case 7:
1801  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
1802  15, 16, 17, 18, 19, 20, 21, 22)};
1803 
1804  case 8:
1805  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
1806  16, 17, 18, 19, 20, 21, 22, 23)};
1807 
1808  case 9:
1809  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
1810  17, 18, 19, 20, 21, 22, 23, 24)};
1811 
1812  case 10:
1813  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
1814  17, 18, 19, 20, 21, 22, 23, 24, 25)};
1815 
1816  case 11:
1817  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
1818  18, 19, 20, 21, 22, 23, 24, 25, 26)};
1819 
1820  case 12:
1821  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
1822  19, 20, 21, 22, 23, 24, 25, 26, 27)};
1823 
1824  case 13:
1825  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
1826  20, 21, 22, 23, 24, 25, 26, 27, 28)};
1827 
1828  case 14:
1829  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
1830  21, 22, 23, 24, 25, 26, 27, 28, 29)};
1831 
1832  case 15:
1833  return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
1834  22, 23, 24, 25, 26, 27, 28, 29, 30)};
1835  }
1836  return hi;
1837 }
1838 
1839 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
1840  class V = Vec128<T, N>>
1841 HWY_API V CombineShiftRightBytes(Simd<T, N> d, V hi, V lo) {
1841 HWY_API V CombineShiftRightBytes(Simd<T, N> d, V hi, V lo) {
1842  constexpr size_t kSize = N * sizeof(T);
1843  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
1844  const Repartition<uint8_t, decltype(d)> d8;
1845  const Full128<uint8_t> d_full8;
1846  using V8 = VFromD<decltype(d_full8)>;
1847  const V8 hi8{BitCast(d8, hi).raw};
1848  // Move into most-significant bytes
1849  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
1850  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
1851  return V{BitCast(Full128<T>(), r).raw};
1852 }
1853 
1854 // ------------------------------ Broadcast/splat any lane
1855 
1856 // Unsigned
1857 template <int kLane, size_t N>
1858 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
1859  static_assert(0 <= kLane && kLane < N, "Invalid lane");
1860  return Vec128<uint16_t, N>{wasm_i16x8_shuffle(
1861  v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
1862 }
1863 template <int kLane, size_t N>
1864 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
1865  static_assert(0 <= kLane && kLane < N, "Invalid lane");
1866  return Vec128<uint32_t, N>{
1867  wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
1868 }
1869 
1870 // Signed
1871 template <int kLane, size_t N>
1872 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
1873  static_assert(0 <= kLane && kLane < N, "Invalid lane");
1874  return Vec128<int16_t, N>{wasm_i16x8_shuffle(
1875  v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
1876 }
1877 template <int kLane, size_t N>
1878 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
1879  static_assert(0 <= kLane && kLane < N, "Invalid lane");
1880  return Vec128<int32_t, N>{
1881  wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
1882 }
1883 
1884 // Float
1885 template <int kLane, size_t N>
1886 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
1887  static_assert(0 <= kLane && kLane < N, "Invalid lane");
1888  return Vec128<float, N>{
1889  wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
1890 }
1891 
1892 // ------------------------------ TableLookupBytes
1893 
1894 // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
1895 // lane indices in [0, 16).
1896 template <typename T, size_t N, typename TI, size_t NI>
1897 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
1898  const Vec128<TI, NI> from) {
1899 // Not yet available in all engines, see
1900 // https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
1901 // V8 implementation of this had a bug, fixed on 2021-04-03:
1902 // https://chromium-review.googlesource.com/c/v8/v8/+/2822951
1903 #if 0
1904  return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
1905 #else
1906  alignas(16) uint8_t control[16];
1907  alignas(16) uint8_t input[16];
1908  alignas(16) uint8_t output[16];
1909  wasm_v128_store(control, from.raw);
1910  wasm_v128_store(input, bytes.raw);
1911  for (size_t i = 0; i < 16; ++i) {
1912  output[i] = control[i] < 16 ? input[control[i]] : 0;
1913  }
1914  return Vec128<TI, NI>{wasm_v128_load(output)};
1915 #endif
1916 }
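// Usage sketch (hypothetical values): if bytes = {10, 20, 30, ...} and
// from = {2, 0, ...}, the result begins {30, 10, ...}. Indices >= 16 yield 0,
// matching the wasm_i8x16_swizzle semantics that the loop above emulates.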
1917 
1918 template <typename T, size_t N, typename TI, size_t NI>
1919 HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
1920  const Vec128<TI, NI> from) {
1921  const Simd<TI, NI> d;
1922  // Mask size must match vector type, so cast everything to this type.
1923  Repartition<int8_t, decltype(d)> di8;
1924  Repartition<int8_t, Simd<T, N>> d_bytes8;
1925  const auto msb = BitCast(di8, from) < Zero(di8);
1926  const auto lookup =
1927  TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
1928  return BitCast(d, IfThenZeroElse(msb, lookup));
1929 }
1930 
1931 // ------------------------------ Hard-coded shuffles
1932 
1933 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
1934 // Shuffle0321 rotates one lane to the right (the previous least-significant
1935 // lane is now most-significant). These could also be implemented via
1936 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
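// Example: if v holds {3,2,1,0}, Shuffle2301 yields {2,3,0,1} and Shuffle0321
// yields {0,3,2,1}.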
1937 
1938 // Swap 32-bit halves in 64-bit halves.
1939 HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
1940  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
1941 }
1942 HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
1943  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
1944 }
1945 HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
1946  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
1947 }
1948 
1949 // Swap 64-bit halves
1950 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
1951  return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
1952 }
1953 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
1954  return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
1955 }
1956 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
1957  return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
1958 }
1959 
1960 // Rotate right 32 bits
1961 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
1962  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
1963 }
1964 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
1965  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
1966 }
1967 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
1968  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
1969 }
1970 // Rotate left 32 bits
1971 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
1972  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
1973 }
1974 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
1975  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
1976 }
1977 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
1978  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
1979 }
1980 
1981 // Reverse
1982 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
1983  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
1984 }
1985 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
1986  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
1987 }
1988 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
1989  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
1990 }
1991 
1992 // ------------------------------ TableLookupLanes
1993 
1994 // Returned by SetTableIndices for use by TableLookupLanes.
1995 template <typename T, size_t N>
1996 struct Indices128 {
1997  __v128_u raw;
1998 };
1999 
2000 template <typename T, size_t N, HWY_IF_LE128(T, N)>
2001 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
2002 #if HWY_IS_DEBUG_BUILD
2003  for (size_t i = 0; i < N; ++i) {
2004  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
2005  }
2006 #endif
2007 
2008  const Repartition<uint8_t, decltype(d)> d8;
2009  alignas(16) uint8_t control[16] = {0};
2010  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
2011  for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
2012  control[idx_lane * sizeof(T) + idx_byte] =
2013  static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
2014  }
2015  }
2016  return Indices128<T, N>{Load(d8, control).raw};
2017 }
2018 
2019 template <size_t N>
2020 HWY_API Vec128<uint32_t, N> TableLookupLanes(
2021  const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
2022  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
2023 }
2024 template <size_t N>
2025 HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
2026  const Indices128<int32_t, N> idx) {
2027  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
2028 }
2029 template <size_t N>
2030 HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
2031  const Indices128<float, N> idx) {
2032  const Simd<int32_t, N> di;
2033  const Simd<float, N> df;
2034  return BitCast(df,
2035  TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
2036 }
2037 
2038 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
2039 
2040 template <typename T>
2041 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
2042  return Shuffle0123(v);
2043 }
2044 
2045 template <typename T>
2046 HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
2047  return Vec128<T, 2>(Shuffle2301(Vec128<T>(v.raw)).raw);
2048 }
2049 
2050 template <typename T>
2051 HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
2052  return v;
2053 }
2054 
2055 // ------------------------------ InterleaveLower
2056 
2057 template <size_t N>
2058 HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
2059  Vec128<uint8_t, N> b) {
2060  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
2061  a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2062 }
2063 template <size_t N>
2064 HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
2065  Vec128<uint16_t, N> b) {
2066  return Vec128<uint16_t, N>{
2067  wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2068 }
2069 template <size_t N>
2070 HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
2071  Vec128<uint32_t, N> b) {
2072  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2073 }
2074 template <size_t N>
2075 HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
2076  Vec128<uint64_t, N> b) {
2077  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2078 }
2079 
2080 template <size_t N>
2081 HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
2082  Vec128<int8_t, N> b) {
2083  return Vec128<int8_t, N>{wasm_i8x16_shuffle(
2084  a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2085 }
2086 template <size_t N>
2087 HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
2088  Vec128<int16_t, N> b) {
2089  return Vec128<int16_t, N>{
2090  wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2091 }
2092 template <size_t N>
2093 HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
2094  Vec128<int32_t, N> b) {
2095  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2096 }
2097 template <size_t N>
2098 HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
2099  Vec128<int64_t, N> b) {
2100  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
2101 }
2102 
2103 template <size_t N>
2104 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
2105  Vec128<float, N> b) {
2106  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
2107 }
2108 
2109 // Additional overload for the optional Simd<> tag.
2110 template <typename T, size_t N, class V = Vec128<T, N>>
2111 HWY_API V InterleaveLower(Simd<T, N> /* tag */, V a, V b) {
2112  return InterleaveLower(a, b);
2113 }
2114 
2115 // ------------------------------ InterleaveUpper (UpperHalf)
2116 
2117 // All functions inside detail lack the required D parameter.
2118 namespace detail {
2119 
2120 template <size_t N>
2121 HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
2122  Vec128<uint8_t, N> b) {
2123  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2124  26, 11, 27, 12, 28, 13, 29, 14,
2125  30, 15, 31)};
2126 }
2127 template <size_t N>
2128 HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
2129  Vec128<uint16_t, N> b) {
2130  return Vec128<uint16_t, N>{
2131  wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2132 }
2133 template <size_t N>
2134 HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
2135  Vec128<uint32_t, N> b) {
2136  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2137 }
2138 template <size_t N>
2139 HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
2140  Vec128<uint64_t, N> b) {
2141  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2142 }
2143 
2144 template <size_t N>
2145 HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
2146  Vec128<int8_t, N> b) {
2147  return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
2148  26, 11, 27, 12, 28, 13, 29, 14,
2149  30, 15, 31)};
2150 }
2151 template <size_t N>
2152 HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
2153  Vec128<int16_t, N> b) {
2154  return Vec128<int16_t, N>{
2155  wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2156 }
2157 template <size_t N>
2158 HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
2159  Vec128<int32_t, N> b) {
2160  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2161 }
2162 template <size_t N>
2163 HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
2164  Vec128<int64_t, N> b) {
2165  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
2166 }
2167 
2168 template <size_t N>
2169 HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
2170  Vec128<float, N> b) {
2171  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
2172 }
2173 
2174 } // namespace detail
2175 
2176 // Full
2177 template <typename T, class V = Vec128<T>>
2178 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
2179  return detail::InterleaveUpper(a, b);
2180 }
2181 
2182 // Partial
2183 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
2184 HWY_API V InterleaveUpper(Simd<T, N> d, V a, V b) {
2185  const Half<decltype(d)> d2;
2186  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
2187 }
2188 
2189 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2190 
2191 // Same as Interleave*, except that the return lanes are double-width integers;
2192 // this is necessary because the single-lane scalar cannot return two values.
2193 template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>
2194 HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b) {
2195  return BitCast(DW(), InterleaveLower(a, b));
2196 }
2197 template <typename T, size_t N, class D = Simd<T, N>,
2198  class DW = RepartitionToWide<D>>
2199 HWY_API VFromD<DW> ZipLower(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
2200  return BitCast(dw, InterleaveLower(D(), a, b));
2201 }
2202 
2203 template <typename T, size_t N, class D = Simd<T, N>,
2204  class DW = RepartitionToWide<D>>
2205 HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
2206  return BitCast(dw, InterleaveUpper(D(), a, b));
2207 }
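// Example: for u8 inputs a and b, ZipLower returns u16 lanes whose low byte is
// a[i] and high byte is b[i], i.e. the little-endian view of the interleaved
// bytes.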
2208 
2209 // ================================================== COMBINE
2210 
2211 // ------------------------------ Combine (InterleaveLower)
2212 
2213 // N = N/2 + N/2 (upper half undefined)
2214 template <typename T, size_t N>
2215 HWY_API Vec128<T, N> Combine(Simd<T, N> d, Vec128<T, N / 2> hi_half,
2216  Vec128<T, N / 2> lo_half) {
2217  const Half<decltype(d)> d2;
2218  const RebindToUnsigned<decltype(d2)> du2;
2219  // Treat half-width input as one lane, and expand to two lanes.
2220  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
2221  const VU lo{BitCast(du2, lo_half).raw};
2222  const VU hi{BitCast(du2, hi_half).raw};
2223  return BitCast(d, InterleaveLower(lo, hi));
2224 }
2225 
2226 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
2227 
2228 template <typename T, size_t N>
2229 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N> d, Vec128<T, N / 2> lo) {
2230  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
2231 }
2232 
2233 // ------------------------------ ConcatLowerLower
2234 
2235 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2236 template <typename T>
2237 HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
2238  const Vec128<T> lo) {
2239  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
2240 }
2241 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2242 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N> d, const Vec128<T, N> hi,
2243  const Vec128<T, N> lo) {
2244  const Half<decltype(d)> d2;
2245  return Combine(LowerHalf(d2, hi), LowerHalf(d2, lo));
2246 }
2247 
2248 // ------------------------------ ConcatUpperUpper
2249 
2250 template <typename T>
2251 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
2252  const Vec128<T> lo) {
2253  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
2254 }
2255 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2256 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N> d, const Vec128<T, N> hi,
2257  const Vec128<T, N> lo) {
2258  const Half<decltype(d)> d2;
2259  return Combine(UpperHalf(d2, hi), UpperHalf(d2, lo));
2260 }
2261 
2262 // ------------------------------ ConcatLowerUpper
2263 
2264 template <typename T>
2265 HWY_API Vec128<T> ConcatLowerUpper(const Full128<T> d, const Vec128<T> hi,
2266  const Vec128<T> lo) {
2267  return CombineShiftRightBytes<8>(d, hi, lo);
2268 }
2269 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2270 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N> d, const Vec128<T, N> hi,
2271  const Vec128<T, N> lo) {
2272  const Half<decltype(d)> d2;
2273  return Combine(LowerHalf(d2, hi), UpperHalf(d2, lo));
2274 }
2275 
2276 // ------------------------------ ConcatUpperLower
2277 template <typename T, size_t N>
2278 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N> d, const Vec128<T, N> hi,
2279  const Vec128<T, N> lo) {
2280  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
2281 }
2282 
2283 // ------------------------------ ConcatOdd
2284 
2285 // 32-bit full
2286 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2287 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2288  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2289 }
2290 
2291 // 32-bit partial
2292 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2293 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2> /* tag */, Vec128<T, 2> hi,
2294  Vec128<T, 2> lo) {
2295  return InterleaveUpper(Simd<T, 2>(), lo, hi);
2296 }
2297 
2298 // 64-bit full - no partial because we need at least two inputs to have
2299 // even/odd.
2300 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2301 HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2302  return InterleaveUpper(Full128<T>(), lo, hi);
2303 }
2304 
2305 // ------------------------------ ConcatEven (InterleaveLower)
2306 
2307 // 32-bit full
2308 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2309 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2310  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2311 }
2312 
2313 // 32-bit partial
2314 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2315 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2> /* tag */, Vec128<T, 2> hi,
2316  Vec128<T, 2> lo) {
2317  return InterleaveLower(Simd<T, 2>(), lo, hi);
2318 }
2319 
2320 // 64-bit full - no partial because we need at least two inputs to have
2321 // even/odd.
2322 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2323 HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
2324  return InterleaveLower(Full128<T>(), lo, hi);
2325 }
2326 
2327 // ------------------------------ OddEven
2328 
2329 namespace detail {
2330 
2331 template <typename T, size_t N>
2332 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
2333  const Vec128<T, N> b) {
2334  const Simd<T, N> d;
2335  const Repartition<uint8_t, decltype(d)> d8;
2336  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2337  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2338  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
2339 }
2340 template <typename T, size_t N>
2341 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
2342  const Vec128<T, N> b) {
2343  return Vec128<T, N>{
2344  wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2345 }
2346 template <typename T, size_t N>
2347 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
2348  const Vec128<T, N> b) {
2349  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2350 }
2351 template <typename T, size_t N>
2352 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
2353  const Vec128<T, N> b) {
2354  return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
2355 }
2356 
2357 } // namespace detail
2358 
2359 template <typename T, size_t N>
2360 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
2361  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
2362 }
2363 template <size_t N>
2364 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
2365  const Vec128<float, N> b) {
2366  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2367 }
2368 
2369 // ================================================== CONVERT
2370 
2371 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2372 
2373 // Unsigned: zero-extend.
2374 template <size_t N>
2375 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
2376  const Vec128<uint8_t, N> v) {
2377  return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
2378 }
2379 template <size_t N>
2380 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
2381  const Vec128<uint8_t, N> v) {
2382  return Vec128<uint32_t, N>{
2383  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2384 }
2385 template <size_t N>
2386 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
2387  const Vec128<uint8_t, N> v) {
2388  return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
2389 }
2390 template <size_t N>
2391 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2392  const Vec128<uint8_t, N> v) {
2393  return Vec128<int32_t, N>{
2394  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2395 }
2396 template <size_t N>
2397 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
2398  const Vec128<uint16_t, N> v) {
2399  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
2400 }
2401 template <size_t N>
2402 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2403  const Vec128<uint16_t, N> v) {
2404  return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
2405 }
2406 
2407 // Signed: replicate sign bit.
2408 template <size_t N>
2409 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
2410  const Vec128<int8_t, N> v) {
2411  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
2412 }
2413 template <size_t N>
2414 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2415  const Vec128<int8_t, N> v) {
2416  return Vec128<int32_t, N>{
2417  wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
2418 }
2419 template <size_t N>
2420 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
2421  const Vec128<int16_t, N> v) {
2422  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
2423 }
2424 
2425 template <size_t N>
2426 HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
2427  const Vec128<int32_t, N> v) {
2428  return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
2429 }
2430 
2431 template <size_t N>
2432 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
2433  const Vec128<float16_t, N> v) {
2434  const Simd<int32_t, N> di32;
2435  const Simd<uint32_t, N> du32;
2436  const Simd<float, N> df32;
2437  // Expand to u32 so we can shift.
2438  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
2439  const auto sign = ShiftRight<15>(bits16);
2440  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2441  const auto mantissa = bits16 & Set(du32, 0x3FF);
2442  const auto subnormal =
2443  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2444  Set(df32, 1.0f / 16384 / 1024));
2445 
2446  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2447  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2448  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2449  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2450  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2451 }
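// Recap of the binary16 layout handled above: 1 sign bit, 5 exponent bits
// (bias 15), 10 mantissa bits. Normal values are re-biased to the binary32
// exponent (biased_exp + 112) with the mantissa shifted up by 13; subnormals
// (biased_exp == 0) are instead converted via a float multiply by 2^-24
// (= 1.0f / 16384 / 1024).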
2452 
2453 template <size_t N>
2454 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> df32,
2455  const Vec128<bfloat16_t, N> v) {
2456  const Rebind<uint16_t, decltype(df32)> du16;
2457  const RebindToSigned<decltype(df32)> di32;
2458  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2459 }
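// bfloat16 is the upper half of a binary32, so promotion only needs to move
// the 16 stored bits into the upper half of each 32-bit lane (the ShiftLeft
// above); no exponent re-biasing is required.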
2460 
2461 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2462 
2463 template <size_t N>
2464 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
2465  const Vec128<int32_t, N> v) {
2466  return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
2467 }
2468 
2469 template <size_t N>
2470 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
2471  const Vec128<int32_t, N> v) {
2472  return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
2473 }
2474 
2475 template <size_t N>
2476 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
2477  const Vec128<int32_t, N> v) {
2478  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2479  return Vec128<uint8_t, N>{
2480  wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2481 }
2482 
2483 template <size_t N>
2484 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
2485  const Vec128<int16_t, N> v) {
2486  return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
2487 }
2488 
2489 template <size_t N>
2490 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
2491  const Vec128<int32_t, N> v) {
2492  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2493  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2494 }
2495 
2496 template <size_t N>
2497 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
2498  const Vec128<int16_t, N> v) {
2499  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
2500 }
2501 
2502 template <size_t N>
2503 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N> /* tag */,
2504  const Vec128<double, N> v) {
2505  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
2506 }
2507 
2508 template <size_t N>
2509 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
2510  const Vec128<float, N> v) {
2511  const Simd<int32_t, N> di;
2512  const Simd<uint32_t, N> du;
2513  const Simd<uint16_t, N> du16;
2514  const auto bits32 = BitCast(du, v);
2515  const auto sign = ShiftRight<31>(bits32);
2516  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2517  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2518 
2519  const auto k15 = Set(di, 15);
2520  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2521  const auto is_tiny = exp < Set(di, -24);
2522 
2523  const auto is_subnormal = exp < Set(di, -14);
2524  const auto biased_exp16 =
2525  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2526  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2527  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2528  (mantissa32 >> (Set(du, 13) + sub_exp));
2529  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2530  ShiftRight<13>(mantissa32)); // <1024
2531 
2532  const auto sign16 = ShiftLeft<15>(sign);
2533  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2534  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2535  return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
2536 }
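// Note: mantissa bits below binary16 precision are truncated (ShiftRight<13>)
// rather than rounded to nearest, and exponents above 15 are clamped by the
// Min above instead of overflowing to infinity.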
2537 
2538 template <size_t N>
2539 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N> dbf16,
2540  const Vec128<float, N> v) {
2541  const Rebind<int32_t, decltype(dbf16)> di32;
2542  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2543  const Rebind<uint16_t, decltype(dbf16)> du16;
2544  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2545  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2546 }
2547 
2548 template <size_t N>
2549 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
2550  Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
2551  const RebindToUnsigned<decltype(dbf16)> du16;
2552  const Repartition<uint32_t, decltype(dbf16)> du32;
2553  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
2554  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2555 }
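// The "Reorder" in the name indicates the output is not in strict a0,b0,a1,b1
// order: lanes from b land in even positions and lanes from a in odd
// positions, a pairing that ReorderWidenMulAccumulate below consumes
// correctly.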
2556 
2557 // For already range-limited input [0, 255].
2558 template <size_t N>
2559 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
2560  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2561  return Vec128<uint8_t, N>{
2562  wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2563 }
2564 
2565 // ------------------------------ Convert i32 <=> f32 (Round)
2566 
2567 template <size_t N>
2568 HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
2569  const Vec128<int32_t, N> v) {
2570  return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
2571 }
2572 // Truncates (rounds toward zero).
2573 template <size_t N>
2574 HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
2575  const Vec128<float, N> v) {
2576  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
2577 }
2578 
2579 template <size_t N>
2580 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
2581  return ConvertTo(Simd<int32_t, N>(), Round(v));
2582 }
2583 
2584 // ================================================== MISC
2585 
2586 // ------------------------------ LoadMaskBits (TestBit)
2587 
2588 namespace detail {
2589 
2590 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2591 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
2592  const RebindToUnsigned<decltype(d)> du;
2593  // Easier than Set(), which would require a >8-bit type and would not
2594  // compile for T=uint8_t, N=1.
2595  const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
2596 
2597  // Replicate bytes 8x such that each byte contains the bit that governs it.
2598  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2599  1, 1, 1, 1, 1, 1, 1, 1};
2600  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
2601 
2602  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2603  1, 2, 4, 8, 16, 32, 64, 128};
2604  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
2605 }
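// In effect, byte i of the result receives bit i of `bits`: each byte first
// gets a copy of the mask byte that governs it, then TestBit isolates its own
// bit, yielding 0xFF for set bits and 0x00 otherwise.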
2606 
2607 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2608 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
2609  const RebindToUnsigned<decltype(d)> du;
2610  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2611  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2612 }
2613 
2614 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2615 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
2616  const RebindToUnsigned<decltype(d)> du;
2617  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2618  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2619 }
2620 
2621 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2622 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t bits) {
2623  const RebindToUnsigned<decltype(d)> du;
2624  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
2625  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2626 }
2627 
2628 } // namespace detail
2629 
2630 // `bits` points to at least 8 readable bytes, not all of which need be valid.
2631 template <typename T, size_t N, HWY_IF_LE128(T, N)>
2632 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d,
2633  const uint8_t* HWY_RESTRICT bits) {
2634  uint64_t mask_bits = 0;
2635  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
2636  return detail::LoadMaskBits(d, mask_bits);
2637 }
2638 
2639 // ------------------------------ Mask
2640 
2641 namespace detail {
2642 
2643 // Full
2644 template <typename T>
2645 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2646  const Mask128<T> mask) {
2647  alignas(16) uint64_t lanes[2];
2648  wasm_v128_store(lanes, mask.raw);
2649 
2650  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2651  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2652  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2653  return (hi + lo);
2654 }
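// How the multiply works: each mask byte is 0x00 or 0xFF, and multiplying by
// kMagic accumulates one bit per byte into the top bits of the 64-bit product,
// so the shifts above extract an 8-bit movemask from each u64 half (the
// standard multiply-based movemask trick).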
2655 
2656 // 64-bit
2657 template <typename T>
2658 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2659  const Mask128<T, 8> mask) {
2660  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2661  return (wasm_i64x2_extract_lane(mask.raw, 0) * kMagic) >> 56;
2662 }
2663 
2664 // 32-bit or less: need masking
2665 template <typename T, size_t N, HWY_IF_LE32(T, N)>
2666 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2667  const Mask128<T, N> mask) {
2668  uint64_t bytes = wasm_i64x2_extract_lane(mask.raw, 0);
2669  // Clear potentially undefined bytes.
2670  bytes &= (1ULL << (N * 8)) - 1;
2671  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2672  return (bytes * kMagic) >> 56;
2673 }
2674 
2675 template <typename T, size_t N>
2676 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
2677  const Mask128<T, N> mask) {
2678  // Remove useless lower half of each u16 while preserving the sign bit.
2679  const __i16x8 zero = wasm_i16x8_splat(0);
2680  const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
2681  return BitsFromMask(hwy::SizeTag<1>(), mask8);
2682 }
2683 
2684 template <typename T, size_t N>
2685 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
2686  const Mask128<T, N> mask) {
2687  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
2688  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2689  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2690  alignas(16) uint32_t lanes[4];
2691  wasm_v128_store(lanes, sliced_mask);
2692  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2693 }
2694 
2695 // Returns only the lowest N bits of the BitsFromMask result.
2696 template <typename T, size_t N>
2697 constexpr uint64_t OnlyActive(uint64_t bits) {
2698  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
2699 }
2700 
2701 // Returns 0xFF for bytes with index >= N, otherwise 0.
2702 template <size_t N>
2703 constexpr __i8x16 BytesAbove() {
2704  return
2705  (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
2706  : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
2707  : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
2708  : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
2709  : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
2710  : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
2711  : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
2712  : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
2713  : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
2714  : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2715  -1, -1, -1, -1, -1)
2716  : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2717  -1, -1, -1, -1)
2718  : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
2719  -1, -1, -1, -1)
2720  : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
2721  -1, -1, -1)
2722  : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
2723  -1, -1, -1)
2724  : (N == 11)
2725  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
2726  : (N == 13)
2727  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
2728  : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
2729 }
2730 
2731 template <typename T, size_t N>
2732 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
2733  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
2734 }
2735 
2736 template <typename T>
2737 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
2738  return PopCount(BitsFromMask(tag, m));
2739 }
2740 
2741 template <typename T>
2742 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
2743  return PopCount(BitsFromMask(tag, m));
2744 }
2745 
2746 template <typename T>
2747 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
2748  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
2749  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
2750  alignas(16) uint64_t lanes[2];
2751  wasm_v128_store(lanes, shifted_bits);
2752  return PopCount(lanes[0] | lanes[1]);
2753 }
2754 
2755 } // namespace detail
2756 
2757 // `bits` points to at least 8 writable bytes.
2758 template <typename T, size_t N>
2759 HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
2760  const Mask128<T, N> mask, uint8_t* bits) {
2761  const uint64_t mask_bits = detail::BitsFromMask(mask);
2762  const size_t kNumBytes = (N + 7) / 8;
2763  CopyBytes<kNumBytes>(&mask_bits, bits);
2764  return kNumBytes;
2765 }
2766 
2767 template <typename T, size_t N>
2768 HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T> m) {
2769  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
2770 }
2771 
2772 // Partial vector
2773 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2774 HWY_API size_t CountTrue(const Simd<T, N> d, const Mask128<T, N> m) {
2775  // Ensure all undefined bytes are 0.
2776  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
2777  return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
2778 }
2779 
2780 // Full vector
2781 template <typename T>
2782 HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
2783 #if 0
2784  // Casting followed by wasm_i8x16_any_true results in wasm error:
2785  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
2786  const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
2787  return !wasm_i8x16_any_true(v8.raw);
2788 #else
2789  (void)d;
2790  return (wasm_i64x2_extract_lane(m.raw, 0) |
2791  wasm_i64x2_extract_lane(m.raw, 1)) == 0;
2792 #endif
2793 }
2794 
2795 // Full vector
2796 namespace detail {
2797 template <typename T>
2798 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
2799  return wasm_i8x16_all_true(m.raw);
2800 }
2801 template <typename T>
2802 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
2803  return wasm_i16x8_all_true(m.raw);
2804 }
2805 template <typename T>
2806 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
2807  return wasm_i32x4_all_true(m.raw);
2808 }
2809 
2810 } // namespace detail
2811 
2812 template <typename T, size_t N>
2813 HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T> m) {
2814  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
2815 }
2816 
2817 // Partial vectors
2818 
2819 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2820 HWY_API bool AllFalse(Simd<T, N> /* tag */, const Mask128<T, N> m) {
2821  // Ensure all undefined bytes are 0.
2822  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
2823  return AllFalse(Mask128<T>{AndNot(mask, m).raw});
2824 }
2825 
2826 template <typename T, size_t N, HWY_IF_LE64(T, N)>
2827 HWY_API bool AllTrue(const Simd<T, N> d, const Mask128<T, N> m) {
2828  // Ensure all undefined bytes are FF.
2829  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
2830  return AllTrue(d, Mask128<T>{Or(mask, m).raw});
2831 }
2832 
2833 template <typename T, size_t N>
2834 HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
2835  const Mask128<T, N> mask) {
2836  const uint64_t bits = detail::BitsFromMask(mask);
2837  return bits ? Num0BitsBelowLS1Bit_Nonzero64(bits) : -1;
2838 }
2839 
2840 // ------------------------------ Compress
2841 
2842 namespace detail {
2843 
2844 template <typename T, size_t N>
2845 HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
2846  HWY_DASSERT(mask_bits < 256);
2847  const Simd<T, N> d;
2848  const Rebind<uint8_t, decltype(d)> d8;
2849  const Simd<uint16_t, N> du;
2850 
2851  // We need byte indices for TableLookupBytes (one vector's worth for each of
2852  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
2853  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
2854  // with the doubling baked into the table. Unpacking nibbles is likely more
2855  // costly than the higher cache footprint from storing bytes.
2856  alignas(16) constexpr uint8_t table[256 * 8] = {
2857  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
2858  0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
2859  0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
2860  0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
2861  0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
2862  6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
2863  0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
2864  0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2865  2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
2866  0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
2867  0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
2868  0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
2869  0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
2870  6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
2871  8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
2872  0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
2873  4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
2874  10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
2875  0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
2876  0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
2877  0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
2878  4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
2879  0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
2880  0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2881  2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
2882  10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
2883  0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
2884  0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
2885  0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
2886  0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
2887  0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
2888  0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
2889  6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
2890  12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
2891  0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
2892  0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
2893  0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
2894  8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
2895  0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
2896  0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2897  2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
2898  8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
2899  12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
2900  0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
2901  0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
2902  10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
2903  12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
2904  0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
2905  4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
2906  6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
2907  0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
2908  0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
2909  0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
2910  4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
2911  12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
2912  0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2913  2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
2914  0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
2915  0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
2916  0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
2917  0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
2918  14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
2919  0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
2920  0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
2921  8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
2922  14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
2923  0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
2924  0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
2925  0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
2926  6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
2927  14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
2928  0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2929  2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
2930  14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
2931  0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
2932  0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
2933  0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
2934  6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
2935  10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
2936  0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
2937  4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
2938  8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
2939  0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
2940  0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
2941  0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
2942  4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
2943  0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
2944  0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2945  2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
2946  14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
2947  0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
2948  0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
2949  0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
2950  12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
2951  14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
2952  0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
2953  6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
2954  8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
2955  14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
2956  0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
2957  0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
2958  10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
2959  14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
2960  0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2961  2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
2962  10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
2963  12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
2964  0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
2965  0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
2966  8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
2967  10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
2968  0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
2969  4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
2970  6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
2971 
2972  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
2973  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
2974  return BitCast(d, pairs + Set(du, 0x0100));
2975 }
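// The last two steps turn lane indices into byte indices: ZipLower duplicates
// each table byte into both halves of a u16, and adding 0x0100 increments only
// the upper byte, producing the (2*lane, 2*lane+1) byte pairs that
// TableLookupBytes expects.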
2976 
2977 template <typename T, size_t N>
2978 HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
2979  HWY_DASSERT(mask_bits < 16);
2980 
2981  // There are only 4 lanes, so we can afford to load the index vector directly.
2982  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
2983  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2984  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2985  4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2986  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
2987  8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2988  0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2989  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2990  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
2991  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2992  0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2993  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2994  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
2995  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2996  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2997  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2998  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2999 
3000  const Simd<T, N> d;
3001  const Repartition<uint8_t, decltype(d)> d8;
3002  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
3003 }
3004 
3005 #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64
3006 
3007 template <typename T, size_t N>
3008 HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
3009  HWY_DASSERT(mask_bits < 4);
3010 
3011  // There are only 2 lanes, so we can afford to load the index vector directly.
3012  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
3013  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
3014  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
3015  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
3016  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3017 
3018  const Simd<T, N> d;
3019  const Repartition<uint8_t, decltype(d)> d8;
3020  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
3021 }
3022 
3023 #endif
3024 
3025 // Helper functions called by both Compress and CompressStore - avoids a
3026 // redundant BitsFromMask in the latter.
3027 
3028 template <typename T, size_t N>
3029 HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
3030  const uint64_t mask_bits) {
3031  const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
3032  using D = Simd<T, N>;
3033  const RebindToSigned<D> di;
3034  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
3035 }
3036 
3037 template <typename T, size_t N>
3038 HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
3039  const uint64_t mask_bits) {
3040  const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
3041  using D = Simd<T, N>;
3042  const RebindToSigned<D> di;
3043  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
3044 }
3045 
3046 #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64
3047 
3048 template <typename T, size_t N>
3049 HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/,
3050  Vec128<T, N> v,
3051  const uint64_t mask_bits) {
3052  const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
3053  using D = Simd<T, N>;
3054  const RebindToSigned<D> di;
3055  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
3056 }
3057 
3058 #endif
3059 
3060 } // namespace detail
3061 
3062 template <typename T, size_t N>
3063 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
3064  const uint64_t mask_bits = detail::BitsFromMask(mask);
3065  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3066 }
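// Example (hypothetical values): for int32 lanes {1, 2, 3, 4} (lane 0 first)
// and mask bits 0b1010 (lanes 1 and 3 active), the result begins {2, 4, ...};
// per the packed_array table above, the remaining lanes are filled from
// lane 0.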
3067 
3068 // ------------------------------ CompressBits
3069 
3070 template <typename T, size_t N>
3071 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
3072  const uint8_t* HWY_RESTRICT bits) {
3073  uint64_t mask_bits = 0;
3074  constexpr size_t kNumBytes = (N + 7) / 8;
3075  CopyBytes<kNumBytes>(bits, &mask_bits);
3076  if (N < 8) {
3077  mask_bits &= (1ull << N) - 1;
3078  }
3079 
3080  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3081 }
3082 
3083 // ------------------------------ CompressStore
3084 
3085 template <typename T, size_t N>
3086 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
3087  Simd<T, N> d, T* HWY_RESTRICT unaligned) {
3088  const uint64_t mask_bits = detail::BitsFromMask(mask);
3089  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3090  StoreU(c, d, unaligned);
3091  return PopCount(mask_bits);
3092 }
3093 
3094 // ------------------------------ CompressBitsStore
3095 
3096 template <typename T, size_t N>
3097 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
3098  const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
3099  T* HWY_RESTRICT unaligned) {
3100  uint64_t mask_bits = 0;
3101  constexpr size_t kNumBytes = (N + 7) / 8;
3102  CopyBytes<kNumBytes>(bits, &mask_bits);
3103  if (N < 8) {
3104  mask_bits &= (1ull << N) - 1;
3105  }
3106 
3107  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
3108  StoreU(c, d, unaligned);
3109  return PopCount(mask_bits);
3110 }
3111 
3112 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
3113 // TableLookupBytes)
3114 
3115 // 128 bits
3116 HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
3117  const Vec128<uint8_t> c, Full128<uint8_t> d,
3118  uint8_t* HWY_RESTRICT unaligned) {
3119  const auto k5 = Set(d, 5);
3120  const auto k6 = Set(d, 6);
3121 
3122  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
3123  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3124  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3125  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3126  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3127  alignas(16) static constexpr uint8_t tbl_g0[16] = {
3128  0x80, 0, 0x80, 0x80, 1, 0x80, //
3129  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3130  const auto shuf_r0 = Load(d, tbl_r0);
3131  const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
3132  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
3133  const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
3134  const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
3135  const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
3136  const auto int0 = r0 | g0 | b0;
3137  StoreU(int0, d, unaligned + 0 * 16);
3138 
3139  // Second vector: g10,r10, bgr[9:6], b5,g5
3140  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
3141  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
3142  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
3143  const auto r1 = TableLookupBytes(a, shuf_r1);
3144  const auto g1 = TableLookupBytes(b, shuf_g1);
3145  const auto b1 = TableLookupBytes(c, shuf_b1);
3146  const auto int1 = r1 | g1 | b1;
3147  StoreU(int1, d, unaligned + 1 * 16);
3148 
3149  // Third vector: bgr[15:11], b10
3150  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
3151  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
3152  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
3153  const auto r2 = TableLookupBytes(a, shuf_r2);
3154  const auto g2 = TableLookupBytes(b, shuf_g2);
3155  const auto b2 = TableLookupBytes(c, shuf_b2);
3156  const auto int2 = r2 | g2 | b2;
3157  StoreU(int2, d, unaligned + 2 * 16);
3158 }
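// Altogether the three stores write 48 bytes in fully interleaved order
// a0 b0 c0 a1 b1 c1 ... a15 b15 c15, e.g. RGB for 16 pixels.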
3159 
3160 // 64 bits
3161 HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
3162  const Vec128<uint8_t, 8> b,
3163  const Vec128<uint8_t, 8> c, Simd<uint8_t, 8> d,
3164  uint8_t* HWY_RESTRICT unaligned) {
3165  // Use full vectors for the shuffles and first result.
3166  const Full128<uint8_t> d_full;
3167  const auto k5 = Set(d_full, 5);
3168  const auto k6 = Set(d_full, 6);
3169 
3170  const Vec128<uint8_t> full_a{a.raw};
3171  const Vec128<uint8_t> full_b{b.raw};
3172  const Vec128<uint8_t> full_c{c.raw};
3173 
3174  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
3175  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3176  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3177  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3178  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3179  alignas(16) static constexpr uint8_t tbl_g0[16] = {
3180  0x80, 0, 0x80, 0x80, 1, 0x80, //
3181  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3182  const auto shuf_r0 = Load(d_full, tbl_r0);
3183  const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
3184  const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
3185  const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
3186  const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
3187  const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
3188  const auto int0 = r0 | g0 | b0;
3189  StoreU(int0, d_full, unaligned + 0 * 16);
3190 
3191  // Second (HALF) vector: bgr[7:6], b5,g5
3192  const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
3193  const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
3194  const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
3195  const auto r1 = TableLookupBytes(full_a, shuf_r1);
3196  const auto g1 = TableLookupBytes(full_b, shuf_g1);
3197  const auto b1 = TableLookupBytes(full_c, shuf_b1);
3198  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
3199  StoreU(int1, d, unaligned + 1 * 16);
3200 }
3201 
3202 // <= 32 bits
3203 template <size_t N, HWY_IF_LE32(uint8_t, N)>
3204 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
3205  const Vec128<uint8_t, N> b,
3206  const Vec128<uint8_t, N> c,
3207  Simd<uint8_t, N> /*tag*/,
3208  uint8_t* HWY_RESTRICT unaligned) {
3209  // Use full vectors for the shuffles and result.
3210  const Full128<uint8_t> d_full;
3211 
3212  const Vec128<uint8_t> full_a{a.raw};
3213  const Vec128<uint8_t> full_b{b.raw};
3214  const Vec128<uint8_t> full_c{c.raw};
3215 
3216  // Shuffle (a,b,c) vector bytes to bgr[3:0].
3217  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3218  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3219  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
3220  0x80, 0x80, 0x80, 0x80};
3221  const auto shuf_r0 = Load(d_full, tbl_r0);
3222  const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
3223  const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
3224  const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
3225  const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
3226  const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
3227  const auto int0 = r0 | g0 | b0;
3228  alignas(16) uint8_t buf[16];
3229  StoreU(int0, d_full, buf);
3230  CopyBytes<N * 3>(buf, unaligned);
3231 }
3232 
3233 // ------------------------------ StoreInterleaved4
3234 
3235 // 128 bits
3236 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
3237  const Vec128<uint8_t> v1,
3238  const Vec128<uint8_t> v2,
3239  const Vec128<uint8_t> v3, Full128<uint8_t> d8,
3240  uint8_t* HWY_RESTRICT unaligned) {
3241  const RepartitionToWide<decltype(d8)> d16;
3242  const RepartitionToWide<decltype(d16)> d32;
3243  // let a,b,c,d denote v0..3.
3244  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
3245  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
3246  const auto ba8 = ZipUpper(d16, v0, v1);
3247  const auto dc8 = ZipUpper(d16, v2, v3);
3248  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
3249  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
3250  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
3251  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
3252  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
3253  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
3254  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
3255  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
3256 }
3257 
3258 // 64 bits
3259 HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
3260  const Vec128<uint8_t, 8> in1,
3261  const Vec128<uint8_t, 8> in2,
3262  const Vec128<uint8_t, 8> in3,
3263  Simd<uint8_t, 8> /* tag */,
3264  uint8_t* HWY_RESTRICT unaligned) {
3265  // Use full vectors to reduce the number of stores.
3266  const Full128<uint8_t> d_full8;
3267  const RepartitionToWide<decltype(d_full8)> d16;
3268  const RepartitionToWide<decltype(d16)> d32;
3269  const Vec128<uint8_t> v0{in0.raw};
3270  const Vec128<uint8_t> v1{in1.raw};
3271  const Vec128<uint8_t> v2{in2.raw};
3272  const Vec128<uint8_t> v3{in3.raw};
3273  // let a,b,c,d denote v0..3.
3274  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
3275  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
3276  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
3277  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
3278  StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
3279  StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
3280 }
3281 
3282 // <= 32 bits
3283 template <size_t N, HWY_IF_LE32(uint8_t, N)>
3284 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
3285  const Vec128<uint8_t, N> in1,
3286  const Vec128<uint8_t, N> in2,
3287  const Vec128<uint8_t, N> in3,
3288  Simd<uint8_t, N> /*tag*/,
3289  uint8_t* HWY_RESTRICT unaligned) {
3290  // Use full vectors to reduce the number of stores.
3291  const Full128<uint8_t> d_full8;
3292  const RepartitionToWide<decltype(d_full8)> d16;
3293  const RepartitionToWide<decltype(d16)> d32;
3294  const Vec128<uint8_t> v0{in0.raw};
3295  const Vec128<uint8_t> v1{in1.raw};
3296  const Vec128<uint8_t> v2{in2.raw};
3297  const Vec128<uint8_t> v3{in3.raw};
3298  // let a,b,c,d denote v0..3.
3299  const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0
3300  const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0
3301  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
3302  alignas(16) uint8_t buf[16];
3303  StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
3304  CopyBytes<4 * N>(buf, unaligned);
3305 }
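
// Example (editorial sketch, not part of the original header): with N = 2,
// the overload above interleaves 8 valid bytes; the 16-byte stack buffer
// plus CopyBytes<8> avoids writing past the caller's 8-byte destination.
// Sample data are hypothetical.
static inline void ExampleStoreInterleaved4_2px(uint8_t* HWY_RESTRICT out) {
  const Simd<uint8_t, 2> d;
  alignas(16) const uint8_t r[2] = {1, 2};
  alignas(16) const uint8_t g[2] = {3, 4};
  alignas(16) const uint8_t b[2] = {5, 6};
  alignas(16) const uint8_t a[2] = {7, 8};
  // out = {1,3,5,7, 2,4,6,8}.
  StoreInterleaved4(Load(d, r), Load(d, g), Load(d, b), Load(d, a), d, out);
}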
3306 
3307 // ------------------------------ MulEven/Odd (Load)
3308 
3309 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
3310  const Vec128<uint64_t> b) {
3311  alignas(16) uint64_t mul[2];
3312  mul[0] =
3313  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
3314  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
3315  return Load(Full128<uint64_t>(), mul);
3316 }
3317 
3318 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
3319  const Vec128<uint64_t> b) {
3320  alignas(16) uint64_t mul[2];
3321  mul[0] =
3322  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
3323  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
3324  return Load(Full128<uint64_t>(), mul);
3325 }
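
// Example (editorial sketch, not part of the original header): MulEven
// returns the full 128-bit product of the lane-0 u64 values (low half in
// lane 0, high half in lane 1); MulOdd does the same for lane 1. Values are
// hypothetical.
static inline Vec128<uint64_t> ExampleMulEven64() {
  const Full128<uint64_t> d;
  const Vec128<uint64_t> a = Set(d, ~uint64_t{0});  // 2^64 - 1
  const Vec128<uint64_t> b = Set(d, uint64_t{2});
  // Product is 2^65 - 2: lane 0 = 0xFFFFFFFFFFFFFFFE, lane 1 = 1.
  return MulEven(a, b);
}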
3326 
3327 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3328 
3329 template <size_t N>
3330 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32,
3331  Vec128<bfloat16_t, 2 * N> a,
3332  Vec128<bfloat16_t, 2 * N> b,
3333  const Vec128<float, N> sum0,
3334  Vec128<float, N>& sum1) {
3335  const Repartition<uint16_t, decltype(df32)> du16;
3336  const RebindToUnsigned<decltype(df32)> du32;
3337  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
3338  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
3339  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3340  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
3341  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3342  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3343  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3344 }
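
// Example (editorial sketch, not part of the original header): the zips
// above place each bf16 in the upper half of a u32 lane, which is exactly
// its f32 bit pattern, so MulAdd accumulates without a separate conversion.
// A hypothetical dot-product step:
static inline Vec128<float> ExampleBF16MulAcc(Vec128<bfloat16_t> a,
                                              Vec128<bfloat16_t> b) {
  const Full128<float> df32;
  Vec128<float> sum1 = Zero(df32);
  const Vec128<float> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  // Lane order is unspecified ("Reorder"), but sum0 + sum1 covers all pairs.
  return sum0 + sum1;
}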
3345 
3346 // ------------------------------ Reductions
3347 
3348 namespace detail {
3349 
3350 // N=1 for any T: no-op
3351 template <typename T>
3352 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
3353  const Vec128<T, 1> v) {
3354  return v;
3355 }
3356 template <typename T>
3357 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
3358  const Vec128<T, 1> v) {
3359  return v;
3360 }
3361 template <typename T>
3362 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
3363  const Vec128<T, 1> v) {
3364  return v;
3365 }
3366 
3367 // u32/i32/f32:
3368 
3369 // N=2
3370 template <typename T>
3371 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
3372  const Vec128<T, 2> v10) {
3373  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
3374 }
3375 template <typename T>
3376 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
3377  const Vec128<T, 2> v10) {
3378  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
3379 }
3380 template <typename T>
3381 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
3382  const Vec128<T, 2> v10) {
3383  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
3384 }
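
// Note (editorial): the casts via .raw above temporarily treat the 2-lane
// vector as a full vector so Shuffle2301 can swap lanes 0 and 1; whatever
// the shuffle produces in the unused upper lanes is discarded when wrapping
// the result back into Vec128<T, 2>.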
3385 
3386 // N=4 (full)
3387 template <typename T>
3388 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
3389  const Vec128<T> v3210) {
3390  const Vec128<T> v1032 = Shuffle1032(v3210);
3391  const Vec128<T> v31_20_31_20 = v3210 + v1032;
3392  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
3393  return v20_31_20_31 + v31_20_31_20;
3394 }
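
// Worked trace (editorial): for v3210 = {v3,v2,v1,v0}, Shuffle1032 yields
// {v1,v0,v3,v2}; the add gives {v3+v1, v2+v0, v3+v1, v2+v0}. Shuffle0321
// rotates by one lane so the other pair lines up, and the final add leaves
// v0+v1+v2+v3 in every lane. MinOfLanes/MaxOfLanes below follow the same
// two-step pattern with Min/Max in place of +.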
3395 template <typename T>
3396 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
3397  const Vec128<T> v3210) {
3398  const Vec128<T> v1032 = Shuffle1032(v3210);
3399  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
3400  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
3401  return Min(v20_31_20_31, v31_20_31_20);
3402 }
3403 template <typename T>
3404 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
3405  const Vec128<T> v3210) {
3406  const Vec128<T> v1032 = Shuffle1032(v3210);
3407  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
3408  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
3409  return Max(v20_31_20_31, v31_20_31_20);
3410 }
3411 
3412 // u64/i64/f64:
3413 
3414 // N=2 (full)
3415 template <typename T>
3416 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
3417  const Vec128<T> v10) {
3418  const Vec128<T> v01 = Shuffle01(v10);
3419  return v10 + v01;
3420 }
3421 template <typename T>
3422 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
3423  const Vec128<T> v10) {
3424  const Vec128<T> v01 = Shuffle01(v10);
3425  return Min(v10, v01);
3426 }
3427 template <typename T>
3428 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
3429  const Vec128<T> v10) {
3430  const Vec128<T> v01 = Shuffle01(v10);
3431  return Max(v10, v01);
3432 }
3433 
3434 } // namespace detail
3435 
3436 // Supported for u/i/f 32/64. Returns the same value in each lane.
3437 template <typename T, size_t N>
3438 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
3439  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
3440 }
3441 template <typename T, size_t N>
3442 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
3443  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
3444 }
3445 template <typename T, size_t N>
3446 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
3447  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
3448 }
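
// Example (editorial sketch, not part of the original header): horizontal
// sum of four floats; every lane of the SumOfLanes result holds the total,
// and GetLane extracts lane 0. Data are hypothetical.
static inline float ExampleSumOfLanes() {
  const Full128<float> d;
  alignas(16) const float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  return GetLane(SumOfLanes(d, Load(d, in)));  // 10.0f
}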
3449 
3450 // ================================================== DEPRECATED
3451 
3452 template <typename T, size_t N>
3453 HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
3454  return StoreMaskBits(Simd<T, N>(), mask, bits);
3455 }
3456 
3457 template <typename T, size_t N>
3458 HWY_API bool AllTrue(const Mask128<T, N> mask) {
3459  return AllTrue(Simd<T, N>(), mask);
3460 }
3461 
3462 template <typename T, size_t N>
3463 HWY_API bool AllFalse(const Mask128<T, N> mask) {
3464  return AllFalse(Simd<T, N>(), mask);
3465 }
3466 
3467 template <typename T, size_t N>
3468 HWY_API size_t CountTrue(const Mask128<T, N> mask) {
3469  return CountTrue(Simd<T, N>(), mask);
3470 }
3471 
3472 template <typename T, size_t N>
3473 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
3474  return SumOfLanes(Simd<T, N>(), v);
3475 }
3476 template <typename T, size_t N>
3477 HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
3478  return MinOfLanes(Simd<T, N>(), v);
3479 }
3480 template <typename T, size_t N>
3481 HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
3482  return MaxOfLanes(Simd<T, N>(), v);
3483 }
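
// Example (editorial sketch): the tag-less overloads above only forward to
// the tag-taking forms; new code should pass the descriptor explicitly. The
// function name is hypothetical.
static inline Vec128<float> ExampleTaggedReduction(Vec128<float> v) {
  return SumOfLanes(Full128<float>(), v);  // preferred over SumOfLanes(v)
}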
3484 
3485 template <typename T, size_t N>
3486 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Vec128<T, N> v) {
3487  return UpperHalf(Half<Simd<T, N>>(), v);
3488 }
3489 
3490 template <int kBytes, typename T, size_t N>
3491 HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
3492  return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
3493 }
3494 
3495 template <int kLanes, typename T, size_t N>
3496 HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
3497  return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
3498 }
3499 
3500 template <size_t kBytes, typename T, size_t N>
3501 HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
3502  return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
3503 }
3504 
3505 template <typename T, size_t N>
3506 HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
3507  return InterleaveUpper(Simd<T, N>(), a, b);
3508 }
3509 
3510 template <typename T, size_t N, class D = Simd<T, N>>
3511 HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
3512  return InterleaveUpper(RepartitionToWide<D>(), a, b);
3513 }
3514 
3515 template <typename T, size_t N2>
3516 HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
3517  return Combine(Simd<T, N2 * 2>(), hi2, lo2);
3518 }
3519 
3520 template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
3521 HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
3522  return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
3523 }
3524 
3525 template <typename T, size_t N>
3526 HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
3527  return ConcatLowerLower(Simd<T, N>(), hi, lo);
3528 }
3529 
3530 template <typename T, size_t N>
3531 HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
3532  return ConcatUpperUpper(Simd<T, N>(), hi, lo);
3533 }
3534 
3535 template <typename T, size_t N>
3536 HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
3537  const Vec128<T, N> lo) {
3538  return ConcatLowerUpper(Simd<T, N>(), hi, lo);
3539 }
3540 
3541 template <typename T, size_t N>
3542 HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
3543  return ConcatUpperLower(Simd<T, N>(), hi, lo);
3544 }
3545 
3546 // ================================================== Operator wrapper
3547 
3548 template <class V>
3549 HWY_API V Add(V a, V b) {
3550  return a + b;
3551 }
3552 template <class V>
3553 HWY_API V Sub(V a, V b) {
3554  return a - b;
3555 }
3556 
3557 template <class V>
3558 HWY_API V Mul(V a, V b) {
3559  return a * b;
3560 }
3561 template <class V>
3562 HWY_API V Div(V a, V b) {
3563  return a / b;
3564 }
3565 
3566 template <class V>
3567 V Shl(V a, V b) {
3568  return a << b;
3569 }
3570 template <class V>
3571 V Shr(V a, V b) {
3572  return a >> b;
3573 }
3574 
3575 template <class V>
3576 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
3577  return a == b;
3578 }
3579 template <class V>
3580 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
3581  return a != b;
3582 }
3583 template <class V>
3584 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
3585  return a < b;
3586 }
3587 
3588 template <class V>
3589 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
3590  return a > b;
3591 }
3592 template <class V>
3593 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
3594  return a >= b;
3595 }
3596 
3597 template <class V>
3598 HWY_API auto Le(V a, V b) -> decltype(a == b) {
3599  return a <= b;
3600 }
3601 
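// Example (editorial sketch, not part of the original header): the wrappers
// let generic code name operators uniformly, e.g. an a*x + y step written
// without referencing operator overloads. The function name is hypothetical.
template <class V>
HWY_API V ExampleAxpy(V a, V x, V y) {
  return Add(Mul(a, x), y);  // same as a * x + y
}
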
3602 // NOLINTNEXTLINE(google-readability-namespace-comments)
3603 } // namespace HWY_NAMESPACE
3604 } // namespace hwy