wasm_256-inl.h

// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit WASM vectors and operations. Experimental.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#include "hwy/ops/wasm_128-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// TODO(richardwinterton): add this to DeduceD in wasm_128 similar to x86_128.
template <typename T>
class Vec256 {
 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  Vec128<T> v0;
  Vec128<T> v1;
};
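
// Usage sketch (editorial illustration, not part of the original header): the
// compound operators above forward to the non-member overloads defined later
// in this file, so callers can write the following. `ScaleAndBias` is a
// hypothetical helper name used only for illustration.
#if 0
HWY_ATTR void ScaleAndBias(Vec256<float>& v, const Vec256<float> scale,
                           const Vec256<float> bias) {
  v *= scale;  // forwards to operator*(Vec256<float>, Vec256<float>)
  v += bias;   // likewise forwards to operator+
}
#endif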

template <typename T>
struct Mask256 {
  Mask128<T> m0;
  Mask128<T> m1;
};

// ------------------------------ BitCast

template <typename T, typename FromT>
HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = BitCast(dh, v.v0);
  ret.v1 = BitCast(dh, v.v1);
  return ret;

  // TODO(richardwinterton): implement other ops like this
}

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T>
HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
  return Vec256<T>{wasm_i32x4_splat(0)};
}
HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
  return Vec256<float>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
  return Vec256<uint8_t>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
  return Vec256<uint16_t>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
  return Vec256<uint32_t>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
  return Vec256<uint64_t>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
  return Vec256<int8_t>{wasm_i8x16_splat(t)};
}
HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
  return Vec256<int16_t>{wasm_i16x8_splat(t)};
}
HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
  return Vec256<int32_t>{wasm_i32x4_splat(t)};
}
HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
  return Vec256<int64_t>{wasm_i64x2_splat(t)};
}

HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
  return Vec256<float>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T>
HWY_API Vec256<T> Undefined(Full256<T> d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, typename T2>
Vec256<T> Iota(const Full256<T> d, const T2 first) {
  // Full256 has 32 bytes of lanes (the original "16" would under-fill).
  HWY_ALIGN T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
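
// Example (editorial sketch, not part of the original header): Iota fills
// ascending lane values, e.g. Iota(d, 10) yields 10, 11, 12, ... per lane.
// `IotaDemo` is a hypothetical name used only for illustration.
#if 0
HWY_ATTR void IotaDemo() {
  const Full256<int32_t> d;
  const auto v = Iota(d, 10);  // lanes: 10, 11, 12, ...
  HWY_ALIGN int32_t out[32 / sizeof(int32_t)];
  Store(v, d, out);  // out[i] == 10 + static_cast<int32_t>(i)
}
#endif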

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_add(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_add(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_add(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_add(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_add(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_add(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_sub(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_sub(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_sub(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_sub(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_sub(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_sub(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_sub(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
  HWY_ABORT("not implemented");
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
                                    const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
                                     const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_add_sat(a.raw, b.raw)};
}
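
// Example (editorial sketch, not part of the original header): saturation
// clamps instead of wrapping. `SaturationDemo` is a hypothetical name.
#if 0
HWY_ATTR void SaturationDemo() {
  const Full256<uint8_t> d;
  // Every lane is 255 (the uint8_t maximum), not (200 + 100) % 256 == 44.
  const auto sum = SaturatedAdd(Set(d, uint8_t{200}), Set(d, uint8_t{100}));
  (void)sum;
}
#endif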

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
                                    const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
                                     const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_avgr(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_avgr(a.raw, b.raw)};
}
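
// Worked example (editorial note): the +1 rounds halves up, so per lane
// AverageRound(7, 8) == (7 + 8 + 1) / 2 == 8, whereas a plain (a + b) / 2
// would truncate to 7.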

// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
  return Vec256<int8_t>{wasm_i8x16_abs(v.raw)};
}
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_abs(v.raw)};
}
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_abs(v.raw)};
}
HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
  return Vec256<int64_t>{wasm_i64x2_abs(v.raw)};
}

HWY_API Vec256<float> Abs(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_abs(v.raw)};
}
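
// Edge-case note (editorial): for int8_t, Abs of -128 yields -128 again,
// because two's complement has no +128; this is the "LimitsMin() maps to
// LimitsMax() + 1" behavior documented above.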

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits>
HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, kBits)};
}

// Signed
template <int kBits>
HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, kBits)};
}

// 8-bit
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
  const Full256<T> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<T> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits>
HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
  const Full256<uint8_t> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits>
HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
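
// Worked example (editorial note): for uint32_t lanes,
// RotateRight<8>(0x12345678) == 0x78123456. ShiftRight<8> contributes
// 0x00123456, ShiftLeft<24> wraps the low byte around to 0x78000000, and Or
// combines the two.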

// ------------------------------ Shift lanes by same variable #bits

// Unsigned
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
                                       const int bits) {
  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, bits)};
}
HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
                                        const int bits) {
  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, bits)};
}
HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
                                       const int bits) {
  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, bits)};
}
HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
                                        const int bits) {
  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, bits)};
}

// Signed
HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, bits)};
}
HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
                                       const int bits) {
  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, bits)};
}
HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, bits)};
}
HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
                                       const int bits) {
  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, bits)};
}

// 8-bit
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
  const Full256<T> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<T> shifted{ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}

HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
  const Full256<uint8_t> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<uint8_t> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ------------------------------ Minimum

// Unsigned
HWY_API Vec256<uint8_t> Min(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_min(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Min(Vec256<uint16_t> a, const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_min(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Min(Vec256<uint32_t> a, const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_u32x4_min(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Min(Vec256<uint64_t> a, const Vec256<uint64_t> b) {
  // Scalar fallback; use an integer array (the original float array would
  // lose precision) and extract from the raw vectors.
  alignas(32) uint64_t min[2];
  min[0] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 0),
                   wasm_u64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 1),
                   wasm_u64x2_extract_lane(b.raw, 1));
  return Vec256<uint64_t>{wasm_v128_load(min)};
}

// Signed
HWY_API Vec256<int8_t> Min(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_min(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Min(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_min(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Min(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_min(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Min(Vec256<int64_t> a, Vec256<int64_t> b) {
  alignas(32) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec256<int64_t>{wasm_v128_load(min)};
}

// Float
HWY_API Vec256<float> Min(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_min(a.raw, b.raw)};
}

// ------------------------------ Maximum

// Unsigned
HWY_API Vec256<uint8_t> Max(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_max(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Max(Vec256<uint16_t> a, const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_max(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Max(Vec256<uint32_t> a, const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_u32x4_max(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Max(Vec256<uint64_t> a, const Vec256<uint64_t> b) {
  alignas(32) uint64_t max[2];
  max[0] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 0),
                   wasm_u64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 1),
                   wasm_u64x2_extract_lane(b.raw, 1));
  return Vec256<uint64_t>{wasm_v128_load(max)};
}

// Signed
HWY_API Vec256<int8_t> Max(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_max(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Max(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_max(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Max(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_max(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Max(Vec256<int64_t> a, Vec256<int64_t> b) {
  alignas(32) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec256<int64_t>{wasm_v128_load(max)};
}

// Float
HWY_API Vec256<float> Max(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_max(a.raw, b.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
HWY_API Vec256<uint16_t> operator*(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_mul(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator*(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int16_t> operator*(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_mul(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator*(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec256<uint16_t> MulHigh(const Vec256<uint16_t> a,
                                 const Vec256<uint16_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
                                const Vec256<int16_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
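
// Worked example (editorial note): with uint16_t lanes a = 0x4000 and
// b = 0x0010, the widened 32-bit product is 0x00040000, so MulHigh returns
// its upper half, 0x0004. The odd-index shuffle above selects exactly those
// upper 16-bit halves of the 32-bit products.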

HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
  HWY_ASSERT(0);
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
                                const Vec256<int32_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec256<int64_t>{wasm_i64x2_mul(ae, be)};
}
HWY_API Vec256<uint64_t> MulEven(const Vec256<uint32_t> a,
                                 const Vec256<uint32_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec256<uint64_t>{wasm_i64x2_mul(ae, be)};
}

// ------------------------------ Negate

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> Neg(const Vec256<T> v) {
  return Xor(v, SignBit(Full256<T>()));
}

HWY_API Vec256<int8_t> Neg(const Vec256<int8_t> v) {
  return Vec256<int8_t>{wasm_i8x16_neg(v.raw)};
}
HWY_API Vec256<int16_t> Neg(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_neg(v.raw)};
}
HWY_API Vec256<int32_t> Neg(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_neg(v.raw)};
}
HWY_API Vec256<int64_t> Neg(const Vec256<int64_t> v) {
  return Vec256<int64_t>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_mul(a.raw, b.raw)};
}

HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_div(a.raw, b.raw)};
}

// Approximate reciprocal
HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
  return one / v;
}

// Absolute value of difference.
HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> add) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfma?
  return mul * x + add;
}

// Returns add - mul * x
HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> add) {
  // TODO(eustas): replace, when implemented in WASM.
  return add - mul * x;
}

// Returns mul * x - sub
HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfms?
  return mul * x - sub;
}

// Returns -mul * x - sub
HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root
HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_sqrt(v.raw)};
}

// Approximate reciprocal square root
HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
  return one / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
HWY_API Vec256<float> Round(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_nearest(v.raw)};
}

// Toward zero, aka truncate
HWY_API Vec256<float> Trunc(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
HWY_API Vec256<float> Ceil(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_ceil(v.raw)};
}

// Toward -infinity, aka floor
HWY_API Vec256<float> Floor(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_floor(v.raw)};
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <typename TFrom, typename TTo>
HWY_API Mask256<TTo> RebindMask(Full256<TTo> /* tag */, Mask256<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask256<TTo>{m.raw};
}

template <typename T>
HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
HWY_API Mask256<uint8_t> operator==(const Vec256<uint8_t> a,
                                    const Vec256<uint8_t> b) {
  return Mask256<uint8_t>{wasm_i8x16_eq(a.raw, b.raw)};
}
HWY_API Mask256<uint16_t> operator==(const Vec256<uint16_t> a,
                                     const Vec256<uint16_t> b) {
  return Mask256<uint16_t>{wasm_i16x8_eq(a.raw, b.raw)};
}
HWY_API Mask256<uint32_t> operator==(const Vec256<uint32_t> a,
                                     const Vec256<uint32_t> b) {
  return Mask256<uint32_t>{wasm_i32x4_eq(a.raw, b.raw)};
}

// Signed
HWY_API Mask256<int8_t> operator==(const Vec256<int8_t> a,
                                   const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_eq(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator==(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_eq(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator==(const Vec256<int32_t> a,
                                    const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_eq(a.raw, b.raw)};
}

// Float
HWY_API Mask256<float> operator==(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
HWY_API Mask256<uint8_t> operator!=(const Vec256<uint8_t> a,
                                    const Vec256<uint8_t> b) {
  return Mask256<uint8_t>{wasm_i8x16_ne(a.raw, b.raw)};
}
HWY_API Mask256<uint16_t> operator!=(const Vec256<uint16_t> a,
                                     const Vec256<uint16_t> b) {
  return Mask256<uint16_t>{wasm_i16x8_ne(a.raw, b.raw)};
}
HWY_API Mask256<uint32_t> operator!=(const Vec256<uint32_t> a,
                                     const Vec256<uint32_t> b) {
  return Mask256<uint32_t>{wasm_i32x4_ne(a.raw, b.raw)};
}

// Signed
HWY_API Mask256<int8_t> operator!=(const Vec256<int8_t> a,
                                   const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_ne(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator!=(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_ne(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator!=(const Vec256<int32_t> a,
                                    const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_ne(a.raw, b.raw)};
}

// Float
HWY_API Mask256<float> operator!=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

HWY_API Mask256<int8_t> operator>(const Vec256<int8_t> a,
                                  const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_gt(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
                                   const Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_gt(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
                                   const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_gt(a.raw, b.raw)};
}
HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
                                   const Vec256<int64_t> b) {
  const Rebind<int32_t, DFromV<decltype(a)>> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper half is less than or greater, this is the answer.
  const auto m_gt = a32 < b32;

  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
  const auto lo_gt = And(m_eq, lo_in_hi);

  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask256<int64_t>{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)};
}

template <typename T, HWY_IF_UNSIGNED(T)>
HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
  const Full256<T> du;
  const RebindToSigned<decltype(du)> di;
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}

HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_gt(a.raw, b.raw)};
}

template <typename T>
HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float <= >=
HWY_API Mask256<float> operator<=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_le(a.raw, b.raw)};
}
HWY_API Mask256<float> operator>=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_ge(a.raw, b.raw)};
}

// ------------------------------ FirstN (Iota, Lt)

template <typename T>
HWY_API Mask256<T> FirstN(const Full256<T> d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}
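
// Example (editorial sketch, not part of the original header): FirstN builds
// a mask that is true for lanes [0, num). `PartialStore` is a hypothetical
// name used only for illustration.
#if 0
HWY_ATTR void PartialStore(Vec256<float> v, float* HWY_RESTRICT p,
                           size_t num) {
  const Full256<float> d;
  // Writes only the first `num` lanes; the rest keep their previous values.
  BlendedStore(v, FirstN(d, num), d, p);
}
#endif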

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T>
HWY_API Vec256<T> Not(Vec256<T> v) {
  return Vec256<T>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  return Vec256<T>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
  return And(a, b);
}

template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
  return Or(a, b);
}

template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Full256<T>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T>
HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Full256<T>()), sign));
}

// ------------------------------ BroadcastSignBit (compare)

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
  return VecFromMask(Full256<int8_t>(), v < Zero(Full256<int8_t>()));
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T>
HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
  return Mask256<T>{v.raw};
}

template <typename T>
HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, Mask256<T> v) {
  return Vec256<T>{v.raw};
}

// mask ? yes : no
template <typename T>
HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return Vec256<T>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T>
HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
  return yes & VecFromMask(Full256<T>(), mask);
}

// mask ? 0 : no
template <typename T>
HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
  return AndNot(VecFromMask(Full256<T>(), mask), no);
}

template <typename T>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes,
                                     Vec256<T> no) {
  HWY_ASSERT(0);
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
  const Full256<T> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask256<T>{(v > zero).raw}, v, zero);
}

// ------------------------------ Mask logical

template <typename T>
HWY_API Mask256<T> Not(const Mask256<T> m) {
  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
}

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}
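
// Worked example (editorial note): for a uint32_t lane with shift count 5
// (binary 00101), ShiftLeft<27> moves bit 4 (value 16) of the count into the
// sign bit. The predicated steps then test the count bits worth 16, 8, 4, 2
// and 1 in turn: only the "4" and "1" tests see a set sign bit, so that lane
// is shifted by 4 and then by 1, a total of 5, while lanes with other counts
// take their own paths through the same five constant shifts.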

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T>
HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec256<T>{wasm_v128_load(aligned)};
}

template <typename T>
HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
                             const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// LoadU == Load.
template <typename T>
HWY_API Vec256<T> LoadU(Full256<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T>
HWY_API Vec256<T> LoadDup128(Full256<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store

template <typename T>
HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// StoreU == Store.
template <typename T>
HWY_API void StoreU(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                          T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <typename T>
HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// ------------------------------ Scatter (Store)

template <typename T, typename Offset>
HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);

  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  Store(offset, Full256<Offset>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, typename Index>
HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);

  alignas(32) Index index_lanes[32 / sizeof(T)];
  Store(index, Full256<Index>(), index_lanes);

  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (Load/Store)

template <typename T, typename Offset>
HWY_API Vec256<T> GatherOffset(const Full256<T> d, const T* HWY_RESTRICT base,
                               const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  Store(offset, Full256<Offset>(), offset_lanes);

  alignas(32) T lanes[32 / sizeof(T)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, typename Index>
HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
                              const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(32) Index index_lanes[32 / sizeof(T)];
  Store(index, Full256<Index>(), index_lanes);

  alignas(32) T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}
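
// Usage sketch (editorial illustration, not part of the original header):
// GatherIndex reads base[index[i]] into lane i. `GatherDemo` is a
// hypothetical name used only for illustration.
#if 0
HWY_ATTR Vec256<int32_t> GatherDemo(const int32_t* HWY_RESTRICT table) {
  const Full256<int32_t> d;
  // Lane i selects table[3 + i]: lanes 0, 1, 2, ... pick table[3],
  // table[4], table[5], ...
  const auto idx = Iota(d, 3);
  return GatherIndex(d, table, idx);
}
#endif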

// ================================================== SWIZZLE

// ------------------------------ Extract lane

// Gets the single value stored in a vector/part.
HWY_API uint8_t GetLane(const Vec256<uint8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
HWY_API int8_t GetLane(const Vec256<int8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
HWY_API uint16_t GetLane(const Vec256<uint16_t> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
HWY_API int16_t GetLane(const Vec256<int16_t> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
HWY_API uint32_t GetLane(const Vec256<uint32_t> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
HWY_API int32_t GetLane(const Vec256<int32_t> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
HWY_API uint64_t GetLane(const Vec256<uint64_t> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}
HWY_API int64_t GetLane(const Vec256<int64_t> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}

HWY_API float GetLane(const Vec256<float> v) {
  return wasm_f32x4_extract_lane(v.raw, 0);
}

// ------------------------------ LowerHalf

template <typename T>
HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, const Vec256<T> v) {
  return Vec128<T>{v.raw};
}

template <typename T>
HWY_API Vec128<T> LowerHalf(const Vec256<T> v) {
  return LowerHalf(Full128<T>(), v);
}

// ------------------------------ ShiftLeftBytes

// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
  switch (kBytes) {
    case 0:
      return v;

    case 1:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                          7, 8, 9, 10, 11, 12, 13, 14)};

    case 2:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                          6, 7, 8, 9, 10, 11, 12, 13)};

    case 3:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                          4, 5, 6, 7, 8, 9, 10, 11, 12)};

    case 4:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                          3, 4, 5, 6, 7, 8, 9, 10, 11)};

    case 5:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                          2, 3, 4, 5, 6, 7, 8, 9, 10)};

    case 6:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};

    case 7:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};

    case 8:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};

    case 9:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};

    case 10:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};

    case 11:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};

    case 12:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};

    case 13:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};

    case 14:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 0,
                                          1)};

    case 15:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 16,
                                          0)};
  }
  return Vec256<T>{zero};
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Vec256<T> v) {
  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
}
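
// Worked example (editorial note): with byte lanes 0x01..0x0F in ascending
// positions, ShiftLeftBytes<1> inserts a zero at byte 0 and moves every byte
// up one position, matching the "0x01..0F, kBytes = 1 => 0x02..0F00" comment
// above: the case-1 shuffle selects index 16 (a lane of the zero vector)
// first, then bytes 0..14 of the input.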

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
}

// ------------------------------ ShiftRightBytes
namespace detail {

// Helper function allows zeroing invalid lanes in caller.
template <int kBytes, typename T>
HWY_API __i8x16 ShrBytes(const Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  switch (kBytes) {
    case 0:
      return v.raw;

    case 1:
      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                12, 13, 14, 15, 16);

    case 2:
      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16);

    case 3:
      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16, 16);

    case 4:
      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                14, 15, 16, 16, 16, 16);

    case 5:
      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 16, 16, 16, 16);

    case 6:
      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16);

    case 7:
      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16, 16);

    case 8:
      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 9:
      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 10:
      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 11:
      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 12:
      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 13:
      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 14:
      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 15:
      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 16:
      return zero;
  }
}

}  // namespace detail

// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
  return Vec256<T>{detail::ShrBytes<kBytes>(v)};
}

// ------------------------------ ShiftRightLanes
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full input: copy hi into lo (smaller instruction encoding than shifts).
template <typename T>
HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, const Vec256<T> v) {
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
HWY_API Vec128<float> UpperHalf(Full128<float> /* tag */,
                                const Vec256<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, class V = Vec256<T>>
HWY_API V CombineShiftRightBytes(Full256<T> /* tag */, V hi, V lo) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  switch (kBytes) {
    case 0:
      return lo;

    case 1:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16)};

    case 2:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16, 17)};

    case 3:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                  12, 13, 14, 15, 16, 17, 18)};

    case 4:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                  13, 14, 15, 16, 17, 18, 19)};

    case 5:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20)};

    case 6:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20, 21)};

    case 7:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
                                  15, 16, 17, 18, 19, 20, 21, 22)};

    case 8:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
                                  16, 17, 18, 19, 20, 21, 22, 23)};

    case 9:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24)};

    case 10:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24, 25)};

    case 11:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
                                  18, 19, 20, 21, 22, 23, 24, 25, 26)};

    case 12:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
                                  19, 20, 21, 22, 23, 24, 25, 26, 27)};

    case 13:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
                                  20, 21, 22, 23, 24, 25, 26, 27, 28)};

    case 14:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
                                  21, 22, 23, 24, 25, 26, 27, 28, 29)};

    case 15:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
                                  22, 23, 24, 25, 26, 27, 28, 29, 30)};
  }
  return hi;
}

// ------------------------------ Broadcast/splat any lane

// Unsigned
template <int kLane>
HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec256<uint16_t>{wasm_i16x8_shuffle(
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane>
HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<uint32_t>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// Signed
template <int kLane>
HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec256<int16_t>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                            kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane>
HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<int32_t>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// Float
template <int kLane>
HWY_API Vec256<float> Broadcast(const Vec256<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<float>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// ------------------------------ TableLookupBytes

// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
// lane indices in [0, 16).
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
                                    const Vec256<TI> from) {
// Not yet available in all engines, see
// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
// V8 implementation of this had a bug, fixed on 2021-04-03:
// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
#if 0
  return Vec256<TI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
#else
  alignas(32) uint8_t control[16];
  alignas(32) uint8_t input[16];
  alignas(32) uint8_t output[16];
  wasm_v128_store(control, from.raw);
  wasm_v128_store(input, bytes.raw);
  for (size_t i = 0; i < 16; ++i) {
    output[i] = control[i] < 16 ? input[control[i]] : 0;
  }
  return Vec256<TI>{wasm_v128_load(output)};
#endif
}
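
// Example (editorial sketch, not part of the original header): byte indices
// select arbitrary bytes, so a descending index vector reverses the bytes of
// the (128-bit) payload. `ReverseBytesDemo` is a hypothetical name.
#if 0
HWY_ATTR Vec256<uint8_t> ReverseBytesDemo(Vec256<uint8_t> bytes) {
  const Full256<uint8_t> d;
  // Indices 15, 14, ..., 0: lane i of the result is bytes[15 - i].
  const auto idx = Set(d, uint8_t{15}) - Iota(d, 0);
  return TableLookupBytes(bytes, idx);
}
#endif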

template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytesOr0(const Vec256<T> bytes,
                                       const Vec256<TI> from) {
  const Full256<TI> d;
  // Mask size must match vector type, so cast everything to this type.
  const Repartition<int8_t, decltype(d)> di8;
  const Repartition<int8_t, Full256<T>> d_bytes8;
  const auto msb = BitCast(di8, from) < Zero(di8);
  const auto lookup =
      TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
  return BitCast(d, IfThenZeroElse(msb, lookup));
}

// ------------------------------ Hard-coded shuffles

// Notation: let Vec256<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
HWY_API Vec256<uint32_t> Shuffle2301(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec256<int32_t> Shuffle2301(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
  return Vec256<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}

// Swap 64-bit halves
HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
  return Vec256<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}

// Rotate right 32 bits
HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
  return Vec256<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
// Rotate left 32 bits
HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
  return Vec256<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}

// Reverse
HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
  return Vec256<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices256 {
  __v128_u raw;
};

template <typename T, typename TI>
HWY_API Indices256<T> IndicesFromVec(Full256<T> /* tag */,
                                     Vec256<TI> /* vec */) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
  return Indices256<T>{};
}

template <typename T, typename TI>
HWY_API Indices256<T> SetTableIndices(Full256<T> d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  using TI = MakeSigned<T>;
  const Full256<T> d;
  const Full256<TI> di;
  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec256<TI>{idx.raw}));
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle01(v);
}

// Four lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle0123(v);
}

// 16-bit
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}

// ------------------------------ Reverse2

template <typename T>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ Reverse4

template <typename T>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ Reverse8

template <typename T>
HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ InterleaveLower

HWY_API Vec256<uint8_t> InterleaveLower(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18,
                                            3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec256<uint16_t> InterleaveLower(Vec256<uint16_t> a,
                                         Vec256<uint16_t> b) {
  return Vec256<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
HWY_API Vec256<uint32_t> InterleaveLower(Vec256<uint32_t> a,
                                         Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
HWY_API Vec256<uint64_t> InterleaveLower(Vec256<uint64_t> a,
                                         Vec256<uint64_t> b) {
  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

HWY_API Vec256<int8_t> InterleaveLower(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3,
                                           19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec256<int16_t> InterleaveLower(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
HWY_API Vec256<int32_t> InterleaveLower(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
HWY_API Vec256<int64_t> InterleaveLower(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}

// Additional overload for the optional tag.
template <typename T, class V = Vec256<T>>
HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {

HWY_API Vec256<uint8_t> InterleaveUpper(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                            11, 27, 12, 28, 13, 29, 14, 30, 15,
                                            31)};
}
HWY_API Vec256<uint16_t> InterleaveUpper(Vec256<uint16_t> a,
                                         Vec256<uint16_t> b) {
  return Vec256<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
HWY_API Vec256<uint32_t> InterleaveUpper(Vec256<uint32_t> a,
                                         Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
HWY_API Vec256<uint64_t> InterleaveUpper(Vec256<uint64_t> a,
                                         Vec256<uint64_t> b) {
  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

HWY_API Vec256<int8_t> InterleaveUpper(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                           11, 27, 12, 28, 13, 29, 14, 30, 15,
                                           31)};
}
HWY_API Vec256<int16_t> InterleaveUpper(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
HWY_API Vec256<int32_t> InterleaveUpper(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
HWY_API Vec256<int64_t> InterleaveUpper(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

HWY_API Vec256<float> InterleaveUpper(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}

}  // namespace detail

template <typename T, class V = Vec256<T>>
HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
  return detail::InterleaveUpper(a, b);
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, class DW = RepartitionToWide<Full256<T>>>
HWY_API VFromD<DW> ZipLower(Vec256<T> a, Vec256<T> b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}
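
// Example (editorial sketch, not part of the original header): zipping two
// uint8_t vectors yields uint16_t lanes whose low byte comes from `a` and
// high byte from `b`, i.e. lane i == a[i] | (b[i] << 8) on this
// little-endian target. `ZipDemo` is a hypothetical name.
#if 0
HWY_ATTR VFromD<RepartitionToWide<Full256<uint8_t>>> ZipDemo(
    Vec256<uint8_t> a, Vec256<uint8_t> b) {
  const RepartitionToWide<Full256<uint8_t>> dw;  // uint16_t lanes
  return ZipLower(dw, a, b);
}
#endif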

// ================================================== COMBINE

// ------------------------------ Combine (InterleaveLower)

// N = N/2 + N/2 (upper half undefined)
template <typename T>
HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi_half, Vec128<T> lo_half) {
  const Half<decltype(d)> d2;
  const RebindToUnsigned<decltype(d2)> du2;
  // Treat half-width input as one lane, and expand to two lanes.
  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
  const VU lo{BitCast(du2, lo_half).raw};
  const VU hi{BitCast(du2, hi_half).raw};
  return BitCast(d, InterleaveLower(lo, hi));
}

// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Full256<T> d, Vec128<T> lo) {
  return IfThenElseZero(FirstN(d, 16 / sizeof(T)), Vec256<T>{lo.raw});
}
1953 
1954 // ------------------------------ ConcatLowerLower
1955 
1956 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
1957 template <typename T>
1959  const Vec256<T> lo) {
1960  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
1961 }
1962 
1963 // ------------------------------ ConcatUpperUpper
1964 
1965 template <typename T>
1967  const Vec256<T> lo) {
1968  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
1969 }
1970 
1971 // ------------------------------ ConcatLowerUpper
1972 
1973 template <typename T>
1975  const Vec256<T> lo) {
1976  return CombineShiftRightBytes<8>(d, hi, lo);
1977 }
1978 
1979 // ------------------------------ ConcatUpperLower
1980 template <typename T>
1982  const Vec256<T> lo) {
1983  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
1984 }
1985 
1986 // ------------------------------ ConcatOdd
1987 
1988 // 32-bit
1989 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1991  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
1992 }
1993 
1994 // 64-bit full - no partial because we need at least two inputs to have
1995 // even/odd.
1996 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1997 HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
1998  return InterleaveUpper(Full256<T>(), lo, hi);
1999 }
2000 
2001 // ------------------------------ ConcatEven (InterleaveLower)
2002 
2003 // 32-bit full
2004 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2005 HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
2006  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2007 }
2008 
2009 // 64-bit full - no partial because we need at least two inputs to have
2010 // even/odd.
2011 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2012 HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
2013  return InterleaveLower(Full256<T>(), lo, hi);
2014 }
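// Lane-level model of ConcatOdd/ConcatEven for 32-bit lanes (a sketch of
// the shuffles above): the odd- or even-indexed lanes of lo, then of hi.
static inline void ConcatOdd4(const int32_t hi[4], const int32_t lo[4],
                              int32_t out[4]) {
  out[0] = lo[1]; out[1] = lo[3];  // odd lanes of lo
  out[2] = hi[1]; out[3] = hi[3];  // odd lanes of hi
}
static inline void ConcatEven4(const int32_t hi[4], const int32_t lo[4],
                               int32_t out[4]) {
  out[0] = lo[0]; out[1] = lo[2];  // even lanes of lo
  out[2] = hi[0]; out[3] = hi[2];  // even lanes of hi
}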
2015 
2016 // ------------------------------ DupEven
2017 template <typename T>
2018 HWY_API Vec256<T> DupEven(Vec256<T> v) {
2019  HWY_ASSERT(0);
2020 }
2021 
2022 // ------------------------------ DupOdd
2023 template <typename T>
2024 HWY_API Vec256<T> DupOdd(Vec256<T> v) {
2025  HWY_ASSERT(0);
2026 }
2027 
2028 // ------------------------------ OddEven
2029 
2030 namespace detail {
2031 
2032 template <typename T>
2033 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
2034  const Vec256<T> b) {
2035  const Full256<T> d;
2036  const Repartition<uint8_t, decltype(d)> d8;
2037  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2038  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2039  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
2040 }
2041 template <typename T>
2042 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
2043  const Vec256<T> b) {
2044  return Vec256<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2045 }
2046 template <typename T>
2047 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
2048  const Vec256<T> b) {
2049  return Vec256<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2050 }
2051 template <typename T>
2052 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
2053  const Vec256<T> b) {
2054  return Vec256<T>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
2055 }
2056 
2057 } // namespace detail
2058 
2059 template <typename T>
2060 HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
2061  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
2062 }
2063 HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
2064  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2065 }
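// Lane-level model of OddEven for 32-bit lanes (sketch): odd-indexed
// lanes are taken from a, even-indexed lanes from b.
static inline void OddEven4(const int32_t a[4], const int32_t b[4],
                            int32_t out[4]) {
  out[0] = b[0];
  out[1] = a[1];
  out[2] = b[2];
  out[3] = a[3];
}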
2066 
2067 // ------------------------------ OddEvenBlocks
2068 template <typename T>
2069 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> /* odd */, Vec256<T> even) {
2070  return even;
2071 }
2072 
2073 // ------------------------------ SwapAdjacentBlocks
2074 
2075 template <typename T>
2076 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
2077  return v;
2078 }
2079 
2080 // ------------------------------ ReverseBlocks
2081 
2082 template <typename T>
2083 HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, const Vec256<T> v) {
2084  return v;
2085 }
2086 
2087 // ================================================== CONVERT
2088 
2089 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2090 
2091 // Unsigned: zero-extend.
2092 HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
2093  const Vec128<uint8_t> v) {
2094  return Vec256<uint16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
2095 }
2096 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
2097  const Vec128<uint8_t> v) {
2098  return Vec256<uint32_t>{
2099  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2100 }
2101 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
2102  const Vec128<uint8_t> v) {
2103  return Vec256<int16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
2104 }
2105 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2106  const Vec128<uint8_t> v) {
2107  return Vec256<int32_t>{
2108  wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2109 }
2110 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
2111  const Vec128<uint16_t> v) {
2112  return Vec256<uint32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
2113 }
2114 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2115  const Vec128<uint16_t> v) {
2116  return Vec256<int32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
2117 }
2118 
2119 // Signed: replicate sign bit.
2120 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
2121  const Vec128<int8_t> v) {
2122  return Vec256<int16_t>{wasm_i16x8_extend_low_i8x16(v.raw)};
2123 }
2124 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2125  const Vec128<int8_t> v) {
2126  return Vec256<int32_t>{
2127  wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
2128 }
2129 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
2130  const Vec128<int16_t> v) {
2131  return Vec256<int32_t>{wasm_i32x4_extend_low_i16x8(v.raw)};
2132 }
2133 
2134 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
2135  const Vec128<int32_t> v) {
2136  return Vec256<double>{wasm_f64x2_convert_low_i32x4(v.raw)};
2137 }
2138 
2139 HWY_API Vec256<float> PromoteTo(Full256<float> /* tag */,
2140  const Vec128<float16_t> v) {
2141  const Full256<int32_t> di32;
2142  const Full256<uint32_t> du32;
2143  const Full256<float> df32;
2144  // Expand to u32 so we can shift.
2145  const auto bits16 = PromoteTo(du32, Vec256<uint16_t>{v.raw});
2146  const auto sign = ShiftRight<15>(bits16);
2147  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2148  const auto mantissa = bits16 & Set(du32, 0x3FF);
2149  const auto subnormal =
2150  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2151  Set(df32, 1.0f / 16384 / 1024));
2152 
2153  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2154  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2155  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2156  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2157  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2158 }
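// Scalar model of the lane-wise float16 -> float32 conversion above (a
// sketch; like the SIMD code, Inf/NaN inputs are not special-cased).
static inline float F32FromF16Bits(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  uint32_t bits32;
  if (biased_exp == 0) {  // zero or subnormal: value is mantissa * 2^-24
    const float sub = static_cast<float>(mantissa) * (1.0f / 16384 / 1024);
    CopyBytes<4>(&sub, &bits32);
  } else {  // normal: rebias the exponent, widen the mantissa to 23 bits
    bits32 = ((biased_exp + (127 - 15)) << 23) | (mantissa << (23 - 10));
  }
  bits32 |= sign << 31;
  float result;
  CopyBytes<4>(&bits32, &result);
  return result;
}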
2159 
2160 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
2161  const Vec128<bfloat16_t> v) {
2162  const Rebind<uint16_t, decltype(df32)> du16;
2163  const RebindToSigned<decltype(df32)> di32;
2164  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2165 }
2166 
2167 // ------------------------------ Demotions (full -> part w/ narrow lanes)
2168 
2169 HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
2170  const Vec256<int32_t> v) {
2171  return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
2172 }
2173 
2174 HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
2175  const Vec256<int32_t> v) {
2176  return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
2177 }
2178 
2179 HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
2180  const Vec256<int32_t> v) {
2181  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2182  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2183 }
2184 
2185 HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
2186  const Vec256<int16_t> v) {
2187  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
2188 }
2189 
2190 HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
2191  const Vec256<int32_t> v) {
2192  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2193  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2194 }
2195 
2196 HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
2197  const Vec256<int16_t> v) {
2198  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
2199 }
2200 
2201 HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
2202  const Vec256<double> v) {
2203  return Vec128<int32_t>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
2204 }
2205 
2206 HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
2207  const Vec256<float> v) {
2208  const Full256<int32_t> di;
2209  const Full256<uint32_t> du;
2210  const Full256<uint16_t> du16;
2211  const auto bits32 = BitCast(du, v);
2212  const auto sign = ShiftRight<31>(bits32);
2213  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2214  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2215 
2216  const auto k15 = Set(di, 15);
2217  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2218  const auto is_tiny = exp < Set(di, -24);
2219 
2220  const auto is_subnormal = exp < Set(di, -14);
2221  const auto biased_exp16 =
2222  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2223  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2224  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2225  (mantissa32 >> (Set(du, 13) + sub_exp));
2226  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2227  ShiftRight<13>(mantissa32)); // <1024
2228 
2229  const auto sign16 = ShiftLeft<15>(sign);
2230  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2231  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2232  return Vec128<float16_t>{DemoteTo(du16, bits16).raw};
2233 }
2234 
2235 HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
2236  const Vec256<float> v) {
2237  const Rebind<int32_t, decltype(dbf16)> di32;
2238  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2239  const Rebind<uint16_t, decltype(dbf16)> du16;
2240  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2241  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2242 }
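// The bfloat16 demotion above is plain truncation: a bfloat16 holds the
// upper 16 bits of the binary32 pattern. Scalar sketch:
static inline uint16_t BF16BitsFromF32(float f) {
  uint32_t bits32;
  CopyBytes<4>(&f, &bits32);
  return static_cast<uint16_t>(bits32 >> 16);  // drops 16 mantissa bits
}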
2243 
2244 HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
2245  Vec256<float> a, Vec256<float> b) {
2246  const RebindToUnsigned<decltype(dbf16)> du16;
2247  const Repartition<uint32_t, decltype(dbf16)> du32;
2248  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
2249  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2250 }
2251 
2252 // For already range-limited input [0, 255].
2253 HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
2254  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2255  return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2256 }
2257 
2258 // ------------------------------ Convert i32 <=> f32 (Round)
2259 
2260 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
2261  const Vec256<int32_t> v) {
2262  return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
2263 }
2264 // Truncates (rounds toward zero).
2265 HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
2266  const Vec256<float> v) {
2267  return Vec256<int32_t>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
2268 }
2269 
2270 HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
2271  return ConvertTo(Full256<int32_t>(), Round(v));
2272 }
2273 
2274 // ================================================== MISC
2275 
2276 // ------------------------------ LoadMaskBits (TestBit)
2277 
2278 namespace detail {
2279 
2280 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2281 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2282  const RebindToUnsigned<decltype(d)> du;
2283  // Easier than Set(), which would require a wider-than-8-bit lane type,
2284  // which would not compile for T=uint8_t.
2285  const Vec256<T> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
2286 
2287  // Replicate bytes 8x such that each byte contains the bit that governs it.
2288  alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2289  1, 1, 1, 1, 1, 1, 1, 1};
2290  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
2291 
2292  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2293  1, 2, 4, 8, 16, 32, 64, 128};
2294  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
2295 }
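// Scalar model of the 8-bit path above (sketch): mask lane i is all-ones
// iff bit i of `bits` is set. The SIMD version replicates each input byte
// across the 8 lanes it governs, then tests one bit per lane.
static inline uint8_t MaskByteFromBits(uint64_t bits, size_t i) {
  return ((bits >> i) & 1) ? 0xFF : 0x00;
}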
2296 
2297 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2298 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2299  const RebindToUnsigned<decltype(d)> du;
2300  alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2301  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2302 }
2303 
2304 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2305 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2306  const RebindToUnsigned<decltype(d)> du;
2307  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2308  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2309 }
2310 
2311 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2312 HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2313  const RebindToUnsigned<decltype(d)> du;
2314  alignas(32) constexpr uint64_t kBit[8] = {1, 2};
2315  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2316 }
2317 
2318 } // namespace detail
2319 
2320 // `bits` points to at least 8 readable bytes, not all of which need be valid.
2321 template <typename T>
2322 HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
2323  const uint8_t* HWY_RESTRICT bits) {
2324  uint64_t mask_bits = 0;
2325  CopyBytes<(32 / sizeof(T) + 7) / 8>(bits, &mask_bits);
2326  return detail::LoadMaskBits(d, mask_bits);
2327 }
2328 
2329 // ------------------------------ Mask
2330 
2331 namespace detail {
2332 
2333 // Full
2334 template <typename T>
2335 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
2336  const Mask128<T> mask) {
2337  alignas(32) uint64_t lanes[2];
2338  wasm_v128_store(lanes, mask.raw);
2339 
2340  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2341  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2342  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2343  return (hi + lo);
2344 }
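// Standalone model of the multiply trick above (sketch): `bytes` holds
// eight lanes that are each 0x00 or 0xFF; multiplying by kMagic funnels
// one bit per lane into the top byte, yielding the 8-bit movemask.
static inline uint64_t Movemask8FromBytes(uint64_t bytes) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (bytes * kMagic) >> 56;
}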
2345 
2346 template <typename T>
2347 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
2348  const Mask256<T> mask) {
2349  // Remove useless lower half of each u16 while preserving the sign bit.
2350  const __i16x8 zero = wasm_i16x8_splat(0);
2351  const Mask256<uint8_t> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
2352  return BitsFromMask(hwy::SizeTag<1>(), mask8);
2353 }
2354 
2355 template <typename T>
2356 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
2357  const Mask256<T> mask) {
2358  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
2359  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2360  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2361  alignas(32) uint32_t lanes[4];
2362  wasm_v128_store(lanes, sliced_mask);
2363  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2364 }
2365 
2366 // Returns 0xFF for bytes with index >= N, otherwise 0.
2367 template <size_t N> constexpr __i8x16 BytesAbove() {
2368  return
2369  (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
2370  : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
2371  : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
2372  : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
2373  : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
2374  : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
2375  : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
2376  : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
2377  : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
2378  : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2379  -1, -1, -1, -1, -1)
2380  : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2381  -1, -1, -1, -1)
2382  : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
2383  -1, -1, -1, -1)
2384  : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
2385  -1, -1, -1)
2386  : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
2387  -1, -1, -1)
2388  : (N == 11)
2389  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
2390  : (N == 13)
2391  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
2392  : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
2393 }
2394 
2395 template <typename T>
2396 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
2397  return BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask);
2398 }
2399 
2400 template <typename T>
2401 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
2402  return PopCount(BitsFromMask(tag, m));
2403 }
2404 
2405 template <typename T>
2406 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
2407  return PopCount(BitsFromMask(tag, m));
2408 }
2409 
2410 template <typename T>
2411 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
2412  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
2413  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
2414  alignas(32) uint64_t lanes[2];
2415  wasm_v128_store(lanes, shifted_bits);
2416  return PopCount(lanes[0] | lanes[1]);
2417 }
2418 
2419 } // namespace detail
2420 
2421 // `bits` points to at least 8 writable bytes.
2422 template <typename T>
2423 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
2424  uint8_t* bits) {
2425  const uint64_t mask_bits = detail::BitsFromMask(mask);
2426  const size_t kNumBytes = (32 / sizeof(T) + 7) / 8;
2427  CopyBytes<kNumBytes>(&mask_bits, bits);
2428  return kNumBytes;
2429 }
2430 
2431 template <typename T>
2432 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask128<T> m) {
2433  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
2434 }
2435 
2436 template <typename T>
2437 HWY_API bool AllFalse(const Full256<T> d, const Mask128<T> m) {
2438 #if 0
2439  // Casting followed by wasm_i8x16_any_true results in wasm error:
2440  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
2441  const auto v8 = BitCast(Full256<int8_t>(), VecFromMask(d, m));
2442  return !wasm_i8x16_any_true(v8.raw);
2443 #else
2444  (void)d;
2445  return (wasm_i64x2_extract_lane(m.raw, 0) |
2446  wasm_i64x2_extract_lane(m.raw, 1)) == 0;
2447 #endif
2448 }
2449 
2450 // Full vector
2451 namespace detail {
2452 template <typename T>
2453 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
2454  return wasm_i8x16_all_true(m.raw);
2455 }
2456 template <typename T>
2457 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
2458  return wasm_i16x8_all_true(m.raw);
2459 }
2460 template <typename T>
2461 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
2462  return wasm_i32x4_all_true(m.raw);
2463 }
2464 
2465 } // namespace detail
2466 
2467 template <typename T>
2468 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask128<T> m) {
2469  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
2470 }
2471 
2472 template <typename T>
2473 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
2474  const Mask256<T> mask) {
2475  const uint64_t bits = detail::BitsFromMask(mask);
2476  return bits ? Num0BitsBelowLS1Bit_Nonzero64(bits) : -1;
2477 }
2478 
2479 // ------------------------------ Compress
2480 
2481 namespace detail {
2482 
2483 template <typename T>
2484 HWY_INLINE Vec256<T> Idx16x8FromBits(const uint64_t mask_bits) {
2485  HWY_DASSERT(mask_bits < 256);
2486  const Full256<T> d;
2487  const Rebind<uint8_t, decltype(d)> d8;
2488  const Full256<uint16_t> du;
2489 
2490  // We need byte indices for TableLookupBytes (one vector's worth for each of
2491  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
2492  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
2493  // with the doubling baked into the table. Unpacking nibbles is likely more
2494  // costly than the higher cache footprint from storing bytes.
2495  alignas(32) constexpr uint8_t table[256 * 8] = {
2496  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
2497  0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
2498  0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
2499  0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
2500  0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
2501  6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
2502  0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
2503  0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2504  2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
2505  0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
2506  0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
2507  0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
2508  0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
2509  6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
2510  8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
2511  0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
2512  4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
2513  10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
2514  0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
2515  0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
2516  0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
2517  4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
2518  0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
2519  0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2520  2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
2521  10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
2522  0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
2523  0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
2524  0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
2525  0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
2526  0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
2527  0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
2528  6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
2529  12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
2530  0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
2531  0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
2532  0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
2533  8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
2534  0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
2535  0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2536  2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
2537  8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
2538  12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
2539  0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
2540  0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
2541  10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
2542  12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
2543  0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
2544  4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
2545  6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
2546  0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
2547  0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
2548  0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
2549  4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
2550  12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
2551  0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2552  2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
2553  0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
2554  0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
2555  0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
2556  0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
2557  14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
2558  0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
2559  0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
2560  8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
2561  14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
2562  0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
2563  0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
2564  0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
2565  6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
2566  14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
2567  0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2568  2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
2569  14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
2570  0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
2571  0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
2572  0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
2573  6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
2574  10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
2575  0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
2576  4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
2577  8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
2578  0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
2579  0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
2580  0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
2581  4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
2582  0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
2583  0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2584  2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
2585  14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
2586  0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
2587  0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
2588  0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
2589  12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
2590  14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
2591  0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
2592  6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
2593  8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
2594  14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
2595  0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
2596  0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
2597  10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
2598  14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
2599  0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2600  2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
2601  10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
2602  12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
2603  0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
2604  0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
2605  8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
2606  10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
2607  0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
2608  4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
2609  6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
2610 
2611  const Vec256<uint8_t> byte_idx{Load(d8, table + mask_bits * 8).raw};
2612  const Vec256<uint16_t> pairs = ZipLower(byte_idx, byte_idx);
2613  return BitCast(d, pairs + Set(du, 0x0100));
2614 }
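// How one 8-byte row of `table` above can be derived (a sketch of its
// construction): list the doubled lane indices of the set mask bits in
// ascending order, then pad with zeros.
static inline void CompressRowFromMask(uint64_t mask_bits, uint8_t row[8]) {
  size_t pos = 0;
  for (uint8_t lane = 0; lane < 8; ++lane) {
    if ((mask_bits >> lane) & 1) {
      row[pos++] = static_cast<uint8_t>(2 * lane);  // doubling baked in
    }
  }
  while (pos < 8) row[pos++] = 0;  // padding, as in the table
}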
2615 
2616 template <typename T>
2617 HWY_INLINE Vec256<T> Idx32x4FromBits(const uint64_t mask_bits) {
2618  HWY_DASSERT(mask_bits < 16);
2619 
2620  // There are only 4 lanes, so we can afford to load the index vector directly.
2621  alignas(32) constexpr uint8_t packed_array[16 * 16] = {
2622  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2623  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2624  4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2625  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
2626  8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2627  0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2628  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2629  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
2630  12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2631  0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2632  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2633  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
2634  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2635  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2636  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2637  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2638 
2639  const Full256<T> d;
2640  const Repartition<uint8_t, decltype(d)> d8;
2641  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2642 }
2643 
2644 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2645 
2646 template <typename T>
2647 HWY_INLINE Vec256<T> Idx64x2FromBits(const uint64_t mask_bits) {
2648  HWY_DASSERT(mask_bits < 4);
2649 
2650  // There are only 2 lanes, so we can afford to load the index vector directly.
2651  alignas(32) constexpr uint8_t packed_array[4 * 16] = {
2652  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
2653  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
2654  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
2655  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2656 
2657  const Full256<T> d;
2658  const Repartition<uint8_t, decltype(d)> d8;
2659  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2660 }
2661 
2662 #endif
2663 
2664 // Helper functions called by both Compress and CompressStore - avoids a
2665 // redundant BitsFromMask in the latter.
2666 
2667 template <typename T>
2668 HWY_INLINE Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
2669  const uint64_t mask_bits) {
2670  const auto idx = detail::Idx16x8FromBits<T>(mask_bits);
2671  using D = Full256<T>;
2672  const RebindToSigned<D> di;
2673  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2674 }
2675 
2676 template <typename T>
2677 HWY_INLINE Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
2678  const uint64_t mask_bits) {
2679  const auto idx = detail::Idx32x4FromBits<T>(mask_bits);
2680  using D = Full256<T>;
2681  const RebindToSigned<D> di;
2682  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2683 }
2684 
2685 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2686 
2687 template <typename T>
2688 HWY_INLINE Vec256<T> Compress(hwy::SizeTag<8> /*tag*/,
2689  Vec256<T> v,
2690  const uint64_t mask_bits) {
2691  const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);
2692  using D = Full256<T>;
2693  const RebindToSigned<D> di;
2694  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2695 }
2696 
2697 #endif
2698 
2699 } // namespace detail
2700 
2701 template <typename T>
2702 struct CompressIsPartition {
2703  enum { value = 1 };
2704 };
2705 
2706 template <typename T>
2707 HWY_API Vec256<T> Compress(const Vec256<T> v, const Mask256<T> mask) {
2708  const uint64_t mask_bits = detail::BitsFromMask(mask);
2709  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2710 }
2711 
2712 // ------------------------------ CompressBits
2713 
2714 template <typename T>
2715 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
2716  uint64_t mask_bits = 0;
2717  constexpr size_t N = 32 / sizeof(T), kNumBytes = (N + 7) / 8;
2718  CopyBytes<kNumBytes>(bits, &mask_bits);
2719  if (N < 8) {
2720  mask_bits &= (1ull << N) - 1;
2721  }
2722 
2723  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2724 }
2725 
2726 // ------------------------------ CompressStore
2727 template <typename T>
2728 HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
2729  T* HWY_RESTRICT unaligned) {
2730  const uint64_t mask_bits = detail::BitsFromMask(mask);
2731  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2732  StoreU(c, d, unaligned);
2733  return PopCount(mask_bits);
2734 }
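// Usage sketch (assumes the usual Highway include/namespace setup and
// that `in` and `out` each hold at least 8 lanes): pack the strictly
// positive lanes to the front of `out` and return how many were written.
static inline size_t StorePositiveI32(const int32_t* in, int32_t* out) {
  const Full256<int32_t> d;
  const Vec256<int32_t> v = LoadU(d, in);
  return CompressStore(v, Zero(d) < v, d, out);
}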
2735 
2736 // ------------------------------ CompressBlendedStore
2737 template <typename T>
2738 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2739  T* HWY_RESTRICT unaligned) {
2740  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
2741  using TU = TFromD<decltype(du)>;
2742  const uint64_t mask_bits = detail::BitsFromMask(m);
2743  const size_t count = PopCount(mask_bits);
2744  const Mask256<TU> store_mask = FirstN(du, count);
2745  const Vec256<TU> compressed =
2746  detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
2747  const Vec256<TU> prev = BitCast(du, LoadU(d, unaligned));
2748  StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned);
2749  return count;
2750 }
2751 
2752 // ------------------------------ CompressBitsStore
2753 
2754 template <typename T>
2755 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
2756  Full256<T> d, T* HWY_RESTRICT unaligned) {
2757  uint64_t mask_bits = 0;
2758  constexpr size_t N = 32 / sizeof(T), kNumBytes = (N + 7) / 8;
2759  CopyBytes<kNumBytes>(bits, &mask_bits);
2760  if (N < 8) {
2761  mask_bits &= (1ull << N) - 1;
2762  }
2763 
2764  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2765  StoreU(c, d, unaligned);
2766  return PopCount(mask_bits);
2767 }
2768 
2769 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
2770 // TableLookupBytes)
2771 
2772 HWY_API void StoreInterleaved3(const Vec256<uint8_t> a, const Vec256<uint8_t> b,
2773  const Vec256<uint8_t> c, Full256<uint8_t> d,
2774  uint8_t* HWY_RESTRICT unaligned) {
2775  const auto k5 = Set(d, 5);
2776  const auto k6 = Set(d, 6);
2777 
2778  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
2779  // 0x80 so lanes to be filled from other vectors are 0 for blending.
2780  alignas(32) static constexpr uint8_t tbl_r0[16] = {
2781  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
2782  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
2783  alignas(32) static constexpr uint8_t tbl_g0[16] = {
2784  0x80, 0, 0x80, 0x80, 1, 0x80, //
2785  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
2786  const auto shuf_r0 = Load(d, tbl_r0);
2787  const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
2788  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
2789  const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
2790  const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
2791  const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
2792  const auto int0 = r0 | g0 | b0;
2793  StoreU(int0, d, unaligned + 0 * 16);
2794 
2795  // Second vector: g10,r10, bgr[9:6], b5,g5
2796  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
2797  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
2798  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
2799  const auto r1 = TableLookupBytes(a, shuf_r1);
2800  const auto g1 = TableLookupBytes(b, shuf_g1);
2801  const auto b1 = TableLookupBytes(c, shuf_b1);
2802  const auto int1 = r1 | g1 | b1;
2803  StoreU(int1, d, unaligned + 1 * 16);
2804 
2805  // Third vector: bgr[15:11], b10
2806  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
2807  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
2808  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
2809  const auto r2 = TableLookupBytes(a, shuf_r2);
2810  const auto g2 = TableLookupBytes(b, shuf_g2);
2811  const auto b2 = TableLookupBytes(c, shuf_b2);
2812  const auto int2 = r2 | g2 | b2;
2813  StoreU(int2, d, unaligned + 2 * 16);
2814 }
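// Byte-level model of the 48-byte result above (sketch):
// out[3*i+0] = a[i], out[3*i+1] = b[i], out[3*i+2] = c[i].
static inline void StoreInterleaved3Scalar(const uint8_t* a, const uint8_t* b,
                                           const uint8_t* c, uint8_t* out) {
  for (size_t i = 0; i < 16; ++i) {
    out[3 * i + 0] = a[i];
    out[3 * i + 1] = b[i];
    out[3 * i + 2] = c[i];
  }
}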
2815 
2816 // ------------------------------ StoreInterleaved4
2817 
2818 HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
2819  const Vec256<uint8_t> v1,
2820  const Vec256<uint8_t> v2,
2821  const Vec256<uint8_t> v3, Full256<uint8_t> d8,
2822  uint8_t* HWY_RESTRICT unaligned) {
2823  const RepartitionToWide<decltype(d8)> d16;
2824  const RepartitionToWide<decltype(d16)> d32;
2825  // let a,b,c,d denote v0..3.
2826  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
2827  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
2828  const auto ba8 = ZipUpper(d16, v0, v1);
2829  const auto dc8 = ZipUpper(d16, v2, v3);
2830  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
2831  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
2832  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
2833  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
2834  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
2835  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
2836  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
2837  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
2838 }
2839 
2840 // ------------------------------ MulEven/Odd (Load)
2841 
2842 HWY_API Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
2843  const Vec256<uint64_t> b) {
2844  alignas(32) uint64_t mul[2];
2845  mul[0] =
2846  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
2847  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
2848  return Load(Full256<uint64_t>(), mul);
2849 }
2850 
2851 HWY_API Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
2852  const Vec256<uint64_t> b) {
2853  alignas(32) uint64_t mul[2];
2854  mul[0] =
2855  Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
2856  static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
2857  return Load(Full256<uint64_t>(), mul);
2858 }
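// Each op above returns the full 128-bit product of one pair of u64 lanes
// as {low, high}. A model of Mul128 using the compiler's 128-bit integer
// extension (an assumption; the library routine is the portable form):
static inline void Mul64To128(uint64_t a, uint64_t b, uint64_t* lo,
                              uint64_t* hi) {
  const unsigned __int128 product = static_cast<unsigned __int128>(a) * b;
  *lo = static_cast<uint64_t>(product);
  *hi = static_cast<uint64_t>(product >> 64);
}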
2859 
2860 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2861 
2862 HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
2863  Vec256<bfloat16_t> a,
2864  Vec256<bfloat16_t> b,
2865  const Vec256<float> sum0,
2866  Vec256<float>& sum1) {
2867  const Repartition<uint16_t, decltype(df32)> du16;
2868  const RebindToUnsigned<decltype(df32)> du32;
2869  const Vec256<uint16_t> zero = Zero(du16);
2870  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
2871  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
2872  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
2873  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
2874  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2875  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2876 }
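// Per-lane model of the widening above (sketch): a bfloat16 becomes a
// float by placing its 16 bits in the upper half of the binary32 pattern,
// which is exactly what zipping with a zero vector achieves.
static inline float F32FromBF16Bits(uint16_t bits16) {
  const uint32_t bits32 = static_cast<uint32_t>(bits16) << 16;
  float f;
  CopyBytes<4>(&bits32, &f);
  return f;
}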
2877 
2878 // ------------------------------ Reductions
2879 
2880 namespace detail {
2881 
2882 // u32/i32/f32:
2883 
2884 template <typename T>
2885 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
2886  const Vec256<T> v3210) {
2887  const Vec256<T> v1032 = Shuffle1032(v3210);
2888  const Vec256<T> v31_20_31_20 = v3210 + v1032;
2889  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2890  return v20_31_20_31 + v31_20_31_20;
2891 }
2892 template <typename T>
2893 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
2894  const Vec256<T> v3210) {
2895  const Vec256<T> v1032 = Shuffle1032(v3210);
2896  const Vec256<T> v31_20_31_20 = Min(v3210, v1032);
2897  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2898  return Min(v20_31_20_31, v31_20_31_20);
2899 }
2900 template <typename T>
2901 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
2902  const Vec256<T> v3210) {
2903  const Vec256<T> v1032 = Shuffle1032(v3210);
2904  const Vec256<T> v31_20_31_20 = Max(v3210, v1032);
2905  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2906  return Max(v20_31_20_31, v31_20_31_20);
2907 }
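// The 4-lane reduction idiom above in scalar form (sketch): adding the
// swapped 64-bit halves and then a lane rotation leaves the total in
// every lane; only the final value is shown here.
static inline float SumOfLanes4Scalar(const float v[4]) {
  const float v20 = v[0] + v[2];  // after the Shuffle1032 add
  const float v31 = v[1] + v[3];
  return v20 + v31;               // after the Shuffle0321 add
}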
2908 
2909 // u64/i64/f64:
2910 
2911 template <typename T>
2912 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
2913  const Vec256<T> v10) {
2914  const Vec256<T> v01 = Shuffle01(v10);
2915  return v10 + v01;
2916 }
2917 template <typename T>
2918 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
2919  const Vec256<T> v10) {
2920  const Vec256<T> v01 = Shuffle01(v10);
2921  return Min(v10, v01);
2922 }
2923 template <typename T>
2924 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
2925  const Vec256<T> v10) {
2926  const Vec256<T> v01 = Shuffle01(v10);
2927  return Max(v10, v01);
2928 }
2929 
2930 // u16/i16
2931 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2932 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
2933  const Full256<int32_t> d32;
2934  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
2935  const auto odd = ShiftRight<16>(BitCast(d32, v));
2936  const auto min = MinOfLanes(d32, Min(even, odd));
2937  // Also broadcast into odd lanes.
2938  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
2939 }
2940 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2941 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
2942  const Full256<int32_t> d32;
2943  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
2944  const auto odd = ShiftRight<16>(BitCast(d32, v));
2945  const auto max = MaxOfLanes(d32, Max(even, odd));
2946  // Also broadcast into odd lanes.
2947  return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
2948 }
2949 
2950 } // namespace detail
2951 
2952 // Supported for u/i/f 32/64, plus 16-bit Min/Max. Returns the same value in
2953 // each lane.
2953 template <typename T>
2954 HWY_API Vec256<T> SumOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
2955  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2956 }
2957 template <typename T>
2958 HWY_API Vec256<T> MinOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
2959  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2960 }
2961 template <typename T>
2962 HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
2963  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2964 }
2965 
2966 // ------------------------------ Lt128
2967 
2968 template <typename T>
2969 HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b);
2970 
2971 template <typename T>
2972 HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b);
2973 
2974 template <typename T>
2975 HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b);
2976 
2977 // NOLINTNEXTLINE(google-readability-namespace-comments)
2978 } // namespace HWY_NAMESPACE
2979 } // namespace hwy
2980 
2981 HWY_AFTER_NAMESPACE();