// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit WASM vectors and operations. Experimental.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#include "hwy/ops/wasm_128-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// TODO(richardwinterton): add this to DeduceD in wasm_128 similar to x86_128.
template <typename T>
class Vec256 {
 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  Vec128<T> v0;
  Vec128<T> v1;
};

template <typename T>
struct Mask256 {
  Mask128<T> m0;
  Mask128<T> m1;
};

// ------------------------------ BitCast

template <typename T, typename FromT>
HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = BitCast(dh, v.v0);
  ret.v1 = BitCast(dh, v.v1);
  return ret;

  // TODO(richardwinterton): implement other ops like this
}
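
// Usage sketch (values are illustrative, not from the original file): each
// 128-bit half (v0/v1) is converted independently, so reinterpreting f32
// lanes as u32 yields their IEEE-754 bit patterns, e.g.:
//   const Full256<float> df;
//   const Full256<uint32_t> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // lanes hold 0x3F800000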

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T>
HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
  return Vec256<T>{wasm_i32x4_splat(0)};
}
HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
  return Vec256<float>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
  return Vec256<uint8_t>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
  return Vec256<uint16_t>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
  return Vec256<uint32_t>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
  return Vec256<uint64_t>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
  return Vec256<int8_t>{wasm_i8x16_splat(t)};
}
HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
  return Vec256<int16_t>{wasm_i16x8_splat(t)};
}
HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
  return Vec256<int32_t>{wasm_i32x4_splat(t)};
}
HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
  return Vec256<int64_t>{wasm_i64x2_splat(t)};
}

HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
  return Vec256<float>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T>
HWY_API Vec256<T> Undefined(Full256<T> d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, typename T2>
HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
  HWY_ALIGN T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
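
// Usage sketch (assumed values): Iota(Full256<int32_t>(), 5) fills the lanes
// with 5, 6, 7, ... in ascending order of lane index.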

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_add(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_add(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_add(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_add(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_add(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_add(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_sub(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_sub(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_sub(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_sub(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_sub(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_sub(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_sub(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
  HWY_ABORT("not implemented");
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
                                    const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
                                     const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_add_sat(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
                                    const Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
                                     const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
                                     const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_avgr(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
                                      const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_avgr(a.raw, b.raw)};
}
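
// Worked example of the rounding formula: for uint8_t lanes a = 1 and b = 2,
// AverageRound returns (1 + 2 + 1) / 2 = 2; the +1 rounds the .5 case upward.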

// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
  return Vec256<int8_t>{wasm_i8x16_abs(v.raw)};
}
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_abs(v.raw)};
}
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_abs(v.raw)};
}
HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
  return Vec256<int64_t>{wasm_i64x2_abs(v.raw)};
}

HWY_API Vec256<float> Abs(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_abs(v.raw)};
}
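
// Edge-case sketch: for int8_t, Abs of -128 remains -128 in that lane,
// because +128 (LimitsMax() + 1) is not representable and wraps.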

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits>
HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, kBits)};
}

// Signed
template <int kBits>
HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, kBits)};
}

// 8-bit
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
  const Full256<T> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<T> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits>
HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
  const Full256<uint8_t> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits>
HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
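
// Worked example of the XOR/subtract sign extension above, for kBits = 2 and
// an int8_t lane holding 0x80 (-128): the unsigned shift gives 0x20,
// shifted_sign is 0x80 >> 2 = 0x20, and (0x20 ^ 0x20) - 0x20 = -32, which
// matches an arithmetic shift of -128 by 2.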

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
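
// Usage sketch (assumed values): RotateRight<8> applied to uint32_t lanes of
// 0x11223344 yields 0x44112233 in every lane (low byte wraps to the top).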

// ------------------------------ Shift lanes by same variable #bits

// Unsigned
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
                                       const int bits) {
  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, bits)};
}
HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
                                        const int bits) {
  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, bits)};
}
HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
                                       const int bits) {
  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, bits)};
}
HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
                                        const int bits) {
  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, bits)};
}

// Signed
HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, bits)};
}
HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
                                       const int bits) {
  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, bits)};
}
HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, bits)};
}
HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
                                       const int bits) {
  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, bits)};
}

// 8-bit
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
  const Full256<T> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<T> shifted{ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}

HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
  const Full256<uint8_t> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256<uint8_t> shifted{
      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
  return shifted & Set(d8, 0xFF >> bits);
}

HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
  const Full256<int8_t> di;
  const Full256<uint8_t> du;
  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ------------------------------ Minimum

// Unsigned
HWY_API Vec256<uint8_t> Min(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_min(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_min(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_u32x4_min(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  // No u64x2 extract_lane intrinsic; extract as signed and cast.
  alignas(32) uint64_t min[2];
  min[0] = HWY_MIN(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)));
  min[1] = HWY_MIN(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)));
  return Vec256<uint64_t>{wasm_v128_load(min)};
}

// Signed
HWY_API Vec256<int8_t> Min(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_min(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Min(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_min(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Min(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_min(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Min(Vec256<int64_t> a, Vec256<int64_t> b) {
  alignas(32) int64_t min[2];
  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec256<int64_t>{wasm_v128_load(min)};
}

// Float
HWY_API Vec256<float> Min(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_min(a.raw, b.raw)};
}

// ------------------------------ Maximum

// Unsigned
HWY_API Vec256<uint8_t> Max(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_u8x16_max(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_u16x8_max(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_u32x4_max(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  // No u64x2 extract_lane intrinsic; extract as signed and cast.
  alignas(32) uint64_t max[2];
  max[0] = HWY_MAX(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)));
  max[1] = HWY_MAX(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
                   static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)));
  return Vec256<uint64_t>{wasm_v128_load(max)};
}

// Signed
HWY_API Vec256<int8_t> Max(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_max(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Max(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_max(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Max(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_max(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Max(Vec256<int64_t> a, Vec256<int64_t> b) {
  alignas(32) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec256<int64_t>{wasm_v128_load(max)};
}

// Float
HWY_API Vec256<float> Max(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_max(a.raw, b.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
HWY_API Vec256<uint16_t> operator*(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{wasm_i16x8_mul(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator*(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int16_t> operator*(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  return Vec256<int16_t>{wasm_i16x8_mul(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator*(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec256<uint16_t> MulHigh(const Vec256<uint16_t> a,
                                 const Vec256<uint16_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
                                const Vec256<int16_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
  const auto l = wasm_i32x4_mul(al, bl);
  const auto h = wasm_i32x4_mul(ah, bh);
  // TODO(eustas): shift-right + narrow?
  return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
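
// Worked example: for uint16_t lanes a = 300 and b = 400, a * b = 120000 =
// 0x1D4C0, so MulHigh returns the upper 16 bits, 0x0001.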

HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
  HWY_ASSERT(0);
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
                                const Vec256<int32_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec256<int64_t>{wasm_i64x2_mul(ae, be)};
}
HWY_API Vec256<uint64_t> MulEven(const Vec256<uint32_t> a,
                                 const Vec256<uint32_t> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec256<uint64_t>{wasm_i64x2_mul(ae, be)};
}
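
// Usage sketch (assumed values): with uint32_t lanes a = {2, 3, 5, 7} and
// b = {10, 20, 30, 40} (lane 0 first), MulEven multiplies lanes 0 and 2 only
// and returns the uint64_t lanes {20, 150}.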

// ------------------------------ Negate

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> Neg(const Vec256<T> v) {
  return Xor(v, SignBit(Full256<T>()));
}

HWY_API Vec256<int8_t> Neg(const Vec256<int8_t> v) {
  return Vec256<int8_t>{wasm_i8x16_neg(v.raw)};
}
HWY_API Vec256<int16_t> Neg(const Vec256<int16_t> v) {
  return Vec256<int16_t>{wasm_i16x8_neg(v.raw)};
}
HWY_API Vec256<int32_t> Neg(const Vec256<int32_t> v) {
  return Vec256<int32_t>{wasm_i32x4_neg(v.raw)};
}
HWY_API Vec256<int64_t> Neg(const Vec256<int64_t> v) {
  return Vec256<int64_t>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_mul(a.raw, b.raw)};
}

HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{wasm_f32x4_div(a.raw, b.raw)};
}

// Approximate reciprocal
HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
  return one / v;
}

// Absolute value of difference.
HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> add) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfma?
  return mul * x + add;
}

// Returns add - mul * x
HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> add) {
  // TODO(eustas): replace, when implemented in WASM.
  return add - mul * x;
}

// Returns mul * x - sub
HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfms?
  return mul * x - sub;
}

// Returns -mul * x - sub
HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  return Neg(mul) * x - sub;
}
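
// Usage sketch (assumed values): MulAdd with mul = 2, x = 3, add = 1 computes
// 2 * 3 + 1 = 7 in every lane. Note the separate multiply and add above round
// twice, unlike a true fused multiply-add.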

// ------------------------------ Floating-point square root

// Full precision square root
HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_sqrt(v.raw)};
}

// Approximate reciprocal square root
HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
  return one / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
HWY_API Vec256<float> Round(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_nearest(v.raw)};
}

// Toward zero, aka truncate
HWY_API Vec256<float> Trunc(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
HWY_API Vec256<float> Ceil(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_ceil(v.raw)};
}

// Toward -infinity, aka floor
HWY_API Vec256<float> Floor(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_floor(v.raw)};
}

// ------------------------------ Floating-point classification

template <typename T>
HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
  return v != v;
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}
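
// Worked example of the 'shift left' trick: +inf is 0x7F800000 and -inf is
// 0xFF800000; adding each value to itself discards the sign bit, so both
// become 0xFF000000 == MaxExponentTimes2, while any finite value or NaN
// (nonzero mantissa) produces a different bit pattern.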

// Returns whether normal/subnormal/zero.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <typename TFrom, typename TTo>
HWY_API Mask256<TTo> RebindMask(Full256<TTo> /* tag */, Mask256<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask256<TTo>{m.raw};
}

template <typename T>
HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
HWY_API Mask256<uint8_t> operator==(const Vec256<uint8_t> a,
                                    const Vec256<uint8_t> b) {
  return Mask256<uint8_t>{wasm_i8x16_eq(a.raw, b.raw)};
}
HWY_API Mask256<uint16_t> operator==(const Vec256<uint16_t> a,
                                     const Vec256<uint16_t> b) {
  return Mask256<uint16_t>{wasm_i16x8_eq(a.raw, b.raw)};
}
HWY_API Mask256<uint32_t> operator==(const Vec256<uint32_t> a,
                                     const Vec256<uint32_t> b) {
  return Mask256<uint32_t>{wasm_i32x4_eq(a.raw, b.raw)};
}

// Signed
HWY_API Mask256<int8_t> operator==(const Vec256<int8_t> a,
                                   const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_eq(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator==(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_eq(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator==(const Vec256<int32_t> a,
                                    const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_eq(a.raw, b.raw)};
}

// Float
HWY_API Mask256<float> operator==(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
HWY_API Mask256<uint8_t> operator!=(const Vec256<uint8_t> a,
                                    const Vec256<uint8_t> b) {
  return Mask256<uint8_t>{wasm_i8x16_ne(a.raw, b.raw)};
}
HWY_API Mask256<uint16_t> operator!=(const Vec256<uint16_t> a,
                                     const Vec256<uint16_t> b) {
  return Mask256<uint16_t>{wasm_i16x8_ne(a.raw, b.raw)};
}
HWY_API Mask256<uint32_t> operator!=(const Vec256<uint32_t> a,
                                     const Vec256<uint32_t> b) {
  return Mask256<uint32_t>{wasm_i32x4_ne(a.raw, b.raw)};
}

// Signed
HWY_API Mask256<int8_t> operator!=(const Vec256<int8_t> a,
                                   const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_ne(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator!=(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_ne(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator!=(const Vec256<int32_t> a,
                                    const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_ne(a.raw, b.raw)};
}

// Float
HWY_API Mask256<float> operator!=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

HWY_API Mask256<int8_t> operator>(const Vec256<int8_t> a,
                                  const Vec256<int8_t> b) {
  return Mask256<int8_t>{wasm_i8x16_gt(a.raw, b.raw)};
}
HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
                                   const Vec256<int16_t> b) {
  return Mask256<int16_t>{wasm_i16x8_gt(a.raw, b.raw)};
}
HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
                                   const Vec256<int32_t> b) {
  return Mask256<int32_t>{wasm_i32x4_gt(a.raw, b.raw)};
}
HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
                                   const Vec256<int64_t> b) {
  const Repartition<int32_t, DFromV<decltype(a)>> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, their (signed) comparison is the
  // answer.
  const auto m_gt = (a32 > b32).raw;

  // Otherwise, the lower halves decide; they must be compared as unsigned.
  const auto m_eq = (a32 == b32).raw;
  const auto lo_gt_u = wasm_u32x4_gt(a32.raw, b32.raw);
  const auto lo_in_hi = wasm_i32x4_shuffle(lo_gt_u, lo_gt_u, 0, 0, 2, 2);
  const auto lo_gt = wasm_v128_and(m_eq, lo_in_hi);

  const auto gt = wasm_v128_or(lo_gt, m_gt);
  // Copy the result in the upper 32 bits of each half to the lower 32 bits.
  return Mask256<int64_t>{wasm_i32x4_shuffle(gt, gt, 1, 1, 3, 3)};
}

template <typename T, HWY_IF_UNSIGNED(T)>
HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
  const Full256<T> du;
  const RebindToSigned<decltype(du)> di;
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}
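
// Worked example of the sign-flip trick: for uint8_t, msb = 0x80. Comparing
// a = 0xFF (255) and b = 0x01 after the XOR compares 0x7F (127) with 0x81
// (-127) as signed, which is true, matching the unsigned result 255 > 1.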

HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_gt(a.raw, b.raw)};
}

template <typename T>
HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float <= >=
HWY_API Mask256<float> operator<=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_le(a.raw, b.raw)};
}
HWY_API Mask256<float> operator>=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{wasm_f32x4_ge(a.raw, b.raw)};
}

// ------------------------------ FirstN (Iota, Lt)

template <typename T>
HWY_API Mask256<T> FirstN(const Full256<T> d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T>
HWY_API Vec256<T> Not(Vec256<T> v) {
  return Vec256<T>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  return Vec256<T>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Or3

template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
  return And(a, b);
}

template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
  return Or(a, b);
}

template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Full256<T>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T>
HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Full256<T>()), sign));
}

// ------------------------------ BroadcastSignBit (compare)

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
  return VecFromMask(Full256<int8_t>(), v < Zero(Full256<int8_t>()));
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T>
HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
  return Mask256<T>{v.raw};
}

template <typename T>
HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, Mask256<T> v) {
  return Vec256<T>{v.raw};
}

// mask ? yes : no
template <typename T>
HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return Vec256<T>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T>
HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
  return yes & VecFromMask(Full256<T>(), mask);
}

// mask ? 0 : no
template <typename T>
HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
  return AndNot(VecFromMask(Full256<T>(), mask), no);
}

template <typename T>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  HWY_ASSERT(0);
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
  const Full256<T> d;
  const auto zero = Zero(d);
  return IfThenElse(Mask256<T>{(v > zero).raw}, v, zero);
}

// ------------------------------ Mask logical

template <typename T>
HWY_API Mask256<T> Not(const Mask256<T> m) {
  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
}

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}
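
// Worked example of the bit-testing approach above: for a shift count of 3
// (binary 0011), only the ShiftLeft<2> and ShiftLeft<1> steps are selected,
// so uint16_t lanes of 1 become 1 << 3 = 8.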

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  const Full256<T> d;
  Mask256<T> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T>
HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec256<T>{wasm_v128_load(aligned)};
}

template <typename T>
HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
                             const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// LoadU == Load.
template <typename T>
HWY_API Vec256<T> LoadU(Full256<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T>
HWY_API Vec256<T> LoadDup128(Full256<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store

template <typename T>
HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// StoreU == Store.
template <typename T>
HWY_API void StoreU(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                          T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <typename T>
HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
                    T* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// ------------------------------ Scatter (Store)

template <typename T, typename Offset>
HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);

  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  Store(offset, Full256<Offset>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, typename Index>
HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);

  alignas(32) Index index_lanes[32 / sizeof(T)];
  Store(index, Full256<Index>(), index_lanes);

  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (Load/Store)

template <typename T, typename Offset>
HWY_API Vec256<T> GatherOffset(const Full256<T> d, const T* HWY_RESTRICT base,
                               const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(32) Offset offset_lanes[32 / sizeof(T)];
  Store(offset, Full256<Offset>(), offset_lanes);

  alignas(32) T lanes[32 / sizeof(T)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <typename T, typename Index>
HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
                              const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(32) Index index_lanes[32 / sizeof(T)];
  Store(index, Full256<Index>(), index_lanes);

  alignas(32) T lanes[32 / sizeof(T)];
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

// ================================================== SWIZZLE

// ------------------------------ ExtractLane
template <typename T>
HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
  HWY_ASSERT(0);
}

// ------------------------------ InsertLane
template <typename T>
HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
  HWY_ASSERT(0);
}

// ------------------------------ GetLane
// Gets the single value stored in a vector/part.
HWY_API uint8_t GetLane(const Vec256<uint8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
HWY_API int8_t GetLane(const Vec256<int8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);
}
HWY_API uint16_t GetLane(const Vec256<uint16_t> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
HWY_API int16_t GetLane(const Vec256<int16_t> v) {
  return wasm_i16x8_extract_lane(v.raw, 0);
}
HWY_API uint32_t GetLane(const Vec256<uint32_t> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
HWY_API int32_t GetLane(const Vec256<int32_t> v) {
  return wasm_i32x4_extract_lane(v.raw, 0);
}
HWY_API uint64_t GetLane(const Vec256<uint64_t> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}
HWY_API int64_t GetLane(const Vec256<int64_t> v) {
  return wasm_i64x2_extract_lane(v.raw, 0);
}

HWY_API float GetLane(const Vec256<float> v) {
  return wasm_f32x4_extract_lane(v.raw, 0);
}

// ------------------------------ LowerHalf

template <typename T>
HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, const Vec256<T> v) {
  return Vec128<T>{v.raw};
}

template <typename T>
HWY_API Vec128<T> LowerHalf(const Vec256<T> v) {
  return LowerHalf(Full128<T>(), v);
}

// ------------------------------ ShiftLeftBytes

// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
  switch (kBytes) {
    case 0:
      return v;

    case 1:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                          7, 8, 9, 10, 11, 12, 13, 14)};

    case 2:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                          6, 7, 8, 9, 10, 11, 12, 13)};

    case 3:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                          4, 5, 6, 7, 8, 9, 10, 11, 12)};

    case 4:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                          3, 4, 5, 6, 7, 8, 9, 10, 11)};

    case 5:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                          2, 3, 4, 5, 6, 7, 8, 9, 10)};

    case 6:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};

    case 7:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};

    case 8:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};

    case 9:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};

    case 10:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};

    case 11:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};

    case 12:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};

    case 13:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};

    case 14:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 0,
                                          1)};

    case 15:
      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 16,
                                          0)};
  }
  return Vec256<T>{zero};
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
}

// ------------------------------ ShiftRightBytes
namespace detail {

// Helper function allows zeroing invalid lanes in caller.
template <int kBytes, typename T>
HWY_API __i8x16 ShrBytes(const Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  switch (kBytes) {
    case 0:
      return v.raw;

    case 1:
      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                12, 13, 14, 15, 16);

    case 2:
      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16);

    case 3:
      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16, 16);

    case 4:
      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                14, 15, 16, 16, 16, 16);

    case 5:
      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 16, 16, 16, 16);

    case 6:
      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16);

    case 7:
      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16, 16);

    case 8:
      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 9:
      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 10:
      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 11:
      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 12:
      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 13:
      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 14:
      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 15:
      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 16:
      return zero;
  }
}

}  // namespace detail

// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
  return Vec256<T>{detail::ShrBytes<kBytes>(v)};
}

// ------------------------------ ShiftRightLanes
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full input: copy hi into lo (smaller instruction encoding than shifts).
template <typename T>
HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Full128<T> /* tag */,
                                           const Vec256<T> v) {
  return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
HWY_API Vec128<float, 2> UpperHalf(Full128<float> /* tag */,
                                   const Vec256<float> v) {
  return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, class V = Vec256<T>>
HWY_API V CombineShiftRightBytes(Full256<T> /* tag */, V hi, V lo) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  switch (kBytes) {
    case 0:
      return lo;

    case 1:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16)};

    case 2:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15, 16, 17)};

    case 3:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                  12, 13, 14, 15, 16, 17, 18)};

    case 4:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                  13, 14, 15, 16, 17, 18, 19)};

    case 5:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20)};

    case 6:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20, 21)};

    case 7:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
                                  15, 16, 17, 18, 19, 20, 21, 22)};

    case 8:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
                                  16, 17, 18, 19, 20, 21, 22, 23)};

    case 9:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24)};

    case 10:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
                                  17, 18, 19, 20, 21, 22, 23, 24, 25)};

    case 11:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
                                  18, 19, 20, 21, 22, 23, 24, 25, 26)};

    case 12:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
                                  19, 20, 21, 22, 23, 24, 25, 26, 27)};

    case 13:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
                                  20, 21, 22, 23, 24, 25, 26, 27, 28)};

    case 14:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
                                  21, 22, 23, 24, 25, 26, 27, 28, 29)};

    case 15:
      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
                                  22, 23, 24, 25, 26, 27, 28, 29, 30)};
  }
  return hi;
}

// ------------------------------ Broadcast/splat any lane

// Unsigned
template <int kLane>
HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec256<uint16_t>{wasm_i16x8_shuffle(
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane>
HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<uint32_t>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// Signed
template <int kLane>
HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec256<int16_t>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                            kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane>
HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<int32_t>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// Float
template <int kLane>
HWY_API Vec256<float> Broadcast(const Vec256<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<float>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

// ------------------------------ TableLookupBytes

// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
// lane indices in [0, 16).
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
                                    const Vec256<TI> from) {
// Not yet available in all engines, see
// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
// V8 implementation of this had a bug, fixed on 2021-04-03:
// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
#if 0
  return Vec256<TI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
#else
  alignas(32) uint8_t control[16];
  alignas(32) uint8_t input[16];
  alignas(32) uint8_t output[16];
  wasm_v128_store(control, from.raw);
  wasm_v128_store(input, bytes.raw);
  for (size_t i = 0; i < 16; ++i) {
    output[i] = control[i] < 16 ? input[control[i]] : 0;
  }
  return Vec256<TI>{wasm_v128_load(output)};
#endif
}
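
// Usage sketch (assumed values): if `bytes` holds {0, 10, 20, ..., 150} and
// `from` holds byte indices {1, 1, 4, ...}, the result lanes are
// bytes[from[i]], i.e. {10, 10, 40, ...}; indices >= 16 yield 0.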

template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytesOr0(const Vec256<T> bytes,
                                       const Vec256<TI> from) {
  const Full256<TI> d;
  // Mask size must match vector type, so cast everything to this type.
  Repartition<int8_t, decltype(d)> di8;
  Repartition<int8_t, Full256<T>> d_bytes8;
  const auto msb = BitCast(di8, from) < Zero(di8);
  const auto lookup =
      TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
  return BitCast(d, IfThenZeroElse(msb, lookup));
}

// ------------------------------ Hard-coded shuffles

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}

// Swap 64-bit halves
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
  return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}

// Rotate right 32 bits
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
// Rotate left 32 bits
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}

// Reverse
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices256 {
  __v128_u raw;
};

template <typename T, typename TI>
HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
  return Indices256<T>{};
}

template <typename T, typename TI>
HWY_API Indices256<T> SetTableIndices(Full256<T> d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  using TI = MakeSigned<T>;
  const Full256<T> d;
  const Full256<TI> di;
  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec256<TI>{idx.raw}));
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle01(v);
}

// Four lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle0123(v);
}

// 16-bit
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}

// ------------------------------ Reverse2

template <typename T>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ Reverse4

template <typename T>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ Reverse8

template <typename T>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
  HWY_ASSERT(0);
}

// ------------------------------ InterleaveLower

HWY_API Vec256<uint8_t> InterleaveLower(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18,
                                            3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec256<uint16_t> InterleaveLower(Vec256<uint16_t> a,
                                         Vec256<uint16_t> b) {
  return Vec256<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
HWY_API Vec256<uint32_t> InterleaveLower(Vec256<uint32_t> a,
                                         Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
HWY_API Vec256<uint64_t> InterleaveLower(Vec256<uint64_t> a,
                                         Vec256<uint64_t> b) {
  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

HWY_API Vec256<int8_t> InterleaveLower(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3,
                                           19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
HWY_API Vec256<int16_t> InterleaveLower(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
HWY_API Vec256<int32_t> InterleaveLower(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
HWY_API Vec256<int64_t> InterleaveLower(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}

// Additional overload for the optional tag.
template <typename T, class V = Vec256<T>>
HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {

HWY_API Vec256<uint8_t> InterleaveUpper(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                            11, 27, 12, 28, 13, 29, 14, 30, 15,
                                            31)};
}
HWY_API Vec256<uint16_t> InterleaveUpper(Vec256<uint16_t> a,
                                         Vec256<uint16_t> b) {
  return Vec256<uint16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
HWY_API Vec256<uint32_t> InterleaveUpper(Vec256<uint32_t> a,
                                         Vec256<uint32_t> b) {
  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
HWY_API Vec256<uint64_t> InterleaveUpper(Vec256<uint64_t> a,
                                         Vec256<uint64_t> b) {
  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

HWY_API Vec256<int8_t> InterleaveUpper(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                           11, 27, 12, 28, 13, 29, 14, 30, 15,
                                           31)};
}
HWY_API Vec256<int16_t> InterleaveUpper(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
HWY_API Vec256<int32_t> InterleaveUpper(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
HWY_API Vec256<int64_t> InterleaveUpper(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

HWY_API Vec256<float> InterleaveUpper(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}

}  // namespace detail

template <typename T, class V = Vec256<T>>
HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
  return detail::InterleaveUpper(a, b);
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, class DW = RepartitionToWide<Full256<T>>>
HWY_API VFromD<DW> ZipLower(Vec256<T> a, Vec256<T> b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}

// ================================================== COMBINE

// ------------------------------ Combine

// hi_half and lo_half become the upper and lower 128-bit halves.
template <typename T>
HWY_API Vec256<T> Combine(Full256<T> /* tag */, Vec128<T> hi_half,
                          Vec128<T> lo_half) {
  Vec256<T> ret;
  ret.v0 = lo_half;
  ret.v1 = hi_half;
  return ret;
}
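
// Usage sketch (assumed values): with dh = Full128<int32_t>(),
// Combine(Full256<int32_t>(), Set(dh, 1), Set(dh, 2)) yields a vector whose
// lower half (v0) is all-2 and whose upper half (v1) is all-1.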

// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Full256<T> d, Vec128<T> lo) {
  return IfThenElseZero(FirstN(d, 16 / sizeof(T)), Vec256<T>{lo.raw});
}

// ------------------------------ ConcatLowerLower

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec256<T> ConcatLowerLower(Full256<T> /* tag */, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
}

// ------------------------------ ConcatUpperUpper

template <typename T>
HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
}

// ------------------------------ ConcatLowerUpper

template <typename T>
HWY_API Vec256<T> ConcatLowerUpper(Full256<T> d, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}

// ------------------------------ ConcatUpperLower
template <typename T>
HWY_API Vec256<T> ConcatUpperLower(Full256<T> d, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}

// ------------------------------ ConcatOdd

// 32-bit
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
}

// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
  return InterleaveUpper(Full256<T>(), lo, hi);
}

// ------------------------------ ConcatEven (InterleaveLower)

// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
}

// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
  return InterleaveLower(Full256<T>(), lo, hi);
}

2065// ------------------------------ DupEven
2066template <typename T>
2067HWY_API Vec256<T> DupEven(Vec256<T> v) {
 2068 HWY_ASSERT(0);
2069}
2070
2071// ------------------------------ DupOdd
2072template <typename T>
2073HWY_API Vec256<T> DupOdd(Vec256<T> v) {
 2074 HWY_ASSERT(0);
2075}
2076
2077// ------------------------------ OddEven
2078
2079namespace detail {
2080
2081template <typename T>
2082HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
 2083 const Vec256<T> b) {
2084 const Full256<T> d;
2085 const Repartition<uint8_t, decltype(d)> d8;
2086 alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2087 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2088 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
2089}
2090template <typename T>
2091HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
 2092 const Vec256<T> b) {
2093 return Vec256<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2094}
2095template <typename T>
2096HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
 2097 const Vec256<T> b) {
2098 return Vec256<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2099}
2100template <typename T>
2101HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
 2102 const Vec256<T> b) {
2103 return Vec256<T>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
2104}
2105
2106} // namespace detail
2107
2108template <typename T>
2109HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
 2110 return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
2111}
2112HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
 2113 return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
2114}
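// Illustration (editor's sketch, not part of the original file): OddEven(a, b)
// keeps a in odd-indexed lanes and b in even-indexed lanes. For i32,
//   a = {10, 11, 12, 13}, b = {20, 21, 22, 23}
//   OddEven(a, b) == {20, 11, 22, 13}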
2115
2116// ------------------------------ OddEvenBlocks
2117template <typename T>
2118HWY_API Vec256<T> OddEvenBlocks(Vec256<T> /* odd */, Vec256<T> even) {
 2119 return even;
2120}
2121
2122// ------------------------------ SwapAdjacentBlocks
2123
2124template <typename T>
2125HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
 2126 return v;
2127}
2128
2129// ------------------------------ ReverseBlocks
2130
2131template <typename T>
2132HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, const Vec256<T> v) {
 2133 return v;
2134}
2135
2136// ================================================== CONVERT
2137
2138// ------------------------------ Promotions (part w/ narrow lanes -> full)
2139
2140// Unsigned: zero-extend.
2141HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
 2142 const Vec128<uint8_t> v) {
2143 return Vec256<uint16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
2144}
2145HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
 2146 const Vec128<uint8_t> v) {
2147 return Vec256<uint32_t>{
2148 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2149}
2150HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
 2151 const Vec128<uint8_t> v) {
2152 return Vec256<int16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
2153}
2154HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
 2155 const Vec128<uint8_t> v) {
2156 return Vec256<int32_t>{
2157 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
2158}
2159HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
 2160 const Vec128<uint16_t> v) {
2161 return Vec256<uint32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
2162}
2163HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
 2164 const Vec128<uint16_t> v) {
2165 return Vec256<int32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
2166}
2167
2168// Signed: replicate sign bit.
2169HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
 2170 const Vec128<int8_t> v) {
2171 return Vec256<int16_t>{wasm_i16x8_extend_low_i8x16(v.raw)};
2172}
2173HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
 2174 const Vec128<int8_t> v) {
2175 return Vec256<int32_t>{
2176 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
2177}
2178HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
 2179 const Vec128<int16_t> v) {
2180 return Vec256<int32_t>{wasm_i32x4_extend_low_i16x8(v.raw)};
2181}
2182
2183HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
 2184 const Vec128<int32_t> v) {
2185 return Vec256<double>{wasm_f64x2_convert_low_i32x4(v.raw)};
2186}
2187
2188HWY_API Vec256<float> PromoteTo(Full256<float> /* tag */,
 2189 const Vec128<float16_t> v) {
2190 const Full256<int32_t> di32;
2191 const Full256<uint32_t> du32;
2192 const Full256<float> df32;
2193 // Expand to u32 so we can shift.
2194 const auto bits16 = PromoteTo(du32, Vec256<uint16_t>{v.raw});
2195 const auto sign = ShiftRight<15>(bits16);
2196 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
2197 const auto mantissa = bits16 & Set(du32, 0x3FF);
2198 const auto subnormal =
2199 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
2200 Set(df32, 1.0f / 16384 / 1024));
2201
2202 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
2203 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
2204 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2205 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
2206 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2207}
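// Worked example (editor's illustration): v = 0x3C00 is 1.0 in binary16:
// sign = 0, biased_exp = 15, mantissa = 0. Then biased_exp32 =
// 15 + (127 - 15) = 127 and normal = 127 << 23 = 0x3F800000, i.e. 1.0f.
// For the subnormal input 0x0001, biased_exp == 0, so the subnormal path
// applies: 1 * (1.0f / 16384 / 1024) = 2^-24, the smallest positive
// binary16 value.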
2208
2209HWY_API Vec256<float> PromoteTo(Full256<float> df32,
 2210 const Vec128<bfloat16_t> v) {
2211 const Rebind<uint16_t, decltype(df32)> du16;
2212 const RebindToSigned<decltype(df32)> di32;
2213 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2214}
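// Illustration (editor's note): bfloat16 is the upper half of a binary32,
// so promotion is a plain 16-bit left shift, e.g.
// 0x3F80 << 16 == 0x3F800000 == 1.0f.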
2215
2216// ------------------------------ Demotions (full -> part w/ narrow lanes)
2217
2218HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
 2219 const Vec256<int32_t> v) {
2220 return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
2221}
2222
2223HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
 2224 const Vec256<int32_t> v) {
2225 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
2226}
2227
2228HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
 2229 const Vec256<int32_t> v) {
2230 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2231 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2232}
2233
2234HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
 2235 const Vec256<int16_t> v) {
2236 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
2237}
2238
2239HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
 2240 const Vec256<int32_t> v) {
2241 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2242 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
2243}
2244
2245HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
 2246 const Vec256<int16_t> v) {
2247 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
2248}
2249
2250HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
 2251 const Vec256<double> v) {
2252 return Vec128<int32_t>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
2253}
2254
2255HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
 2256 const Vec256<float> v) {
2257 const Full256<int32_t> di;
2258 const Full256<uint32_t> du;
2259 const Full256<uint16_t> du16;
2260 const auto bits32 = BitCast(du, v);
2261 const auto sign = ShiftRight<31>(bits32);
2262 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
2263 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
2264
2265 const auto k15 = Set(di, 15);
2266 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
2267 const auto is_tiny = exp < Set(di, -24);
2268
2269 const auto is_subnormal = exp < Set(di, -14);
2270 const auto biased_exp16 =
2271 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
2272 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
2273 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
2274 (mantissa32 >> (Set(du, 13) + sub_exp));
2275 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
2276 ShiftRight<13>(mantissa32)); // <1024
2277
2278 const auto sign16 = ShiftLeft<15>(sign);
2279 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2280 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
2281 return Vec128<float16_t>{DemoteTo(du16, bits16).raw};
2282}
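// Worked example (editor's illustration): v = 1.5f has bits 0x3FC00000:
// exp = 127 - 127 = 0 (not subnormal), mantissa16 = 0x400000 >> 13 = 0x200,
// biased_exp16 = 0 + 15 = 15, so normal16 = (15 << 10) | 0x200 = 0x3E00,
// which is 1.5 in binary16.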
2283
2284HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
 2285 const Vec256<float> v) {
2286 const Rebind<int32_t, decltype(dbf16)> di32;
2287 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
2288 const Rebind<uint16_t, decltype(dbf16)> du16;
2289 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
2290 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
2291}
2292
2293HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
 2294 Vec256<float> a, Vec256<float> b) {
 2295 const RebindToUnsigned<decltype(dbf16)> du16;
2296 const Repartition<uint32_t, decltype(dbf16)> du32;
2297 const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
2298 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
2299}
2300
2301// For already range-limited input [0, 255].
2302HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
 2303 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
2304 return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2305}
2306
2307// ------------------------------ Convert i32 <=> f32 (Round)
2308
2309HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
 2310 const Vec256<int32_t> v) {
2311 return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
2312}
2313// Truncates (rounds toward zero).
2314HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
 2315 const Vec256<float> v) {
2316 return Vec256<int32_t>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
2317}
2318
2319HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
 2320 return ConvertTo(Full256<int32_t>(), Round(v));
2321}
2322
2323// ================================================== MISC
2324
2325// ------------------------------ LoadMaskBits (TestBit)
2326
2327namespace detail {
2328
2329template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2330HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
 2331 const RebindToUnsigned<decltype(d)> du;
2332 // Easier than Set(), which would require an >8-bit type, which would not
2333 // compile for T=uint8_t, N=1.
2334 const Vec256<T> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
2335
2336 // Replicate bytes 8x such that each byte contains the bit that governs it.
2337 alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2338 1, 1, 1, 1, 1, 1, 1, 1};
2339 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
2340
2341 alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2342 1, 2, 4, 8, 16, 32, 64, 128};
2343 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
2344}
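// Worked example (editor's illustration): bits = 0b0101 (lanes 0 and 2
// true). Every byte of rep8's low half is then 0x05, and AND-ing with
// kBit = {1, 2, 4, ...} leaves {1, 0, 4, 0, ...}, so TestBit reports 0xFF
// exactly in bytes 0 and 2.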
2345
2346template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2347HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
 2348 const RebindToUnsigned<decltype(d)> du;
2349 alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2350 return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2351}
2352
2353template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2354HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2355 const RebindToUnsigned<decltype(d)> du;
2356 alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2357 return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2358}
2359
2360template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2361HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
2362 const RebindToUnsigned<decltype(d)> du;
2363 alignas(32) constexpr uint64_t kBit[8] = {1, 2};
2364 return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
2365}
2366
2367} // namespace detail
2368
2369// `bits` points to at least 8 readable bytes, not all of which need be valid.
2370template <typename T>
2371HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
 2372 const uint8_t* HWY_RESTRICT bits) {
 2373 uint64_t mask_bits = 0;
 2374 CopyBytes<(32 / sizeof(T) + 7) / 8>(bits, &mask_bits);
2375 return detail::LoadMaskBits(d, mask_bits);
2376}
2377
2378// ------------------------------ Mask
2379
2380namespace detail {
2381
2382// Full
2383template <typename T>
2384HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
 2385 const Mask256<T> mask) {
2386 alignas(32) uint64_t lanes[2];
2387 wasm_v128_store(lanes, mask.raw);
2388
2389 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2390 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2391 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2392 return (hi + lo);
2393}
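// Editor's note on the magic constant: mask bytes are 0x00 or 0xFF, and
// 0xFF * kMagic == 0x0102040810204080, so the i-th mask byte contributes
// exactly bit i to the top byte of the 64-bit product. For example,
// lanes[0] == 0x00000000000000FF (only byte 0 set) gives lo == 0x01.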
2394
2395template <typename T>
2396HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
 2397 const Mask256<T> mask) {
2398 // Remove useless lower half of each u16 while preserving the sign bit.
2399 const __i16x8 zero = wasm_i16x8_splat(0);
2400 const Mask256<uint8_t> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
2401 return BitsFromMask(hwy::SizeTag<1>(), mask8);
2402}
2403
2404template <typename T>
2405HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
 2406 const Mask256<T> mask) {
2407 const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
2408 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2409 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2410 alignas(32) uint32_t lanes[4];
2411 wasm_v128_store(lanes, sliced_mask);
2412 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2413}
2414
2415// Returns 0xFF for bytes with index >= N, otherwise 0.
2416template <size_t N> constexpr __i8x16 BytesAbove() {
2417 return
2418 (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
2419 : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
2420 : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
2421 : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
2422 : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
2423 : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
2424 : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
2425 : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
2426 : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
2427 : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2428 -1, -1, -1, -1, -1)
2429 : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2430 -1, -1, -1, -1)
2431 : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
2432 -1, -1, -1, -1)
2433 : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
2434 -1, -1, -1)
2435 : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
2436 -1, -1, -1)
2437 : (N == 11)
2438 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
2439 : (N == 13)
2440 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
2441 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
2442}
2443
2444template <typename T>
2445HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
2446 return BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask);
2447}
2448
2449template <typename T>
2450HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask256<T> m) {
2451 return PopCount(BitsFromMask(tag, m));
2452}
2453
2454template <typename T>
2455HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask256<T> m) {
2456 return PopCount(BitsFromMask(tag, m));
2457}
2458
2459template <typename T>
2460HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
2461 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
2462 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
2463 alignas(32) uint64_t lanes[2];
2464 wasm_v128_store(lanes, shifted_bits);
2465 return PopCount(lanes[0] | lanes[1]);
2466}
2467
2468} // namespace detail
2469
2470// `bits` points to at least 8 writable bytes.
2471template <typename T>
2472HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
2473 uint8_t* bits) {
2474 const uint64_t mask_bits = detail::BitsFromMask(mask);
 2475 constexpr size_t N = 32 / sizeof(T); constexpr size_t kNumBytes = (N + 7) / 8;
2476 CopyBytes<kNumBytes>(&mask_bits, bits);
2477 return kNumBytes;
2478}
2479
2480template <typename T>
2481HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> m) {
2482 return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
2483}
2484
2485template <typename T>
2486HWY_API bool AllFalse(const Full256<T> d, const Mask256<T> m) {
2487#if 0
2488 // Casting followed by wasm_i8x16_any_true results in wasm error:
2489 // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
2490 const auto v8 = BitCast(Full256<int8_t>(), VecFromMask(d, m));
2491 return !wasm_i8x16_any_true(v8.raw);
2492#else
2493 (void)d;
2494 return (wasm_i64x2_extract_lane(m.raw, 0) |
2495 wasm_i64x2_extract_lane(m.raw, 1)) == 0;
2496#endif
2497}
2498
2499// Full vector
2500namespace detail {
2501template <typename T>
2502HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> m) {
2503 return wasm_i8x16_all_true(m.raw);
2504}
2505template <typename T>
2506HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> m) {
2507 return wasm_i16x8_all_true(m.raw);
2508}
2509template <typename T>
2510HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
2511 return wasm_i32x4_all_true(m.raw);
2512}
2513
2514} // namespace detail
2515
2516template <typename T>
2517HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> m) {
2518 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
2519}
2520
2521template <typename T>
2522HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
2523 const Mask256<T> mask) {
2524 const uint64_t bits = detail::BitsFromMask(mask);
2525 return bits ? Num0BitsBelowLS1Bit_Nonzero64(bits) : -1;
2526}
2527
2528// ------------------------------ Compress
2529
2530namespace detail {
2531
2532template <typename T>
2533HWY_INLINE Vec256<T> Idx16x8FromBits(const uint64_t mask_bits) {
2534 HWY_DASSERT(mask_bits < 256);
2535 const Full256<T> d;
2536 const Rebind<uint8_t, decltype(d)> d8;
2537 const Full256<uint16_t> du;
2538
2539 // We need byte indices for TableLookupBytes (one vector's worth for each of
2540 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
2541 // can instead store lane indices and convert to byte indices (2*lane + 0..1),
2542 // with the doubling baked into the table. Unpacking nibbles is likely more
2543 // costly than the higher cache footprint from storing bytes.
2544 alignas(32) constexpr uint8_t table[256 * 8] = {
2545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
2546 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
2547 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
2548 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
2549 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
2550 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
2551 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
2552 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2553 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
2554 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
2555 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
2556 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
2557 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
2558 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
2559 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
2560 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
2561 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
2562 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
2563 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
2564 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
2565 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
2566 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
2567 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
2568 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2569 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
2570 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
2571 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
2572 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
2573 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
2574 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
2575 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
2576 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
2577 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
2578 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
2579 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
2580 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
2581 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
2582 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
2583 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
2584 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2585 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
2586 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
2587 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
2588 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
2589 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
2590 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
2591 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
2592 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
2593 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
2594 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
2595 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
2596 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
2597 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
2598 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
2599 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
2600 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2601 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
2602 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
2603 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
2604 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
2605 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
2606 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
2607 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
2608 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
2609 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
2610 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
2611 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
2612 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
2613 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
2614 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
2615 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
2616 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2617 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
2618 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
2619 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
2620 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
2621 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
2622 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
2623 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
2624 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
2625 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
2626 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
2627 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
2628 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
2629 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
2630 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
2631 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
2632 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2633 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
2634 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
2635 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
2636 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
2637 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
2638 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
2639 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
2640 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
2641 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
2642 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
2643 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
2644 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
2645 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
2646 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
2647 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
2648 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2649 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
2650 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
2651 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
2652 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
2653 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
2654 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
2655 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
2656 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
2657 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
2658 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
2659
2660 const Vec256<uint8_t> byte_idx{Load(d8, table + mask_bits * 8).raw};
2661 const Vec256<uint16_t> pairs = ZipLower(byte_idx, byte_idx);
2662 return BitCast(d, pairs + Set(du, 0x0100));
2663}
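// Worked example (editor's illustration): mask_bits = 0b0110 selects u16
// lanes 1 and 2, and the table row is {2, 4, 0, ...}. ZipLower(byte_idx,
// byte_idx) turns each byte k into the u16 k * 0x0101; adding 0x0100 gives
// k | ((k + 1) << 8), i.e. little-endian byte pairs {2,3, 4,5, ...}, the
// two bytes of each selected lane, ready for TableLookupBytes.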
2664
2665template <typename T>
2666HWY_INLINE Vec256<T> Idx32x4FromBits(const uint64_t mask_bits) {
2667 HWY_DASSERT(mask_bits < 16);
2668
2669 // There are only 4 lanes, so we can afford to load the index vector directly.
2670 alignas(32) constexpr uint8_t packed_array[16 * 16] = {
2671 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2672 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2673 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2674 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
2675 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2676 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2677 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
2678 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
2679 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
2680 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2681 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2682 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
2683 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
2684 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2685 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
2686 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2687
2688 const Full256<T> d;
2689 const Repartition<uint8_t, decltype(d)> d8;
2690 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2691}
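// Worked example (editor's illustration): mask_bits = 0b0101 selects u32
// lanes 0 and 2; row 5 of packed_array is {0,1,2,3, 8,9,10,11, 0,1,2,3,
// 0,1,2,3}, so TableLookupBytes packs lanes 0 and 2 into the low two lanes
// (remaining lanes repeat lane 0).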
2692
2693#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2694
2695template <typename T>
2696HWY_INLINE Vec256<T> Idx64x2FromBits(const uint64_t mask_bits) {
2697 HWY_DASSERT(mask_bits < 4);
2698
2699 // There are only 2 lanes, so we can afford to load the index vector directly.
2700 alignas(32) constexpr uint8_t packed_array[4 * 16] = {
2701 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
2702 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
2703 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
2704 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2705
2706 const Full256<T> d;
2707 const Repartition<uint8_t, decltype(d)> d8;
2708 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
2709}
2710
2711#endif
2712
2713// Helper functions called by both Compress and CompressStore - avoids a
2714// redundant BitsFromMask in the latter.
2715
2716template <typename T>
2717HWY_INLINE Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
 2718 const uint64_t mask_bits) {
2719 const auto idx = detail::Idx16x8FromBits<T>(mask_bits);
2720 using D = Full256<T>;
2721 const RebindToSigned<D> di;
2722 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2723}
2724
2725template <typename T>
2726HWY_INLINE Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
 2727 const uint64_t mask_bits) {
2728 const auto idx = detail::Idx32x4FromBits<T>(mask_bits);
2729 using D = Full256<T>;
2730 const RebindToSigned<D> di;
2731 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2732}
2733
2734#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
2735
2736template <typename T>
2737HWY_INLINE Vec256<T> Compress(hwy::SizeTag<8> /*tag*/,
 2738 Vec256<T> v,
 2739 const uint64_t mask_bits) {
2740 const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);
2741 using D = Full256<T>;
2742 const RebindToSigned<D> di;
2743 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
2744}
2745
2746#endif
2747
2748} // namespace detail
2749
2750template <typename T>
2751struct CompressIsPartition {
2752 enum { value = 1 };
2753};
2754
2755template <typename T>
2756HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
 2757 const uint64_t mask_bits = detail::BitsFromMask(mask);
2758 return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2759}
2760
2761// ------------------------------ CompressNot
2762template <typename T>
2763HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
2764 return Compress(v, Not(mask));
2765}
2766
2767// ------------------------------ CompressBlocksNot
2768HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
 2769 Mask256<uint64_t> mask) {
2770 HWY_ASSERT(0);
2771}
2772
2773// ------------------------------ CompressBits
2774
2775template <typename T>
2776HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
 2777 constexpr size_t N = 32 / sizeof(T); uint64_t mask_bits = 0;
2778 constexpr size_t kNumBytes = (N + 7) / 8;
2779 CopyBytes<kNumBytes>(bits, &mask_bits);
2780 if (N < 8) {
2781 mask_bits &= (1ull << N) - 1;
2782 }
2783
2784 return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2785}
2786
2787// ------------------------------ CompressStore
2788template <typename T>
2789HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
 2790 T* HWY_RESTRICT unaligned) {
2791 const uint64_t mask_bits = detail::BitsFromMask(mask);
2792 const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2793 StoreU(c, d, unaligned);
2794 return PopCount(mask_bits);
2795}
2796
2797// ------------------------------ CompressBlendedStore
2798template <typename T>
2799HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
 2800 T* HWY_RESTRICT unaligned) {
2801 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
2802 using TU = TFromD<decltype(du)>;
2803 const uint64_t mask_bits = detail::BitsFromMask(m);
2804 const size_t count = PopCount(mask_bits);
2805 const Mask256<TU> store_mask = FirstN(du, count);
2806 const Vec256<TU> compressed =
2807 detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
2808 const Vec256<TU> prev = BitCast(du, LoadU(d, unaligned));
2809 StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned);
2810 return count;
2811}
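// Usage sketch (editor's illustration; `in` and `out` are hypothetical
// int32_t buffers with Lanes(d) valid elements):
//
//   const Full256<int32_t> d;
//   const auto v = LoadU(d, in);
//   const size_t count = CompressBlendedStore(v, v > Zero(d), d, out);
//   // Only out[0, count) were overwritten, with the lanes where v > 0.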
2812
2813// ------------------------------ CompressBitsStore
2814
2815template <typename T>
2816HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
 2817 Full256<T> d, T* HWY_RESTRICT unaligned) {
 2818 constexpr size_t N = 32 / sizeof(T); uint64_t mask_bits = 0;
2819 constexpr size_t kNumBytes = (N + 7) / 8;
2820 CopyBytes<kNumBytes>(bits, &mask_bits);
2821 if (N < 8) {
2822 mask_bits &= (1ull << N) - 1;
2823 }
2824
2825 const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
2826 StoreU(c, d, unaligned);
2827 return PopCount(mask_bits);
2828}
2829
2830// ------------------------------ StoreInterleaved2/3/4
2831
2832// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
2833// generic_ops-inl.h.
2834
2835// ------------------------------ MulEven/Odd (Load)
2836
2837HWY_API Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
 2838 const Vec256<uint64_t> b) {
2839 alignas(32) uint64_t mul[2];
2840 mul[0] =
2841 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
2842 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
2843 return Load(Full256<uint64_t>(), mul);
2844}
2845
2846HWY_API Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
 2847 const Vec256<uint64_t> b) {
2848 alignas(32) uint64_t mul[2];
2849 mul[0] =
2850 Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
2851 static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
2852 return Load(Full256<uint64_t>(), mul);
2853}
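// Worked example (editor's illustration): for MulEven with
// a = {0xFFFFFFFFFFFFFFFF, ...} and b = {2, ...}, Mul128 computes
// (2^64 - 1) * 2 = 2^65 - 2, so the result holds
// {0xFFFFFFFFFFFFFFFE, 1} = {low, high} of the 128-bit product.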
2854
2855// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2856
2857HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
 2858 Vec256<bfloat16_t> a,
 2859 Vec256<bfloat16_t> b,
 2860 const Vec256<float> sum0,
2861 Vec256<float>& sum1) {
2862 const Repartition<uint16_t, decltype(df32)> du16;
2863 const RebindToUnsigned<decltype(df32)> du32;
2864 const Vec256<uint16_t> zero = Zero(du16);
2865 const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
2866 const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
2867 const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
2868 const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
2869 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2870 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2871}
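// Editor's note: zipping a bfloat16's 16 payload bits above 16 zero bits
// reconstructs its exact binary32 value (bf16 is the upper half of an f32),
// so each MulAdd accumulates exact widened products. The lane order is
// interleaved ("reordered"), which order-insensitive reductions such as
// SumOfLanes tolerate.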
2872
2873// ------------------------------ Reductions
2874
2875namespace detail {
2876
2877// u32/i32/f32:
2878
2879template <typename T>
2880HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
 2881 const Vec256<T> v3210) {
2882 const Vec256<T> v1032 = Shuffle1032(v3210);
2883 const Vec256<T> v31_20_31_20 = v3210 + v1032;
2884 const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2885 return v20_31_20_31 + v31_20_31_20;
2886}
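// Worked example (editor's illustration): v3210 = {1, 2, 3, 4}:
//   v1032        = {3, 4, 1, 2}
//   v31_20_31_20 = {4, 6, 4, 6}
//   v20_31_20_31 = {6, 4, 6, 4}
//   result       = {10, 10, 10, 10}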
2887template <typename T>
2888HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
 2889 const Vec256<T> v3210) {
2890 const Vec256<T> v1032 = Shuffle1032(v3210);
2891 const Vec256<T> v31_20_31_20 = Min(v3210, v1032);
2892 const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2893 return Min(v20_31_20_31, v31_20_31_20);
2894}
2895template <typename T>
2896HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
 2897 const Vec256<T> v3210) {
2898 const Vec256<T> v1032 = Shuffle1032(v3210);
2899 const Vec256<T> v31_20_31_20 = Max(v3210, v1032);
2900 const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
2901 return Max(v20_31_20_31, v31_20_31_20);
2902}
2903
2904// u64/i64/f64:
2905
2906template <typename T>
2907HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
 2908 const Vec256<T> v10) {
2909 const Vec256<T> v01 = Shuffle01(v10);
2910 return v10 + v01;
2911}
2912template <typename T>
2913HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
 2914 const Vec256<T> v10) {
2915 const Vec256<T> v01 = Shuffle01(v10);
2916 return Min(v10, v01);
2917}
2918template <typename T>
2919HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
 2920 const Vec256<T> v10) {
2921 const Vec256<T> v01 = Shuffle01(v10);
2922 return Max(v10, v01);
2923}
2924
2925// u16/i16
2926template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2927HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
 2928 const Full256<uint32_t> d32;
 2929 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
2930 const auto odd = ShiftRight<16>(BitCast(d32, v));
2931 const auto min = MinOfLanes(d32, Min(even, odd));
2932 // Also broadcast into odd lanes.
2933 return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
2934}
2935template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2936HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
 2937 const Full256<uint32_t> d32;
 2938 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
 2939 const auto odd = ShiftRight<16>(BitCast(d32, v));
 2940 const auto max = MaxOfLanes(d32, Max(even, odd));
 2941 // Also broadcast into odd lanes.
 2942 return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
2943}
2944
2945} // namespace detail
2946
2947// Supported for u/i/f 32/64. Returns the same value in each lane.
2948template <typename T>
2949HWY_API Vec256<T> SumOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
 2950 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2951}
2952template <typename T>
2953HWY_API Vec256<T> MinOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
 2954 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2955}
2956template <typename T>
2957HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
 2958 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
2959}
2960
2961// ------------------------------ Lt128
2962
2963template <typename T>
2964HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b);
 2965
2966template <typename T>
2967HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b);
 2968
2969template <typename T>
2970HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b);
 2971
2972template <typename T>
2973HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b);
 2974
2975template <typename T>
2976HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b);
 2977
2978template <typename T>
2979HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b);
 2980
2981// NOLINTNEXTLINE(google-readability-namespace-comments)
2982} // namespace HWY_NAMESPACE
2983} // namespace hwy
2984HWY_AFTER_NAMESPACE();