x86_256-inl.h
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
16 // compiling for that target.
17 // External include guard in highway.h - see comment there.
18 
19 // WARNING: most operations do not cross 128-bit block boundaries. In
20 // particular, "Broadcast", pack and zip behavior may be surprising.
21 
22 #include <immintrin.h> // AVX2+
23 #if defined(_MSC_VER) && defined(__clang__)
24 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
25 // including these headers when _MSC_VER is defined, like when using clang-cl.
26 // Include these directly here.
27 #include <avxintrin.h>
28 // avxintrin defines __m256i and must come before avx2intrin.
29 #include <avx2intrin.h>
30 #include <bmi2intrin.h> // _pext_u64
31 #include <f16cintrin.h>
32 #include <fmaintrin.h>
33 #include <smmintrin.h>
34 #endif
35 
36 #include <stddef.h>
37 #include <stdint.h>
38 
39 // For half-width vectors. Already includes base.h and shared-inl.h.
40 #include "hwy/ops/x86_128-inl.h"
41 
42 HWY_BEFORE_NAMESPACE();
43 namespace hwy {
44 namespace HWY_NAMESPACE {
45 
46 template <typename T>
47 using Full256 = Simd<T, 32 / sizeof(T)>;
48 
49 namespace detail {
50 
51 template <typename T>
52 struct Raw256 {
53  using type = __m256i;
54 };
55 template <>
56 struct Raw256<float> {
57  using type = __m256;
58 };
59 template <>
60 struct Raw256<double> {
61  using type = __m256d;
62 };
63 
64 } // namespace detail
65 
66 template <typename T>
67 class Vec256 {
68  using Raw = typename detail::Raw256<T>::type;
69 
70  public:
71  // Compound assignment. Only usable if there is a corresponding non-member
72  // binary operator overload. For example, only f32 and f64 support division.
73  HWY_INLINE Vec256& operator*=(const Vec256 other) {
74  return *this = (*this * other);
75  }
76  HWY_INLINE Vec256& operator/=(const Vec256 other) {
77  return *this = (*this / other);
78  }
79  HWY_INLINE Vec256& operator+=(const Vec256 other) {
80  return *this = (*this + other);
81  }
82  HWY_INLINE Vec256& operator-=(const Vec256 other) {
83  return *this = (*this - other);
84  }
85  HWY_INLINE Vec256& operator&=(const Vec256 other) {
86  return *this = (*this & other);
87  }
88  HWY_INLINE Vec256& operator|=(const Vec256 other) {
89  return *this = (*this | other);
90  }
91  HWY_INLINE Vec256& operator^=(const Vec256 other) {
92  return *this = (*this ^ other);
93  }
94 
95  Raw raw;
96 };
97 
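// Example (illustrative sketch, not part of the original header): once the
// non-member operators below are defined, the compound assignments can be
// used like scalar code. Assumes f32 lanes, for which all of them exist:
//   const Full256<float> d;
//   Vec256<float> acc = Set(d, 1.0f);
//   acc += Set(d, 2.0f);  // forwards to operator+
//   acc *= Set(d, 0.5f);  // forwards to operator*; /= is f32/f64 only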
98 #if HWY_TARGET <= HWY_AVX3
99 
100 namespace detail {
101 
102 // Template arg: sizeof(lane type)
103 template <size_t size>
104 struct RawMask256 {};
105 template <>
106 struct RawMask256<1> {
107  using type = __mmask32;
108 };
109 template <>
110 struct RawMask256<2> {
111  using type = __mmask16;
112 };
113 template <>
114 struct RawMask256<4> {
115  using type = __mmask8;
116 };
117 template <>
118 struct RawMask256<8> {
119  using type = __mmask8;
120 };
121 
122 } // namespace detail
123 
124 template <typename T>
125 struct Mask256 {
126  using Raw = typename detail::RawMask256<sizeof(T)>::type;
127 
128  static Mask256<T> FromBits(uint64_t mask_bits) {
129  return Mask256<T>{static_cast<Raw>(mask_bits)};
130  }
131 
132  Raw raw;
133 };
134 
135 #else // AVX2
136 
137 // FF..FF or 0.
138 template <typename T>
139 struct Mask256 {
140  typename detail::Raw256<T>::type raw;
141 };
142 
143 #endif // HWY_TARGET <= HWY_AVX3
144 
145 // ------------------------------ BitCast
146 
147 namespace detail {
148 
149 HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
150 HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
151 HWY_INLINE __m256i BitCastToInteger(__m256d v) {
152  return _mm256_castpd_si256(v);
153 }
154 
155 template <typename T>
156 HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
157  return Vec256<uint8_t>{BitCastToInteger(v.raw)};
158 }
159 
160 // Cannot rely on function overloading because return types differ.
161 template <typename T>
162 struct BitCastFromInteger256 {
163  HWY_INLINE __m256i operator()(__m256i v) { return v; }
164 };
165 template <>
166 struct BitCastFromInteger256<float> {
167  HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
168 };
169 template <>
170 struct BitCastFromInteger256<double> {
171  HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
172 };
173 
174 template <typename T>
175 HWY_INLINE Vec256<T> BitCastFromByte(Full256<T> /* tag */, Vec256<uint8_t> v) {
176  return Vec256<T>{BitCastFromInteger256<T>()(v.raw)};
177 }
178 
179 } // namespace detail
180 
181 template <typename T, typename FromT>
182 HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
183  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
184 }
185 
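// Example (illustrative sketch, not part of the original header): BitCast
// reinterprets lane bits without conversion. Assuming the Set overloads below:
//   const Full256<float> df;
//   const Full256<uint32_t> du;
//   const Vec256<uint32_t> bits = BitCast(du, Set(df, 1.0f));
//   // every u32 lane now holds 0x3F800000, the IEEE-754 encoding of 1.0f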
186 // ------------------------------ Set
187 
188 // Returns an all-zero vector.
189 template <typename T>
190 HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
191  return Vec256<T>{_mm256_setzero_si256()};
192 }
193 HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
194  return Vec256<float>{_mm256_setzero_ps()};
195 }
196 HWY_API Vec256<double> Zero(Full256<double> /* tag */) {
197  return Vec256<double>{_mm256_setzero_pd()};
198 }
199 
200 // Returns a vector with all lanes set to "t".
201 HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
202  return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
203 }
204 HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
205  return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
206 }
207 HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
208  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
209 }
210 HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
211  return Vec256<uint64_t>{
212  _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
213 }
214 HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
215  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
216 }
217 HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
218  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
219 }
220 HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
221  return Vec256<int32_t>{_mm256_set1_epi32(t)};
222 }
223 HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
224  return Vec256<int64_t>{
225  _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
226 }
227 HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
228  return Vec256<float>{_mm256_set1_ps(t)};
229 }
230 HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
231  return Vec256<double>{_mm256_set1_pd(t)};
232 }
233 
234 HWY_DIAGNOSTICS(push)
235 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
236 
237 // Returns a vector with uninitialized elements.
238 template <typename T>
239 HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
240  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
241  // generate an XOR instruction.
242  return Vec256<T>{_mm256_undefined_si256()};
243 }
244 HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
245  return Vec256<float>{_mm256_undefined_ps()};
246 }
247 HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
248  return Vec256<double>{_mm256_undefined_pd()};
249 }
250 
251 HWY_DIAGNOSTICS(pop)
252 
253 // ================================================== LOGICAL
254 
255 // ------------------------------ And
256 
257 template <typename T>
258 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
259  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
260 }
261 
262 HWY_API Vec256<float> And(const Vec256<float> a, const Vec256<float> b) {
263  return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
264 }
265 HWY_API Vec256<double> And(const Vec256<double> a, const Vec256<double> b) {
266  return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
267 }
268 
269 // ------------------------------ AndNot
270 
271 // Returns ~not_mask & mask.
272 template <typename T>
273 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
274  return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
275 }
276 HWY_API Vec256<float> AndNot(const Vec256<float> not_mask,
277  const Vec256<float> mask) {
278  return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
279 }
280 HWY_API Vec256<double> AndNot(const Vec256<double> not_mask,
281  const Vec256<double> mask) {
282  return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
283 }
284 
285 // ------------------------------ Or
286 
287 template <typename T>
288 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
289  return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
290 }
291 
292 HWY_API Vec256<float> Or(const Vec256<float> a, const Vec256<float> b) {
293  return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
294 }
295 HWY_API Vec256<double> Or(const Vec256<double> a, const Vec256<double> b) {
296  return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
297 }
298 
299 // ------------------------------ Xor
300 
301 template <typename T>
302 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
303  return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
304 }
305 
306 HWY_API Vec256<float> Xor(const Vec256<float> a, const Vec256<float> b) {
307  return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
308 }
309 HWY_API Vec256<double> Xor(const Vec256<double> a, const Vec256<double> b) {
310  return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
311 }
312 
313 // ------------------------------ Not
314 
315 template <typename T>
316 HWY_API Vec256<T> Not(const Vec256<T> v) {
317  using TU = MakeUnsigned<T>;
318 #if HWY_TARGET <= HWY_AVX3
319  const __m256i vu = BitCast(Full256<TU>(), v).raw;
320  return BitCast(Full256<T>(),
321  Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
322 #else
323  return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
324 #endif
325 }
326 
327 // ------------------------------ Operator overloads (internal-only if float)
328 
329 template <typename T>
330 HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
331  return And(a, b);
332 }
333 
334 template <typename T>
335 HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
336  return Or(a, b);
337 }
338 
339 template <typename T>
340 HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
341  return Xor(a, b);
342 }
343 
344 // ------------------------------ PopulationCount
345 
346 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
347 #if HWY_TARGET == HWY_AVX3_DL
348 
349 #ifdef HWY_NATIVE_POPCNT
350 #undef HWY_NATIVE_POPCNT
351 #else
352 #define HWY_NATIVE_POPCNT
353 #endif
354 
355 namespace detail {
356 
357 template <typename T>
358 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
359  return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
360 }
361 template <typename T>
362 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
363  return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
364 }
365 template <typename T>
366 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
367  return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
368 }
369 template <typename T>
370 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
371  return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
372 }
373 
374 } // namespace detail
375 
376 template <typename T>
377 HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
378  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
379 }
380 
381 #endif // HWY_TARGET == HWY_AVX3_DL
382 
383 // ================================================== SIGN
384 
385 // ------------------------------ CopySign
386 
387 template <typename T>
388 HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
389  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
390 
391  const Full256<T> d;
392  const auto msb = SignBit(d);
393 
394 #if HWY_TARGET <= HWY_AVX3
395  const Rebind<MakeUnsigned<T>, decltype(d)> du;
396  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
397  // 0 0 0 | 0
398  // 0 0 1 | 0
399  // 0 1 0 | 1
400  // 0 1 1 | 1
401  // 1 0 0 | 0
402  // 1 0 1 | 1
403  // 1 1 0 | 0
404  // 1 1 1 | 1
405  // The lane size does not matter because we are not using predication.
406  const __m256i out = _mm256_ternarylogic_epi32(
407  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
408  return BitCast(d, decltype(Zero(du)){out});
409 #else
410  return Or(AndNot(msb, magn), And(msb, sign));
411 #endif
412 }
413 
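// Worked check of the 0xAC immediate above (illustrative note): ternarylogic
// indexes its 8-bit truth table by (msb << 2) | (magn << 1) | sign, and the
// desired output is msb ? sign : magn. Evaluating indices 0..7 gives the bits
// 0,0,1,1,0,1,0,1; read from index 7 down to 0 that is 0b10101100 = 0xAC.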
414 template <typename T>
415 HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
416 #if HWY_TARGET <= HWY_AVX3
417  // AVX3 can also handle abs < 0, so no extra action needed.
418  return CopySign(abs, sign);
419 #else
420  return Or(abs, And(SignBit(Full256<T>()), sign));
421 #endif
422 }
423 
424 // ================================================== MASK
425 
426 #if HWY_TARGET <= HWY_AVX3
427 
428 // ------------------------------ IfThenElse
429 
430 // Returns mask ? b : a.
431 
432 namespace detail {
433 
434 // Templates for signed/unsigned integer of a particular size.
435 template <typename T>
437  Vec256<T> yes, Vec256<T> no) {
438  return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
439 }
440 template <typename T>
442  Vec256<T> yes, Vec256<T> no) {
443  return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
444 }
445 template <typename T>
447  Vec256<T> yes, Vec256<T> no) {
448  return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
449 }
450 template <typename T>
452  Vec256<T> yes, Vec256<T> no) {
453  return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
454 }
455 
456 } // namespace detail
457 
458 template <typename T>
459 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
460  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
461 }
462 HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
463  Vec256<float> no) {
464  return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)};
465 }
466 HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
467  Vec256<double> no) {
468  return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)};
469 }
470 
471 namespace detail {
472 
473 template <typename T>
475  Vec256<T> yes) {
476  return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
477 }
478 template <typename T>
480  Vec256<T> yes) {
481  return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
482 }
483 template <typename T>
485  Vec256<T> yes) {
486  return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
487 }
488 template <typename T>
490  Vec256<T> yes) {
491  return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
492 }
493 
494 } // namespace detail
495 
496 template <typename T>
498  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
499 }
501  return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
502 }
504  Vec256<double> yes) {
505  return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
506 }
507 
508 namespace detail {
509 
510 template <typename T>
512  Vec256<T> no) {
513  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
514  return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
515 }
516 template <typename T>
518  Vec256<T> no) {
519  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
520 }
521 template <typename T>
523  Vec256<T> no) {
524  return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
525 }
526 template <typename T>
528  Vec256<T> no) {
529  return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
530 }
531 
532 } // namespace detail
533 
534 template <typename T>
536  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
537 }
539  return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
540 }
542  return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
543 }
544 
545 template <typename T, HWY_IF_FLOAT(T)>
547  // AVX3 MaskFromVec only looks at the MSB
548  return IfThenZeroElse(MaskFromVec(v), v);
549 }
550 
551 // ------------------------------ Mask logical
552 
553 namespace detail {
554 
555 template <typename T>
557  const Mask256<T> b) {
558 #if HWY_COMPILER_HAS_MASK_INTRINSICS
559  return Mask256<T>{_kand_mask32(a.raw, b.raw)};
560 #else
561  return Mask256<T>{a.raw & b.raw};
562 #endif
563 }
564 template <typename T>
566  const Mask256<T> b) {
567 #if HWY_COMPILER_HAS_MASK_INTRINSICS
568  return Mask256<T>{_kand_mask16(a.raw, b.raw)};
569 #else
570  return Mask256<T>{a.raw & b.raw};
571 #endif
572 }
573 template <typename T>
575  const Mask256<T> b) {
576 #if HWY_COMPILER_HAS_MASK_INTRINSICS
577  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
578 #else
579  return Mask256<T>{static_cast<uint16_t>(a.raw & b.raw)};
580 #endif
581 }
582 template <typename T>
584  const Mask256<T> b) {
585 #if HWY_COMPILER_HAS_MASK_INTRINSICS
586  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
587 #else
588  return Mask256<T>{static_cast<uint8_t>(a.raw & b.raw)};
589 #endif
590 }
591 
592 template <typename T>
594  const Mask256<T> b) {
595 #if HWY_COMPILER_HAS_MASK_INTRINSICS
596  return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
597 #else
598  return Mask256<T>{~a.raw & b.raw};
599 #endif
600 }
601 template <typename T>
603  const Mask256<T> b) {
604 #if HWY_COMPILER_HAS_MASK_INTRINSICS
605  return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
606 #else
607  return Mask256<T>{~a.raw & b.raw};
608 #endif
609 }
610 template <typename T>
612  const Mask256<T> b) {
613 #if HWY_COMPILER_HAS_MASK_INTRINSICS
614  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
615 #else
616  return Mask256<T>{static_cast<uint16_t>(~a.raw & b.raw)};
617 #endif
618 }
619 template <typename T>
621  const Mask256<T> b) {
622 #if HWY_COMPILER_HAS_MASK_INTRINSICS
623  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
624 #else
625  return Mask256<T>{static_cast<uint8_t>(~a.raw & b.raw)};
626 #endif
627 }
628 
629 template <typename T>
631  const Mask256<T> b) {
632 #if HWY_COMPILER_HAS_MASK_INTRINSICS
633  return Mask256<T>{_kor_mask32(a.raw, b.raw)};
634 #else
635  return Mask256<T>{a.raw | b.raw};
636 #endif
637 }
638 template <typename T>
640  const Mask256<T> b) {
641 #if HWY_COMPILER_HAS_MASK_INTRINSICS
642  return Mask256<T>{_kor_mask16(a.raw, b.raw)};
643 #else
644  return Mask256<T>{a.raw | b.raw};
645 #endif
646 }
647 template <typename T>
649  const Mask256<T> b) {
650 #if HWY_COMPILER_HAS_MASK_INTRINSICS
651  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
652 #else
653  return Mask256<T>{static_cast<uint16_t>(a.raw | b.raw)};
654 #endif
655 }
656 template <typename T>
658  const Mask256<T> b) {
659 #if HWY_COMPILER_HAS_MASK_INTRINSICS
660  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
661 #else
662  return Mask256<T>{static_cast<uint8_t>(a.raw | b.raw)};
663 #endif
664 }
665 
666 template <typename T>
668  const Mask256<T> b) {
669 #if HWY_COMPILER_HAS_MASK_INTRINSICS
670  return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
671 #else
672  return Mask256<T>{a.raw ^ b.raw};
673 #endif
674 }
675 template <typename T>
677  const Mask256<T> b) {
678 #if HWY_COMPILER_HAS_MASK_INTRINSICS
679  return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
680 #else
681  return Mask256<T>{a.raw ^ b.raw};
682 #endif
683 }
684 template <typename T>
686  const Mask256<T> b) {
687 #if HWY_COMPILER_HAS_MASK_INTRINSICS
688  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
689 #else
690  return Mask256<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
691 #endif
692 }
693 template <typename T>
695  const Mask256<T> b) {
696 #if HWY_COMPILER_HAS_MASK_INTRINSICS
697  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
698 #else
699  return Mask256<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
700 #endif
701 }
702 
703 } // namespace detail
704 
705 template <typename T>
707  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
708 }
709 
710 template <typename T>
712  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
713 }
714 
715 template <typename T>
717  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
718 }
719 
720 template <typename T>
722  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
723 }
724 
725 template <typename T>
727  // Flip only the valid bits.
728  constexpr size_t N = 32 / sizeof(T);
729  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
730 }
731 
732 #else // AVX2
733 
734 // ------------------------------ Mask
735 
736 // Mask and Vec are the same (true = FF..FF).
737 template <typename T>
738 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
739  return Mask256<T>{v.raw};
740 }
741 
742 template <typename T>
743 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
744  return Vec256<T>{v.raw};
745 }
746 
747 template <typename T>
748 HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
749  return Vec256<T>{v.raw};
750 }
751 
752 // ------------------------------ IfThenElse
753 
754 // mask ? yes : no
755 template <typename T>
756 HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
757  const Vec256<T> no) {
758  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
759 }
760 HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
761  const Vec256<float> yes,
762  const Vec256<float> no) {
763  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
764 }
765 HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
766  const Vec256<double> yes,
767  const Vec256<double> no) {
768  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
769 }
770 
771 // mask ? yes : 0
772 template <typename T>
773 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
774  return yes & VecFromMask(Full256<T>(), mask);
775 }
776 
777 // mask ? 0 : no
778 template <typename T>
779 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
780  return AndNot(VecFromMask(Full256<T>(), mask), no);
781 }
782 
783 template <typename T, HWY_IF_FLOAT(T)>
784 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
785  const auto zero = Zero(Full256<T>());
786  return IfThenElse(MaskFromVec(v), zero, v);
787 }
788 
789 // ------------------------------ Mask logical
790 
791 template <typename T>
792 HWY_API Mask256<T> Not(const Mask256<T> m) {
793  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
794 }
795 
796 template <typename T>
797 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
798  const Full256<T> d;
799  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
800 }
801 
802 template <typename T>
803 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
804  const Full256<T> d;
805  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
806 }
807 
808 template <typename T>
809 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
810  const Full256<T> d;
811  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
812 }
813 
814 template <typename T>
815 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
816  const Full256<T> d;
817  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
818 }
819 
820 #endif // HWY_TARGET <= HWY_AVX3
821 
822 // ================================================== COMPARE
823 
824 #if HWY_TARGET <= HWY_AVX3
825 
826 // Comparisons set a mask bit to 1 if the condition is true, else 0.
827 
828 template <typename TFrom, typename TTo>
830  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
831  return Mask256<TTo>{m.raw};
832 }
833 
834 namespace detail {
835 
836 template <typename T>
838  const Vec256<T> bit) {
839  return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
840 }
841 template <typename T>
843  const Vec256<T> bit) {
844  return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
845 }
846 template <typename T>
848  const Vec256<T> bit) {
849  return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
850 }
851 template <typename T>
853  const Vec256<T> bit) {
854  return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
855 }
856 
857 } // namespace detail
858 
859 template <typename T>
861  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
862  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
863 }
864 
865 // ------------------------------ Equality
866 
867 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
869  return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
870 }
871 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
872 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
873  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
874 }
875 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
876 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
877  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
878 }
879 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
880 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
881  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
882 }
883 
885  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
886 }
887 
889  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
890 }
891 
892 // ------------------------------ Inequality
893 
894 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
896  return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
897 }
898 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
899 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
900  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
901 }
902 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
903 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
904  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
905 }
906 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
907 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
908  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
909 }
910 
912  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
913 }
914 
916  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
917 }
918 
919 // ------------------------------ Strict inequality
920 
921 // Signed/float <
923  return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
924 }
926  return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
927 }
929  return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
930 }
932  return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
933 }
935  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
936 }
938  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
939 }
940 
941 // ------------------------------ Weak inequality
942 
944  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
945 }
947  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
948 }
949 
950 // ------------------------------ Mask
951 
952 namespace detail {
953 
954 template <typename T>
956  return Mask256<T>{_mm256_movepi8_mask(v.raw)};
957 }
958 template <typename T>
960  return Mask256<T>{_mm256_movepi16_mask(v.raw)};
961 }
962 template <typename T>
964  return Mask256<T>{_mm256_movepi32_mask(v.raw)};
965 }
966 template <typename T>
968  return Mask256<T>{_mm256_movepi64_mask(v.raw)};
969 }
970 
971 } // namespace detail
972 
973 template <typename T>
975  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
976 }
977 // There do not seem to be native floating-point versions of these instructions.
978 HWY_API Mask256<float> MaskFromVec(const Vec256<float> v) {
979  return Mask256<float>{MaskFromVec(BitCast(Full256<int32_t>(), v)).raw};
980 }
981 HWY_API Mask256<double> MaskFromVec(const Vec256<double> v) {
982  return Mask256<double>{MaskFromVec(BitCast(Full256<int64_t>(), v)).raw};
983 }
984 
985 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
987  return Vec256<T>{_mm256_movm_epi8(v.raw)};
988 }
989 
990 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
991 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
992  return Vec256<T>{_mm256_movm_epi16(v.raw)};
993 }
994 
995 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
996 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
997  return Vec256<T>{_mm256_movm_epi32(v.raw)};
998 }
999 
1000 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1001 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1002  return Vec256<T>{_mm256_movm_epi64(v.raw)};
1003 }
1004 
1006  return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
1007 }
1008 
1010  return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
1011 }
1012 
1013 template <typename T>
1015  return VecFromMask(v);
1016 }
1017 
1018 #else // AVX2
1019 
1020 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1021 
1022 template <typename TFrom, typename TTo>
1023 HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
1024  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1025  return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
1026 }
1027 
1028 template <typename T>
1029 HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1030  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1031  return (v & bit) == bit;
1032 }
1033 
1034 // ------------------------------ Equality
1035 
1036 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1037 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1038  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1039 }
1040 
1041 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1042 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1043  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1044 }
1045 
1046 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1047 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1048  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1049 }
1050 
1051 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1052 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1053  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1054 }
1055 
1056 HWY_API Mask256<float> operator==(const Vec256<float> a,
1057  const Vec256<float> b) {
1058  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1059 }
1060 
1061 HWY_API Mask256<double> operator==(const Vec256<double> a,
1062  const Vec256<double> b) {
1063  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1064 }
1065 
1066 // ------------------------------ Inequality
1067 
1068 template <typename T, HWY_IF_NOT_FLOAT(T)>
1069 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1070  return Not(a == b);
1071 }
1072 
1073 HWY_API Mask256<float> operator!=(const Vec256<float> a,
1074  const Vec256<float> b) {
1075  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1076 }
1077 HWY_API Mask256<double> operator!=(const Vec256<double> a,
1078  const Vec256<double> b) {
1079  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1080 }
1081 
1082 // ------------------------------ Strict inequality
1083 
1084 // Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
1085 // to perform an unsigned comparison instead of the intended signed. Workaround
1086 // is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
1087 #if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1088 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1089 #else
1090 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1091 #endif
1092 
1093 // Signed/float <
1094 HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
1095 #if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1096  using i8x32 = signed char __attribute__((__vector_size__(32)));
1097  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
1098  reinterpret_cast<i8x32>(b.raw))};
1099 #else
1100  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1101 #endif
1102 }
1103 HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
1104  const Vec256<int16_t> b) {
1105  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1106 }
1107 HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
1108  const Vec256<int32_t> b) {
1109  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1110 }
1111 HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
1112  const Vec256<int64_t> b) {
1113  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1114 }
1115 HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
1116  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1117 }
1118 HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1119  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1120 }
1121 
1122 // ------------------------------ Weak inequality
1123 
1124 HWY_API Mask256<float> operator>=(const Vec256<float> a,
1125  const Vec256<float> b) {
1126  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1127 }
1128 HWY_API Mask256<double> operator>=(const Vec256<double> a,
1129  const Vec256<double> b) {
1130  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1131 }
1132 
1133 #endif // HWY_TARGET <= HWY_AVX3
1134 
1135 // ------------------------------ Reversed comparisons
1136 
1137 template <typename T>
1139  return b > a;
1140 }
1141 
1142 template <typename T>
1144  return b >= a;
1145 }
1146 
1147 // ------------------------------ Min (Gt, IfThenElse)
1148 
1149 // Unsigned
1151  return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1152 }
1154  const Vec256<uint16_t> b) {
1155  return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1156 }
1158  const Vec256<uint32_t> b) {
1159  return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1160 }
1162  const Vec256<uint64_t> b) {
1163 #if HWY_TARGET <= HWY_AVX3
1164  return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1165 #else
1166  const Full256<uint64_t> du;
1167  const Full256<int64_t> di;
1168  const auto msb = Set(du, 1ull << 63);
1169  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1170  return IfThenElse(gt, b, a);
1171 #endif
1172 }
1173 
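// Note on the AVX2 fallback above (illustrative): XOR-ing both operands with
// the MSB maps unsigned order onto signed order (it subtracts 2^63 mod 2^64),
// so a signed > on the flipped values matches an unsigned > on the originals.
// E.g. 0xFFFF...FF flips to 0x7FFF...FF, the largest signed value, as required.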
1174 // Signed
1176  return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1177 }
1179  return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1180 }
1182  return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1183 }
1185 #if HWY_TARGET <= HWY_AVX3
1186  return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1187 #else
1188  return IfThenElse(a < b, a, b);
1189 #endif
1190 }
1191 
1192 // Float
1194  return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1195 }
1197  return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
1198 }
1199 
1200 // ------------------------------ Max (Gt, IfThenElse)
1201 
1202 // Unsigned
1204  return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1205 }
1207  const Vec256<uint16_t> b) {
1208  return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1209 }
1211  const Vec256<uint32_t> b) {
1212  return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1213 }
1215  const Vec256<uint64_t> b) {
1216 #if HWY_TARGET <= HWY_AVX3
1217  return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1218 #else
1219  const Full256<uint64_t> du;
1220  const Full256<int64_t> di;
1221  const auto msb = Set(du, 1ull << 63);
1222  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1223  return IfThenElse(gt, a, b);
1224 #endif
1225 }
1226 
1227 // Signed
1229  return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1230 }
1232  return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1233 }
1235  return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1236 }
1238 #if HWY_TARGET <= HWY_AVX3
1239  return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1240 #else
1241  return IfThenElse(a < b, b, a);
1242 #endif
1243 }
1244 
1245 // Float
1247  return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1248 }
1250  return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
1251 }
1252 
1253 // ------------------------------ FirstN (Iota, Lt)
1254 
1255 template <typename T>
1256 HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
1257 #if HWY_TARGET <= HWY_AVX3
1258  (void)d;
1259 #if HWY_ARCH_X86_64
1260  return Mask256<T>::FromBits(_bzhi_u64(~0ull, n));
1261 #else
1262  return Mask256<T>::FromBits(_bzhi_u32(~0u, static_cast<uint32_t>(n)));
1263 #endif // HWY_ARCH_X86_64
1264 #else
1265  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1266  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
1267 #endif
1268 }
1269 
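// Example (illustrative sketch, not part of the original header): FirstN is
// typically used to guard a remainder of `count` lanes. Assumes an aligned
// float array `in` with 8 readable elements:
//   const Full256<float> d;
//   const auto m = FirstN(d, count);               // lanes [0, count) active
//   const auto v = IfThenElseZero(m, Load(d, in)); // inactive lanes become 0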
1270 // ================================================== ARITHMETIC
1271 
1272 // ------------------------------ Addition
1273 
1274 // Unsigned
1276  const Vec256<uint8_t> b) {
1277  return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1278 }
1280  const Vec256<uint16_t> b) {
1281  return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
1282 }
1284  const Vec256<uint32_t> b) {
1285  return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
1286 }
1288  const Vec256<uint64_t> b) {
1289  return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)};
1290 }
1291 
1292 // Signed
1294  const Vec256<int8_t> b) {
1295  return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1296 }
1298  const Vec256<int16_t> b) {
1299  return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1300 }
1302  const Vec256<int32_t> b) {
1303  return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1304 }
1306  const Vec256<int64_t> b) {
1307  return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
1308 }
1309 
1310 // Float
1312  return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1313 }
1315  const Vec256<double> b) {
1316  return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
1317 }
1318 
1319 // ------------------------------ Subtraction
1320 
1321 // Unsigned
1323  const Vec256<uint8_t> b) {
1324  return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1325 }
1327  const Vec256<uint16_t> b) {
1328  return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1329 }
1331  const Vec256<uint32_t> b) {
1332  return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1333 }
1335  const Vec256<uint64_t> b) {
1336  return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1337 }
1338 
1339 // Signed
1341  const Vec256<int8_t> b) {
1342  return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1343 }
1345  const Vec256<int16_t> b) {
1346  return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1347 }
1349  const Vec256<int32_t> b) {
1350  return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1351 }
1353  const Vec256<int64_t> b) {
1354  return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1355 }
1356 
1357 // Float
1359  return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1360 }
1362  const Vec256<double> b) {
1363  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
1364 }
1365 
1366 // ------------------------------ Saturating addition
1367 
1368 // Returns a + b clamped to the destination range.
1369 
1370 // Unsigned
1372  const Vec256<uint8_t> b) {
1373  return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
1374 }
1376  const Vec256<uint16_t> b) {
1377  return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
1378 }
1379 
1380 // Signed
1382  const Vec256<int8_t> b) {
1383  return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
1384 }
1386  const Vec256<int16_t> b) {
1387  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1388 }
1389 
1390 // ------------------------------ Saturating subtraction
1391 
1392 // Returns a - b clamped to the destination range.
1393 
1394 // Unsigned
1396  const Vec256<uint8_t> b) {
1397  return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
1398 }
1400  const Vec256<uint16_t> b) {
1401  return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
1402 }
1403 
1404 // Signed
1406  const Vec256<int8_t> b) {
1407  return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
1408 }
1410  const Vec256<int16_t> b) {
1411  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
1412 }
1413 
1414 // ------------------------------ Average
1415 
1416 // Returns (a + b + 1) / 2
1417 
1418 // Unsigned
1420  const Vec256<uint8_t> b) {
1421  return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
1422 }
1424  const Vec256<uint16_t> b) {
1425  return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
1426 }
1427 
1428 // ------------------------------ Abs (Sub)
1429 
1430 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1432 #if HWY_COMPILER_MSVC
1433  // Workaround for incorrect codegen? (wrong result)
1434  const auto zero = Zero(Full256<int8_t>());
1435  return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
1436 #else
1437  return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
1438 #endif
1439 }
1441  return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
1442 }
1444  return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
1445 }
1446 // i64 is implemented after BroadcastSignBit.
1447 
1449  const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
1450  return v & BitCast(Full256<float>(), mask);
1451 }
1453  const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
1454  return v & BitCast(Full256<double>(), mask);
1455 }
1456 
1457 // ------------------------------ Integer multiplication
1458 
1459 // Unsigned
1461  const Vec256<uint16_t> b) {
1462  return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1463 }
1465  const Vec256<uint32_t> b) {
1466  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1467 }
1468 
1469 // Signed
1471  const Vec256<int16_t> b) {
1472  return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1473 }
1475  const Vec256<int32_t> b) {
1476  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1477 }
1478 
1479 // Returns the upper 16 bits of a * b in each lane.
1481  const Vec256<uint16_t> b) {
1482  return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
1483 }
1485  const Vec256<int16_t> b) {
1486  return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
1487 }
1488 
1489 // Multiplies even lanes (0, 2, ...) and places the lower half of the
1490 // double-wide result in the even lane and the upper half in its odd neighbor.
1492  const Vec256<int32_t> b) {
1493  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
1494 }
1496  const Vec256<uint32_t> b) {
1497  return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
1498 }
1499 
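// Example (illustrative sketch, not part of the original header): MulEven is a
// widening 32x32 -> 64-bit multiply of the even lanes.
//   const Full256<uint32_t> d32;
//   const Vec256<uint64_t> p = MulEven(Set(d32, 3u), Set(d32, 5u));
//   // every u64 lane holds 15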
1500 // ------------------------------ ShiftLeft
1501 
1502 template <int kBits>
1504  return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
1505 }
1506 
1507 template <int kBits>
1509  return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
1510 }
1511 
1512 template <int kBits>
1514  return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
1515 }
1516 
1517 template <int kBits>
1519  return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
1520 }
1521 
1522 template <int kBits>
1524  return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
1525 }
1526 
1527 template <int kBits>
1529  return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
1530 }
1531 
1532 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
1534  const Full256<T> d8;
1535  const RepartitionToWide<decltype(d8)> d16;
1536  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
1537  return kBits == 1
1538  ? (v + v)
1539  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1540 }
1541 
1542 // ------------------------------ ShiftRight
1543 
1544 template <int kBits>
1546  return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
1547 }
1548 
1549 template <int kBits>
1551  return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
1552 }
1553 
1554 template <int kBits>
1556  return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
1557 }
1558 
1559 template <int kBits>
1561  const Full256<uint8_t> d8;
1562  // Use raw instead of BitCast to support N=1.
1563  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
1564  return shifted & Set(d8, 0xFF >> kBits);
1565 }
1566 
1567 template <int kBits>
1569  return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
1570 }
1571 
1572 template <int kBits>
1574  return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
1575 }
1576 
1577 template <int kBits>
1579  const Full256<int8_t> di;
1580  const Full256<uint8_t> du;
1581  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1582  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1583  return (shifted ^ shifted_sign) - shifted_sign;
1584 }
1585 
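// Worked example of the sign-fixup trick above (illustrative): for kBits=1 and
// an i8 lane holding -128 (0x80), the unsigned shift yields 0x40 and
// shifted_sign is also 0x40, so (0x40 ^ 0x40) - 0x40 = -64 (0xC0), matching an
// arithmetic shift right by 1.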
1586 // i64 is implemented after BroadcastSignBit.
1587 
1588 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1589 
1591  return VecFromMask(v < Zero(Full256<int8_t>()));
1592 }
1593 
1595  return ShiftRight<15>(v);
1596 }
1597 
1599  return ShiftRight<31>(v);
1600 }
1601 
1603 #if HWY_TARGET == HWY_AVX2
1604  return VecFromMask(v < Zero(Full256<int64_t>()));
1605 #else
1606  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
1607 #endif
1608 }
1609 
1610 template <int kBits>
1612 #if HWY_TARGET <= HWY_AVX3
1613  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, kBits)};
1614 #else
1615  const Full256<int64_t> di;
1616  const Full256<uint64_t> du;
1617  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1618  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
1619  return right | sign;
1620 #endif
1621 }
1622 
1624 #if HWY_TARGET <= HWY_AVX3
1625  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
1626 #else
1627  const auto zero = Zero(Full256<int64_t>());
1628  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1629 #endif
1630 }
1631 
1632 // ------------------------------ ShiftLeftSame
1633 
1635  const int bits) {
1636  return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1637 }
1639  const int bits) {
1640  return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1641 }
1643  const int bits) {
1644  return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1645 }
1646 
1648  return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1649 }
1650 
1652  return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1653 }
1654 
1656  return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1657 }
1658 
1659 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1660 HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
1661  const Full256<T> d8;
1662  const RepartitionToWide<decltype(d8)> d16;
1663  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
1664  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
1665 }
1666 
1667 // ------------------------------ ShiftRightSame (BroadcastSignBit)
1668 
1670  const int bits) {
1671  return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1672 }
1674  const int bits) {
1675  return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1676 }
1678  const int bits) {
1679  return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1680 }
1681 
1683  const Full256<uint8_t> d8;
1684  const RepartitionToWide<decltype(d8)> d16;
1685  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1686  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1687 }
1688 
1690  const int bits) {
1691  return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1692 }
1693 
1695  const int bits) {
1696  return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1697 }
1699  const int bits) {
1700 #if HWY_TARGET <= HWY_AVX3
1701  return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1702 #else
1703  const Full256<int64_t> di;
1704  const Full256<uint64_t> du;
1705  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1706  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
1707  return right | sign;
1708 #endif
1709 }
1710 
1712  const Full256<int8_t> di;
1713  const Full256<uint8_t> du;
1714  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1715  const auto shifted_sign =
1716  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1717  return (shifted ^ shifted_sign) - shifted_sign;
1718 }
1719 
1720 // ------------------------------ Neg (Xor, Sub)
1721 
1722 template <typename T, HWY_IF_FLOAT(T)>
1724  return Xor(v, SignBit(Full256<T>()));
1725 }
1726 
1727 template <typename T, HWY_IF_NOT_FLOAT(T)>
1728 HWY_API Vec256<T> Neg(const Vec256<T> v) {
1729  return Zero(Full256<T>()) - v;
1730 }
1731 
1732 // ------------------------------ Floating-point mul / div
1733 
1735  return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
1736 }
1738  const Vec256<double> b) {
1739  return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
1740 }
1741 
1743  return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
1744 }
1746  const Vec256<double> b) {
1747  return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
1748 }
1749 
1750 // Approximate reciprocal
1752  return Vec256<float>{_mm256_rcp_ps(v.raw)};
1753 }
1754 
1755 // Absolute value of difference.
1757  return Abs(a - b);
1758 }
1759 
1760 // ------------------------------ Floating-point multiply-add variants
1761 
1762 // Returns mul * x + add
1764  const Vec256<float> add) {
1765 #ifdef HWY_DISABLE_BMI2_FMA
1766  return mul * x + add;
1767 #else
1768  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
1769 #endif
1770 }
1772  const Vec256<double> add) {
1773 #ifdef HWY_DISABLE_BMI2_FMA
1774  return mul * x + add;
1775 #else
1776  return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
1777 #endif
1778 }
1779 
1780 // Returns add - mul * x
1782  const Vec256<float> add) {
1783 #ifdef HWY_DISABLE_BMI2_FMA
1784  return add - mul * x;
1785 #else
1786  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
1787 #endif
1788 }
1790  const Vec256<double> x,
1791  const Vec256<double> add) {
1792 #ifdef HWY_DISABLE_BMI2_FMA
1793  return add - mul * x;
1794 #else
1795  return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
1796 #endif
1797 }
1798 
1799 // Returns mul * x - sub
1801  const Vec256<float> sub) {
1802 #ifdef HWY_DISABLE_BMI2_FMA
1803  return mul * x - sub;
1804 #else
1805  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
1806 #endif
1807 }
1809  const Vec256<double> sub) {
1810 #ifdef HWY_DISABLE_BMI2_FMA
1811  return mul * x - sub;
1812 #else
1813  return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
1814 #endif
1815 }
1816 
1817 // Returns -mul * x - sub
1819  const Vec256<float> sub) {
1820 #ifdef HWY_DISABLE_BMI2_FMA
1821  return Neg(mul * x) - sub;
1822 #else
1823  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1824 #endif
1825 }
1827  const Vec256<double> x,
1828  const Vec256<double> sub) {
1829 #ifdef HWY_DISABLE_BMI2_FMA
1830  return Neg(mul * x) - sub;
1831 #else
1832  return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1833 #endif
1834 }
1835 
1836 // ------------------------------ Floating-point square root
1837 
1838 // Full precision square root
1840  return Vec256<float>{_mm256_sqrt_ps(v.raw)};
1841 }
1843  return Vec256<double>{_mm256_sqrt_pd(v.raw)};
1844 }
1845 
1846 // Approximate reciprocal square root
1848  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
1849 }
1850 
1851 // ------------------------------ Floating-point rounding
1852 
1853 // Toward nearest integer, tie to even
1855  return Vec256<float>{
1856  _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1857 }
1859  return Vec256<double>{
1860  _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1861 }
1862 
1863 // Toward zero, aka truncate
1865  return Vec256<float>{
1866  _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1867 }
1869  return Vec256<double>{
1870  _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1871 }
1872 
1873 // Toward +infinity, aka ceiling
1875  return Vec256<float>{
1876  _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1877 }
1879  return Vec256<double>{
1880  _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1881 }
1882 
1883 // Toward -infinity, aka floor
1885  return Vec256<float>{
1886  _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1887 }
1889  return Vec256<double>{
1890  _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1891 }
1892 
1893 // ================================================== MEMORY
1894 
1895 // ------------------------------ Load
1896 
1897 template <typename T>
1898 HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
1899  return Vec256<T>{
1900  _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
1901 }
1903  const float* HWY_RESTRICT aligned) {
1904  return Vec256<float>{_mm256_load_ps(aligned)};
1905 }
1907  const double* HWY_RESTRICT aligned) {
1908  return Vec256<double>{_mm256_load_pd(aligned)};
1909 }
1910 
1911 template <typename T>
1913  return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
1914 }
1916  const float* HWY_RESTRICT p) {
1917  return Vec256<float>{_mm256_loadu_ps(p)};
1918 }
1920  const double* HWY_RESTRICT p) {
1921  return Vec256<double>{_mm256_loadu_pd(p)};
1922 }
1923 
1924 // ------------------------------ MaskedLoad
1925 
1926 #if HWY_TARGET <= HWY_AVX3
1927 
1928 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1930  const T* HWY_RESTRICT aligned) {
1931  return Vec256<T>{_mm256_maskz_load_epi32(m.raw, aligned)};
1932 }
1933 
1934 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1935 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
1936  const T* HWY_RESTRICT aligned) {
1937  return Vec256<T>{_mm256_maskz_load_epi64(m.raw, aligned)};
1938 }
1939 
1941  const float* HWY_RESTRICT aligned) {
1942  return Vec256<float>{_mm256_maskz_load_ps(m.raw, aligned)};
1943 }
1944 
1946  const double* HWY_RESTRICT aligned) {
1947  return Vec256<double>{_mm256_maskz_load_pd(m.raw, aligned)};
1948 }
1949 
1950 // There is no load_epi8/16, so use loadu instead.
1951 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1952 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
1953  const T* HWY_RESTRICT aligned) {
1954  return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, aligned)};
1955 }
1956 
1957 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1958 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
1959  const T* HWY_RESTRICT aligned) {
1960  return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, aligned)};
1961 }
1962 
1963 #endif // else: fallback defined in x86_128-inl.h
1964 
1965 // ------------------------------ LoadDup128
1966 
1967 // Loads 128 bits and duplicates them into both 128-bit halves. This avoids the
1968 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
1969 template <typename T>
1971 #if HWY_LOADDUP_ASM
1972  __m256i out;
1973  asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
1974  return Vec256<T>{out};
1975 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
1976  // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
1977  // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
1978  // upper half undefined) is fine because we're overwriting that anyway.
1979  const __m128i v128 = LoadU(Full128<T>(), p).raw;
1980  return Vec256<T>{
1981  _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
1982 #else
1983  return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
1984 #endif
1985 }
1987  const float* const HWY_RESTRICT p) {
1988 #if HWY_LOADDUP_ASM
1989  __m256 out;
1990  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
1991  return Vec256<float>{out};
1992 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
1993  const __m128 v128 = LoadU(Full128<float>(), p).raw;
1994  return Vec256<float>{
1995  _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
1996 #else
1997  return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
1998 #endif
1999 }
2001  const double* const HWY_RESTRICT p) {
2002 #if HWY_LOADDUP_ASM
2003  __m256d out;
2004  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
2005  return Vec256<double>{out};
2006 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2007  const __m128d v128 = LoadU(Full128<double>(), p).raw;
2008  return Vec256<double>{
2009  _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2010 #else
2011  return Vec256<double>{
2012  _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
2013 #endif
2014 }
2015 
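// Example (illustrative sketch, not part of the original header): broadcasting
// a 16-byte table into both halves, e.g. as input for per-block shuffles:
//   alignas(16) static constexpr uint8_t kIdentity[16] = {
//       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//   const Full256<uint8_t> d;
//   const Vec256<uint8_t> rep = LoadDup128(d, kIdentity);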
2016 // ------------------------------ Store
2017 
2018 template <typename T>
2019 HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
2020  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2021 }
2022 HWY_API void Store(const Vec256<float> v, Full256<float> /* tag */,
2023  float* HWY_RESTRICT aligned) {
2024  _mm256_store_ps(aligned, v.raw);
2025 }
2027  double* HWY_RESTRICT aligned) {
2028  _mm256_store_pd(aligned, v.raw);
2029 }
2030 
2031 template <typename T>
2032 HWY_API void StoreU(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT p) {
2033  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
2034 }
2035 HWY_API void StoreU(const Vec256<float> v, Full256<float> /* tag */,
2036  float* HWY_RESTRICT p) {
2037  _mm256_storeu_ps(p, v.raw);
2038 }
2040  double* HWY_RESTRICT p) {
2041  _mm256_storeu_pd(p, v.raw);
2042 }
2043 
2044 // ------------------------------ Non-temporal stores
2045 
2046 template <typename T>
2048  T* HWY_RESTRICT aligned) {
2049  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2050 }
2051 HWY_API void Stream(const Vec256<float> v, Full256<float> /* tag */,
2052  float* HWY_RESTRICT aligned) {
2053  _mm256_stream_ps(aligned, v.raw);
2054 }
2056  double* HWY_RESTRICT aligned) {
2057  _mm256_stream_pd(aligned, v.raw);
2058 }
2059 
2060 // ------------------------------ Scatter
2061 
2062 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2063 HWY_DIAGNOSTICS(push)
2064 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2065 
2066 #if HWY_TARGET <= HWY_AVX3
2067 namespace detail {
2068 
2069 template <typename T>
2071  Full256<T> /* tag */, T* HWY_RESTRICT base,
2072  const Vec256<int32_t> offset) {
2073  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
2074 }
2075 template <typename T>
2077  Full256<T> /* tag */, T* HWY_RESTRICT base,
2078  const Vec256<int32_t> index) {
2079  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
2080 }
2081 
2082 template <typename T>
2084  Full256<T> /* tag */, T* HWY_RESTRICT base,
2085  const Vec256<int64_t> offset) {
2086  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
2087 }
2088 template <typename T>
2090  Full256<T> /* tag */, T* HWY_RESTRICT base,
2091  const Vec256<int64_t> index) {
2092  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
2093 }
2094 
2095 } // namespace detail
2096 
2097 template <typename T, typename Offset>
2099  const Vec256<Offset> offset) {
2100  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2101  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2102 }
2103 template <typename T, typename Index>
2104 HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2105  const Vec256<Index> index) {
2106  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2107  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2108 }
2109 
2111  float* HWY_RESTRICT base,
2112  const Vec256<int32_t> offset) {
2113  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
2114 }
2116  float* HWY_RESTRICT base,
2117  const Vec256<int32_t> index) {
2118  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
2119 }
2120 
2122  double* HWY_RESTRICT base,
2123  const Vec256<int64_t> offset) {
2124  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
2125 }
2127  double* HWY_RESTRICT base,
2128  const Vec256<int64_t> index) {
2129  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
2130 }
2131 
2132 #else
2133 
2134 template <typename T, typename Offset>
2135 HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2136  const Vec256<Offset> offset) {
2137  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2138 
2139  constexpr size_t N = 32 / sizeof(T);
2140  alignas(32) T lanes[N];
2141  Store(v, d, lanes);
2142 
2143  alignas(32) Offset offset_lanes[N];
2144  Store(offset, Simd<Offset, N>(), offset_lanes);
2145 
2146  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2147  for (size_t i = 0; i < N; ++i) {
2148  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2149  }
2150 }
2151 
2152 template <typename T, typename Index>
2153 HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2154  const Vec256<Index> index) {
2155  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2156 
2157  constexpr size_t N = 32 / sizeof(T);
2158  alignas(32) T lanes[N];
2159  Store(v, d, lanes);
2160 
2161  alignas(32) Index index_lanes[N];
2162  Store(index, Simd<Index, N>(), index_lanes);
2163 
2164  for (size_t i = 0; i < N; ++i) {
2165  base[index_lanes[i]] = lanes[i];
2166  }
2167 }
2168 
2169 #endif
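// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical names; assumes <hwy/highway.h> and HWY_NAMESPACE).
// ScatterIndex writes lane i of v to base[index[i]]; the index lane type must
// have the same width as T, as enforced by the static_assert above.
namespace usage_sketch {
inline void ScatterExample(Vec256<int32_t> v, int32_t* HWY_RESTRICT base,
                           Vec256<int32_t> indices) {
  const Full256<int32_t> d;
  ScatterIndex(v, d, base, indices);
}
}  // namespace usage_sketch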
2170 
2171 // ------------------------------ Gather
2172 
2173 namespace detail {
2174 
2175 template <typename T>
2177  Full256<T> /* tag */,
2178  const T* HWY_RESTRICT base,
2179  const Vec256<int32_t> offset) {
2180  return Vec256<T>{_mm256_i32gather_epi32(
2181  reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
2182 }
2183 template <typename T>
2185  Full256<T> /* tag */,
2186  const T* HWY_RESTRICT base,
2187  const Vec256<int32_t> index) {
2188  return Vec256<T>{_mm256_i32gather_epi32(
2189  reinterpret_cast<const int32_t*>(base), index.raw, 4)};
2190 }
2191 
2192 template <typename T>
2194  Full256<T> /* tag */,
2195  const T* HWY_RESTRICT base,
2196  const Vec256<int64_t> offset) {
2197  return Vec256<T>{_mm256_i64gather_epi64(
2198  reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
2199 }
2200 template <typename T>
2202  Full256<T> /* tag */,
2203  const T* HWY_RESTRICT base,
2204  const Vec256<int64_t> index) {
2205  return Vec256<T>{_mm256_i64gather_epi64(
2206  reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
2207 }
2208 
2209 } // namespace detail
2210 
2211 template <typename T, typename Offset>
2212 HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
2213  const Vec256<Offset> offset) {
2214  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2215  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2216 }
2217 template <typename T, typename Index>
2218 HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
2219  const Vec256<Index> index) {
2220  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2221  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2222 }
2223 
2225  const float* HWY_RESTRICT base,
2226  const Vec256<int32_t> offset) {
2227  return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
2228 }
2230  const float* HWY_RESTRICT base,
2231  const Vec256<int32_t> index) {
2232  return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
2233 }
2234 
2236  const double* HWY_RESTRICT base,
2237  const Vec256<int64_t> offset) {
2238  return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
2239 }
2241  const double* HWY_RESTRICT base,
2242  const Vec256<int64_t> index) {
2243  return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
2244 }
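// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical names; assumes <hwy/highway.h> and HWY_NAMESPACE).
// GatherIndex loads table[index[i]] into lane i; GatherOffset instead takes
// byte offsets, which suits indexing into arrays of structs.
namespace usage_sketch {
inline Vec256<float> GatherExample(const float* HWY_RESTRICT table,
                                   Vec256<int32_t> indices) {
  return GatherIndex(Full256<float>(), table, indices);
}
}  // namespace usage_sketch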
2245 
2246 HWY_DIAGNOSTICS(pop)
2247 
2248 // ================================================== SWIZZLE
2249 
2250 // ------------------------------ LowerHalf
2251 
2252 template <typename T>
2253 HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
2254  return Vec128<T>{_mm256_castsi256_si128(v.raw)};
2255 }
2257  return Vec128<float>{_mm256_castps256_ps128(v.raw)};
2258 }
2260  return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
2261 }
2262 
2263 template <typename T>
2265  return LowerHalf(Full128<T>(), v);
2266 }
2267 
2268 // ------------------------------ UpperHalf
2269 
2270 template <typename T>
2271 HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
2272  return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
2273 }
2275  return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)};
2276 }
2278  return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)};
2279 }
2280 
2281 // ------------------------------ GetLane (LowerHalf)
2282 template <typename T>
2283 HWY_API T GetLane(const Vec256<T> v) {
2284  return GetLane(LowerHalf(v));
2285 }
2286 
2287 // ------------------------------ ZeroExtendVector
2288 
2289 // Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
2290 // bits undefined. Although it makes sense for them to be zero (VEX encoded
2291 // 128-bit instructions zero the upper lanes to avoid large penalties), a
2292 // compiler could decide to optimize out code that relies on this.
2293 //
2294 // The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
2295 // zeroing, but it is not available on MSVC nor GCC until 10.1. For older GCC,
2296 // we can still obtain the desired code thanks to pattern recognition; note that
2297 // the expensive insert instruction is not actually generated, see
2298 // https://gcc.godbolt.org/z/1MKGaP.
2299 
2300 template <typename T>
2301 HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
2302 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2303  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2304 #else
2305  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2306 #endif
2307 }
2309  Vec128<float> lo) {
2310 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2311  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
2312 #else
2313  return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
2314 #endif
2315 }
2317  Vec128<double> lo) {
2318 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2319  return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
2320 #else
2321  return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
2322 #endif
2323 }
2324 
2325 // ------------------------------ Combine
2326 
2327 template <typename T>
2328 HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
2329  const auto lo256 = ZeroExtendVector(d, lo);
2330  return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2331 }
2333  Vec128<float> lo) {
2334  const auto lo256 = ZeroExtendVector(d, lo);
2335  return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
2336 }
2338  Vec128<double> lo) {
2339  const auto lo256 = ZeroExtendVector(d, lo);
2340  return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
2341 }
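// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical names; assumes <hwy/highway.h> and HWY_NAMESPACE).
// Shows the two ways of building a 256-bit vector from 128-bit halves: a
// guaranteed-zero upper block via ZeroExtendVector, or two halves via Combine.
namespace usage_sketch {
inline Vec256<float> CombineExample(Vec128<float> hi, Vec128<float> lo) {
  const Full256<float> d;
  const Vec256<float> lo_only = ZeroExtendVector(d, lo);  // upper 128 bits = 0
  (void)lo_only;
  return Combine(d, hi, lo);  // hi in the upper block, lo in the lower block
}
}  // namespace usage_sketch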
2342 
2343 // ------------------------------ ShiftLeftBytes
2344 
2345 template <int kBytes, typename T>
2346 HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
2347  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2348  // This is the same operation as _mm256_bslli_epi128.
2349  return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
2350 }
2351 
2352 template <int kBytes, typename T>
2354  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
2355 }
2356 
2357 // ------------------------------ ShiftLeftLanes
2358 
2359 template <int kLanes, typename T>
2360 HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
2361  const Repartition<uint8_t, decltype(d)> d8;
2362  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2363 }
2364 
2365 template <int kLanes, typename T>
2367  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
2368 }
2369 
2370 // ------------------------------ ShiftRightBytes
2371 
2372 template <int kBytes, typename T>
2373 HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
2374  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2375  // This is the same operation as _mm256_bsrli_epi128.
2376  return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
2377 }
2378 
2379 // ------------------------------ ShiftRightLanes
2380 template <int kLanes, typename T>
2381 HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
2382  const Repartition<uint8_t, decltype(d)> d8;
2383  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2384 }
2385 
2386 // ------------------------------ CombineShiftRightBytes
2387 
2388 // Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
2389 template <int kBytes, typename T, class V = Vec256<T>>
2390 HWY_API V CombineShiftRightBytes(Full256<T> d, V hi, V lo) {
2391  const Repartition<uint8_t, decltype(d)> d8;
2392  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
2393  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2394 }
2395 
2396 // ------------------------------ Broadcast/splat any lane
2397 
2398 // Unsigned
2399 template <int kLane>
2400 HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
2401  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2402  if (kLane < 4) {
2403  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2404  return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
2405  } else {
2406  const __m256i hi =
2407  _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2408  return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
2409  }
2410 }
2411 template <int kLane>
2413  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2414  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2415 }
2416 template <int kLane>
2418  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2419  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2420 }
2421 
2422 // Signed
2423 template <int kLane>
2425  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2426  if (kLane < 4) {
2427  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2428  return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
2429  } else {
2430  const __m256i hi =
2431  _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2432  return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
2433  }
2434 }
2435 template <int kLane>
2437  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2438  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2439 }
2440 template <int kLane>
2442  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2443  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2444 }
2445 
2446 // Float
2447 template <int kLane>
2449  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2450  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
2451 }
2452 template <int kLane>
2454  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2455  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
2456 }
2457 
2458 // ------------------------------ Hard-coded shuffles
2459 
2460 // Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2461 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2462 // right (the previous least-significant lane is now most-significant =>
2463 // 47650321). These could also be implemented via CombineShiftRightBytes but
2464 // the shuffle_abcd notation is more convenient.
2465 
2466 // Swap 32-bit halves in 64-bit halves.
2467 HWY_API Vec256<uint32_t> Shuffle2301(const Vec256<uint32_t> v) {
2468  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2469 }
2471  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2472 }
2474  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
2475 }
2476 
2477 // Swap 64-bit halves
2478 HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
2479  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2480 }
2482  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2483 }
2485  // Shorter encoding than _mm256_permute_ps.
2486  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
2487 }
2488 HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
2489  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2490 }
2492  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2493 }
2495  // Shorter encoding than _mm256_permute_pd.
2496  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
2497 }
2498 
2499 // Rotate right 32 bits
2500 HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
2501  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2502 }
2504  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2505 }
2507  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
2508 }
2509 // Rotate left 32 bits
2510 HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
2511  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2512 }
2514  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2515 }
2517  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
2518 }
2519 
2520 // Reverse
2521 HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
2522  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2523 }
2525  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2526 }
2528  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
2529 }
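// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE).
// With lanes numbered 7..0 as in the notation above, each 128-bit block is
// permuted independently: Shuffle0321 rotates the four lanes of a block one
// position toward lane 0.
namespace usage_sketch {
inline Vec256<uint32_t> RotateBlocksRight(Vec256<uint32_t> v) {
  // Input lanes 7,6,5,4,3,2,1,0 become 4,7,6,5,0,3,2,1 (the "47650321" above).
  return Shuffle0321(v);
}
}  // namespace usage_sketch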
2530 
2531 // ------------------------------ TableLookupLanes
2532 
2533 // Returned by SetTableIndices for use by TableLookupLanes.
2534 template <typename T>
2535 struct Indices256 {
2536  __m256i raw;
2537 };
2538 
2539 template <typename T>
2540 HWY_API Indices256<T> SetTableIndices(const Full256<T>, const int32_t* idx) {
2541 #if HWY_IS_DEBUG_BUILD
2542  const size_t N = 32 / sizeof(T);
2543  for (size_t i = 0; i < N; ++i) {
2544  HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
2545  }
2546 #endif
2547  return Indices256<T>{LoadU(Full256<int32_t>(), idx).raw};
2548 }
2549 
2550 HWY_API Vec256<uint32_t> TableLookupLanes(const Vec256<uint32_t> v,
2551  const Indices256<uint32_t> idx) {
2552  return Vec256<uint32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2553 }
2555  const Indices256<int32_t> idx) {
2556  return Vec256<int32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2557 }
2559  const Indices256<float> idx) {
2560  return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
2561 }
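// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE).
// Unlike the block-local shuffles above, TableLookupLanes may move any of the
// eight 32-bit lanes anywhere; result lane i is v[idx[i]].
namespace usage_sketch {
inline Vec256<float> RotateLanesDown1(Vec256<float> v) {
  alignas(32) static constexpr int32_t kIdx[8] = {1, 2, 3, 4, 5, 6, 7, 0};
  return TableLookupLanes(v, SetTableIndices(Full256<float>(), kIdx));
}
}  // namespace usage_sketch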
2562 
2563 // ------------------------------ Reverse
2564 
2565 template <typename T>
2566 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
2567  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2568  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2569 }
2570 
2571 // ------------------------------ InterleaveLower
2572 
2573 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2574 // the least-significant lane) and "b". To concatenate two half-width integers
2575 // into one, use ZipLower/Upper instead (also works with scalar).
2576 
2578  const Vec256<uint8_t> b) {
2579  return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
2580 }
2582  const Vec256<uint16_t> b) {
2583  return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
2584 }
2586  const Vec256<uint32_t> b) {
2587  return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
2588 }
2590  const Vec256<uint64_t> b) {
2591  return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
2592 }
2593 
2595  const Vec256<int8_t> b) {
2596  return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
2597 }
2599  const Vec256<int16_t> b) {
2600  return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
2601 }
2603  const Vec256<int32_t> b) {
2604  return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
2605 }
2607  const Vec256<int64_t> b) {
2608  return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
2609 }
2610 
2612  const Vec256<float> b) {
2613  return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
2614 }
2616  const Vec256<double> b) {
2617  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
2618 }
2619 
2620 // Additional overload for the optional Simd<> tag.
2621 template <typename T, class V = Vec256<T>>
2622 HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
2623  return InterleaveLower(a, b);
2624 }
2625 
2626 // ------------------------------ InterleaveUpper
2627 
2628 // All functions inside detail lack the required D parameter.
2629 namespace detail {
2630 
2632  const Vec256<uint8_t> b) {
2633  return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
2634 }
2636  const Vec256<uint16_t> b) {
2637  return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
2638 }
2640  const Vec256<uint32_t> b) {
2641  return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
2642 }
2644  const Vec256<uint64_t> b) {
2645  return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
2646 }
2647 
2649  const Vec256<int8_t> b) {
2650  return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
2651 }
2653  const Vec256<int16_t> b) {
2654  return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
2655 }
2657  const Vec256<int32_t> b) {
2658  return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
2659 }
2661  const Vec256<int64_t> b) {
2662  return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
2663 }
2664 
2666  const Vec256<float> b) {
2667  return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
2668 }
2670  const Vec256<double> b) {
2671  return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)};
2672 }
2673 
2674 } // namespace detail
2675 
2676 template <typename T, class V = Vec256<T>>
2677 HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
2678  return detail::InterleaveUpper(a, b);
2679 }
2680 
2681 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2682 
2683 // Same as Interleave*, except that the return lanes are double-width integers;
2684 // this is necessary because the single-lane scalar cannot return two values.
2685 template <typename T, typename TW = MakeWide<T>>
2686 HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
2687  return BitCast(Full256<TW>(), InterleaveLower(Full256<T>(), a, b));
2688 }
2689 template <typename T, typename TW = MakeWide<T>>
2690 HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
2691  return BitCast(dw, InterleaveLower(Full256<T>(), a, b));
2692 }
2693 
2694 template <typename T, typename TW = MakeWide<T>>
2695 HWY_API Vec256<TW> ZipUpper(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
2696  return BitCast(dw, InterleaveUpper(Full256<T>(), a, b));
2697 }
2698 
2699 // ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
2700 
2701 // _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is
2702 // slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on
2703 // UpperUpper at the cost of one extra cycle/instruction.
2704 
2705 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2706 template <typename T>
2707 HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
2708  const Vec256<T> lo) {
2709  const Half<decltype(d)> d2;
2710  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
2711 }
2713  const Vec256<float> lo) {
2714  const Half<decltype(d)> d2;
2715  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
2716 }
2718  const Vec256<double> hi,
2719  const Vec256<double> lo) {
2720  const Half<decltype(d)> d2;
2721  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
2722 }
2723 
2724 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2725 template <typename T>
2726 HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
2727  const Vec256<T> lo) {
2728  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
2729 }
2731  const Vec256<float> hi,
2732  const Vec256<float> lo) {
2733  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
2734 }
2736  const Vec256<double> hi,
2737  const Vec256<double> lo) {
2738  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
2739 }
2740 
2741 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2742 template <typename T>
2743 HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
2744  const Vec256<T> lo) {
2745  return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
2746 }
2748  const Vec256<float> hi,
2749  const Vec256<float> lo) {
2750  return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
2751 }
2753  const Vec256<double> hi,
2754  const Vec256<double> lo) {
2755  return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
2756 }
2757 
2758 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2759 template <typename T>
2760 HWY_API Vec256<T> ConcatUpperUpper(Full256<T> d, const Vec256<T> hi,
2761  const Vec256<T> lo) {
2762  const Half<decltype(d)> d2;
2763  return ConcatUpperLower(d, hi, ZeroExtendVector(d, UpperHalf(d2, lo)));
2764 }
2765 
2766 // ------------------------------ ConcatOdd
2767 
2768 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2769 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
2770  const RebindToUnsigned<decltype(d)> du;
2771 #if HWY_TARGET <= HWY_AVX3
2772  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2773  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
2774  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2775  BitCast(du, hi).raw)});
2776 #else
2777  const RebindToFloat<decltype(d)> df;
2778  const Vec256<float> v3131{_mm256_shuffle_ps(
2779  BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
2780  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
2781  _MM_SHUFFLE(3, 1, 2, 0))};
2782 #endif
2783 }
2784 
2786  Vec256<float> lo) {
2787  const RebindToUnsigned<decltype(d)> du;
2788 #if HWY_TARGET <= HWY_AVX3
2789  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2790  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2791  __mmask8{0xFF}, hi.raw)};
2792 #else
2793  const Vec256<float> v3131{
2794  _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
2795  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
2796  BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
2797 #endif
2798 }
2799 
2800 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2801 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
2802  const RebindToUnsigned<decltype(d)> du;
2803 #if HWY_TARGET <= HWY_AVX3
2804  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
2805  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
2806  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2807  BitCast(du, hi).raw)});
2808 #else
2809  const RebindToFloat<decltype(d)> df;
2810  const Vec256<double> v31{
2811  _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
2812  return Vec256<T>{
2813  _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
2814 #endif
2815 }
2816 
2818  Vec256<double> lo) {
2819 #if HWY_TARGET <= HWY_AVX3
2820  const RebindToUnsigned<decltype(d)> du;
2821  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
2822  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2823  __mmask8{0xFF}, hi.raw)};
2824 #else
2825  (void)d;
2826  const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
2827  return Vec256<double>{
2828  _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
2829 #endif
2830 }
2831 
2832 // ------------------------------ ConcatEven
2833 
2834 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2835 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
2836  const RebindToUnsigned<decltype(d)> du;
2837 #if HWY_TARGET <= HWY_AVX3
2838  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2839  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
2840  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2841  BitCast(du, hi).raw)});
2842 #else
2843  const RebindToFloat<decltype(d)> df;
2844  const Vec256<float> v2020{_mm256_shuffle_ps(
2845  BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
2846  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
2847  _MM_SHUFFLE(3, 1, 2, 0))};
2848 
2849 #endif
2850 }
2851 
2853  Vec256<float> lo) {
2854  const RebindToUnsigned<decltype(d)> du;
2855 #if HWY_TARGET <= HWY_AVX3
2856  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2857  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2858  __mmask8{0xFF}, hi.raw)};
2859 #else
2860  const Vec256<float> v2020{
2861  _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
2862  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
2863  BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
2864 
2865 #endif
2866 }
2867 
2868 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2869 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
2870  const RebindToUnsigned<decltype(d)> du;
2871 #if HWY_TARGET <= HWY_AVX3
2872  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
2873  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
2874  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2875  BitCast(du, hi).raw)});
2876 #else
2877  const RebindToFloat<decltype(d)> df;
2878  const Vec256<double> v20{
2879  _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
2880  return Vec256<T>{
2881  _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
2882 
2883 #endif
2884 }
2885 
2887  Vec256<double> lo) {
2888 #if HWY_TARGET <= HWY_AVX3
2889  const RebindToUnsigned<decltype(d)> du;
2890  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
2891  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2892  __mmask8{0xFF}, hi.raw)};
2893 #else
2894  (void)d;
2895  const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
2896  return Vec256<double>{
2897  _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
2898 #endif
2899 }
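// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical names; assumes <hwy/highway.h> and HWY_NAMESPACE).
// ConcatEven/ConcatOdd de-interleave two vectors: for data stored as pairs
// {re0, im0, re1, im1, ...}, ConcatEven collects the re lanes of lo then hi,
// and ConcatOdd collects the im lanes.
namespace usage_sketch {
inline void Deinterleave(Vec256<float> lo_pairs, Vec256<float> hi_pairs,
                         Vec256<float>& re, Vec256<float>& im) {
  const Full256<float> d;
  re = ConcatEven(d, hi_pairs, lo_pairs);
  im = ConcatOdd(d, hi_pairs, lo_pairs);
}
}  // namespace usage_sketch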
2900 
2901 // ------------------------------ OddEven
2902 
2903 namespace detail {
2904 
2905 template <typename T>
2907  const Vec256<T> b) {
2908  const Full256<T> d;
2909  const Full256<uint8_t> d8;
2910  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2911  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2912  return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
2913 }
2914 template <typename T>
2916  const Vec256<T> b) {
2917  return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
2918 }
2919 template <typename T>
2921  const Vec256<T> b) {
2922  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
2923 }
2924 template <typename T>
2926  const Vec256<T> b) {
2927  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
2928 }
2929 
2930 } // namespace detail
2931 
2932 template <typename T>
2933 HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
2934  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
2935 }
2937  return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
2938 }
2939 
2941  return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
2942 }
2943 
2944 // ------------------------------ TableLookupBytes (ZeroExtendVector)
2945 
2946 // Both full
2947 template <typename T, typename TI>
2948 HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
2949  const Vec256<TI> from) {
2950  return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
2951 }
2952 
2953 // Partial index vector
2954 template <typename T, typename TI, size_t NI>
2956  const Vec128<TI, NI> from) {
2957  // First expand to full 128, then 256.
2958  const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
2959  const auto tbl_full = TableLookupBytes(bytes, from_256);
2960  // Shrink to 128, then partial.
2961  return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
2962 }
2963 
2964 // Partial table vector
2965 template <typename T, size_t N, typename TI>
2967  const Vec256<TI> from) {
2968  // First expand to full 128, then 256.
2969  const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
2970  return TableLookupBytes(bytes_256, from);
2971 }
2972 
2973 // Partial both are handled by x86_128.
2974 
2975 // ------------------------------ Shl (Mul, ZipLower)
2976 
2977 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
2978 namespace detail {
2979 
2980 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
2981 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2982 HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
2983  const Full256<T> d;
2984  const RepartitionToWide<decltype(d)> dw;
2985  const Rebind<float, decltype(dw)> df;
2986  const auto zero = Zero(d);
2987  // Move into exponent (this u16 will become the upper half of an f32)
2988  const auto exp = ShiftLeft<23 - 16>(v);
2989  const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
2990  // Insert 0 into lower halves for reinterpreting as binary32.
2991  const auto f0 = ZipLower(dw, zero, upper);
2992  const auto f1 = ZipUpper(dw, zero, upper);
2993  // Do not use ConvertTo because it checks for overflow, which is redundant
2994  // because we only care about v in [0, 16).
2995  const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
2996  const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
2997  return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
2998 }
2999 
3000 } // namespace detail
3001 #endif // HWY_TARGET > HWY_AVX3
3002 
3003 HWY_API Vec256<uint16_t> operator<<(const Vec256<uint16_t> v,
3004  const Vec256<uint16_t> bits) {
3005 #if HWY_TARGET <= HWY_AVX3
3006  return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
3007 #else
3008  return v * detail::Pow2(bits);
3009 #endif
3010 }
3011 
3013  const Vec256<uint32_t> bits) {
3014  return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
3015 }
3016 
3018  const Vec256<uint64_t> bits) {
3019  return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
3020 }
3021 
3022 // Signed left shift is the same as unsigned.
3023 template <typename T, HWY_IF_SIGNED(T)>
3024 HWY_API Vec256<T> operator<<(const Vec256<T> v, const Vec256<T> bits) {
3025  const Full256<T> di;
3026  const Full256<MakeUnsigned<T>> du;
3027  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
3028 }
3029 
3030 // ------------------------------ Shr (MulHigh, IfThenElse, Not)
3031 
3032 HWY_API Vec256<uint16_t> operator>>(const Vec256<uint16_t> v,
3033  const Vec256<uint16_t> bits) {
3034 #if HWY_TARGET <= HWY_AVX3
3035  return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
3036 #else
3037  const Full256<uint16_t> d;
3038  // For bits=0, we cannot mul by 2^16, so fix the result later.
3039  const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
3040  // Replace output with input where bits == 0.
3041  return IfThenElse(bits == Zero(d), v, out);
3042 #endif
3043 }
3044 
3046  const Vec256<uint32_t> bits) {
3047  return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
3048 }
3049 
3051  const Vec256<uint64_t> bits) {
3052  return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
3053 }
3054 
3056  const Vec256<int16_t> bits) {
3057 #if HWY_TARGET <= HWY_AVX3
3058  return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
3059 #else
3060  return detail::SignedShr(Full256<int16_t>(), v, bits);
3061 #endif
3062 }
3063 
3065  const Vec256<int32_t> bits) {
3066  return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
3067 }
3068 
3070  const Vec256<int64_t> bits) {
3071 #if HWY_TARGET <= HWY_AVX3
3072  return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
3073 #else
3074  return detail::SignedShr(Full256<int64_t>(), v, bits);
3075 #endif
3076 }
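// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE).
// Per-lane variable shifts on 16-bit lanes: on AVX3 these map to sllv/srlv,
// on AVX2 they are emulated via the Pow2/MulHigh trick shown above. Shift
// counts must be in [0, 16) per lane.
namespace usage_sketch {
inline Vec256<uint16_t> MaskTopBits(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
  // Shifting left then right by the same per-lane amount clears the top
  // bits[i] bits of lane i.
  return (v << bits) >> bits;
}
}  // namespace usage_sketch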
3077 
3078 HWY_API Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
3079  const Vec256<uint64_t> b) {
3080  const DFromV<decltype(a)> du64;
3081  const RepartitionToNarrow<decltype(du64)> du32;
3082  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3083  const auto a32 = BitCast(du32, a);
3084  const auto b32 = BitCast(du32, b);
3085  // Inputs for MulEven: we only need the lower 32 bits
3086  const auto aH = Shuffle2301(a32);
3087  const auto bH = Shuffle2301(b32);
3088 
3089  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
3090  // the even (lower 64 bits of every 128-bit block) results. See
3091  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt.
3092  const auto aLbL = MulEven(a32, b32);
3093  const auto w3 = aLbL & maskL;
3094 
3095  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3096  const auto w2 = t2 & maskL;
3097  const auto w1 = ShiftRight<32>(t2);
3098 
3099  const auto t = MulEven(a32, bH) + w2;
3100  const auto k = ShiftRight<32>(t);
3101 
3102  const auto mulH = MulEven(aH, bH) + w1 + k;
3103  const auto mulL = ShiftLeft<32>(t) + w3;
3104  return InterleaveLower(mulL, mulH);
3105 }
3106 
3107 HWY_API Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
3108  const Vec256<uint64_t> b) {
3109  const DFromV<decltype(a)> du64;
3110  const RepartitionToNarrow<decltype(du64)> du32;
3111  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3112  const auto a32 = BitCast(du32, a);
3113  const auto b32 = BitCast(du32, b);
3114  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3115  const auto aH = Shuffle2301(a32);
3116  const auto bH = Shuffle2301(b32);
3117 
3118  // Same as above, but we're using the odd results (upper 64 bits per block).
3119  const auto aLbL = MulEven(a32, b32);
3120  const auto w3 = aLbL & maskL;
3121 
3122  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3123  const auto w2 = t2 & maskL;
3124  const auto w1 = ShiftRight<32>(t2);
3125 
3126  const auto t = MulEven(a32, bH) + w2;
3127  const auto k = ShiftRight<32>(t);
3128 
3129  const auto mulH = MulEven(aH, bH) + w1 + k;
3130  const auto mulL = ShiftLeft<32>(t) + w3;
3131  return InterleaveUpper(du64, mulL, mulH);
3132 }
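// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical names; assumes <hwy/highway.h> and HWY_NAMESPACE).
// MulEven/MulOdd return the full 64x64 -> 128-bit products: within each
// 128-bit block of the result, lane 0 holds the low 64 bits and lane 1 the
// high 64 bits of the product of the block's even (resp. odd) input lanes.
namespace usage_sketch {
inline void FullProducts(Vec256<uint64_t> a, Vec256<uint64_t> b,
                         Vec256<uint64_t>& even_products,
                         Vec256<uint64_t>& odd_products) {
  even_products = MulEven(a, b);
  odd_products = MulOdd(a, b);
}
}  // namespace usage_sketch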
3133 
3134 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3135 
3136 HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
3137  Vec256<bfloat16_t> a,
3138  Vec256<bfloat16_t> b,
3139  const Vec256<float> sum0,
3140  Vec256<float>& sum1) {
3141  // TODO(janwas): _mm256_dpbf16_ps when available
3142  const Repartition<uint16_t, decltype(df32)> du16;
3143  const RebindToUnsigned<decltype(df32)> du32;
3144  const Vec256<uint16_t> zero = Zero(du16);
3145  // Lane order within sum0/1 is undefined, hence we can avoid the
3146  // longer-latency lane-crossing PromoteTo.
3147  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
3148  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3149  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
3150  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3151  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3152  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3153 }
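// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE).
// bf16 dot-product accumulation: because the lane order within sum0/sum1 is
// unspecified, the two partial sums are only combined at the end (and would
// typically be reduced across lanes afterwards).
namespace usage_sketch {
inline Vec256<float> Bf16Dot(Vec256<bfloat16_t> a, Vec256<bfloat16_t> b) {
  const Full256<float> df32;
  Vec256<float> sum1 = Zero(df32);
  const Vec256<float> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  return sum0 + sum1;
}
}  // namespace usage_sketch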
3154 
3155 // ================================================== CONVERT
3156 
3157 // ------------------------------ Promotions (part w/ narrow lanes -> full)
3158 
3159 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3160  const Vec128<float, 4> v) {
3161  return Vec256<double>{_mm256_cvtps_pd(v.raw)};
3162 }
3163 
3165  const Vec128<int32_t, 4> v) {
3166  return Vec256<double>{_mm256_cvtepi32_pd(v.raw)};
3167 }
3168 
3169 // Unsigned: zero-extend.
3170 // Note: these have 3 cycle latency; if inputs are already split across the
3171 // 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
3173  Vec128<uint8_t> v) {
3174  return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
3175 }
3177  Vec128<uint8_t, 8> v) {
3178  return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)};
3179 }
3181  Vec128<uint8_t> v) {
3182  return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
3183 }
3185  Vec128<uint8_t, 8> v) {
3186  return Vec256<int32_t>{_mm256_cvtepu8_epi32(v.raw)};
3187 }
3189  Vec128<uint16_t> v) {
3190  return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
3191 }
3193  Vec128<uint16_t> v) {
3194  return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
3195 }
3197  Vec128<uint32_t> v) {
3198  return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)};
3199 }
3200 
3201 // Signed: replicate sign bit.
3202 // Note: these have 3 cycle latency; if inputs are already split across the
3203 // 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
3204 // signed shift would be faster.
3206  Vec128<int8_t> v) {
3207  return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
3208 }
3210  Vec128<int8_t, 8> v) {
3211  return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)};
3212 }
3214  Vec128<int16_t> v) {
3215  return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};
3216 }
3218  Vec128<int32_t> v) {
3219  return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)};
3220 }
3221 
3222 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3223 
3224 HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
3225  const Vec256<int32_t> v) {
3226  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
3227  // Concatenating lower halves of both 128-bit blocks afterward is more
3228  // efficient than an extra input with low block = high block of v.
3229  return Vec128<uint16_t>{
3230  _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
3231 }
3232 
3234  const Vec256<int32_t> v) {
3235  const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
3236  return Vec128<int16_t>{
3237  _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
3238 }
3239 
3241  const Vec256<int32_t> v) {
3242  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
3243  // Concatenate lower 64 bits of each 128-bit block
3244  const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
3245  const __m128i u16 = _mm256_castsi256_si128(u16_concat);
3246  // packus treats the input as signed; we want unsigned. Clear the MSB to get
3247  // unsigned saturation to u8.
3248  const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
3249  return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
3250 }
3251 
3253  const Vec256<int16_t> v) {
3254  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
3255  return Vec128<uint8_t>{
3256  _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
3257 }
3258 
3260  const Vec256<int32_t> v) {
3261  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
3262  // Concatenate lower 64 bits of each 128-bit block
3263  const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
3264  const __m128i i16 = _mm256_castsi256_si128(i16_concat);
3265  return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
3266 }
3267 
3269  const Vec256<int16_t> v) {
3270  const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
3271  return Vec128<int8_t>{
3272  _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
3273 }
3274 
3275  // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
3276  // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
3277 HWY_DIAGNOSTICS(push)
3278 HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
3279 
3280 HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> df16,
3281  const Vec256<float> v) {
3282 #ifdef HWY_DISABLE_F16C
3283  const RebindToUnsigned<decltype(df16)> du16;
3284  const Rebind<uint32_t, decltype(df16)> du;
3285  const RebindToSigned<decltype(du)> di;
3286  const auto bits32 = BitCast(du, v);
3287  const auto sign = ShiftRight<31>(bits32);
3288  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3289  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3290 
3291  const auto k15 = Set(di, 15);
3292  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3293  const auto is_tiny = exp < Set(di, -24);
3294 
3295  const auto is_subnormal = exp < Set(di, -14);
3296  const auto biased_exp16 =
3297  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3298  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3299  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3300  (mantissa32 >> (Set(du, 13) + sub_exp));
3301  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3302  ShiftRight<13>(mantissa32)); // <1024
3303 
3304  const auto sign16 = ShiftLeft<15>(sign);
3305  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3306  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3307  return BitCast(df16, DemoteTo(du16, bits16));
3308 #else
3309  (void)df16;
3310  return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3311 #endif
3312 }
3313 
3314 HWY_DIAGNOSTICS(pop)
3315 
3316 HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
3317  const Vec256<float> v) {
3318  // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
3319  const Rebind<int32_t, decltype(dbf16)> di32;
3320  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3321  const Rebind<uint16_t, decltype(dbf16)> du16;
3322  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3323  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3324 }
3325 
3326 HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
3327  Vec256<float> a, Vec256<float> b) {
3328  // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
3329  const RebindToUnsigned<decltype(dbf16)> du16;
3330  const Repartition<uint32_t, decltype(dbf16)> du32;
3331  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
3332  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3333 }
3334 
3336  const Vec256<double> v) {
3337  return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
3338 }
3339 
3341  const Vec256<double> v) {
3342  const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
3343  return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
3344 }
3345 
3346 // For already range-limited input [0, 255].
3347 HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
3348  const Full256<uint32_t> d32;
3349  alignas(32) static constexpr uint32_t k8From32[8] = {
3350  0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
3351  // Place first four bytes in lo[0], remaining 4 in hi[1].
3352  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
3353  // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
3354  const auto lo = LowerHalf(quad);
3355  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
3356  const auto pair = LowerHalf(lo | hi);
3357  return BitCast(Simd<uint8_t, 8>(), pair);
3358 }
3359 
3360 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
3361 
3362 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
3363  const Vec256<int32_t> v) {
3364  return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
3365 }
3366 
3367 HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
3368 #if HWY_TARGET <= HWY_AVX3
3369  (void)dd;
3370  return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
3371 #else
3372  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
3373  const Repartition<uint32_t, decltype(dd)> d32;
3374  const Repartition<uint64_t, decltype(dd)> d64;
3375 
3376  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
3377  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
3378  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
3379 
3380  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
3381  const auto k52 = Set(d32, 0x43300000);
3382  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
3383 
3384  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
3385  return (v_upper - k84_63_52) + v_lower; // order matters!
3386 #endif
3387 }
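// --- Editor's note: illustrative scalar model, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> for CopyBytes). It spells
// out the AVX2 branch above: the high 32 bits are embedded in a double with
// exponent 2^84 (offset by 2^63 to absorb the sign), the low 32 bits in a
// double with exponent 2^52; subtracting the combined bias 2^84 + 2^63 + 2^52
// and then adding the low part reconstructs the value, in the same order as
// the vector code.
namespace usage_sketch {
inline double Int64ToDoubleModel(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  const uint64_t hi_bits = 0x4530000080000000ULL ^ (u >> 32);
  const uint64_t lo_bits = 0x4330000000000000ULL | (u & 0xFFFFFFFFULL);
  const uint64_t bias_bits = 0x4530000080100000ULL;  // 2^84 + 2^63 + 2^52
  double hi, lo, bias;
  CopyBytes<8>(&hi_bits, &hi);
  CopyBytes<8>(&lo_bits, &lo);
  CopyBytes<8>(&bias_bits, &bias);
  return (hi - bias) + lo;
}
}  // namespace usage_sketch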
3388 
3389 // Truncates (rounds toward zero).
3390 HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
3391  return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
3392 }
3393 
3394 HWY_API Vec256<int64_t> ConvertTo(Full256<int64_t> di, const Vec256<double> v) {
3395 #if HWY_TARGET <= HWY_AVX3
3396  return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
3397 #else
3398  using VI = decltype(Zero(di));
3399  const VI k0 = Zero(di);
3400  const VI k1 = Set(di, 1);
3401  const VI k51 = Set(di, 51);
3402 
3403  // Exponent indicates whether the number can be represented as int64_t.
3404  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
3405  const VI exp = biased_exp - Set(di, 0x3FF);
3406  const auto in_range = exp < Set(di, 63);
3407 
3408  // If we were to cap the exponent at 51 and add 2^52, the number would be in
3409  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
3410  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
3411  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
3412  // manually shift the mantissa into place (we already have many of the
3413  // inputs anyway).
3414  const VI shift_mnt = Max(k51 - exp, k0);
3415  const VI shift_int = Max(exp - k51, k0);
3416  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
3417  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
3418  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
3419  // For inputs larger than 2^52, insert zeros at the bottom.
3420  const VI shifted = int52 << shift_int;
3421  // Restore the one bit lost when shifting in the implicit 1-bit.
3422  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
3423 
3424  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
3425  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
3426  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
3427  const VI magnitude = IfThenElse(in_range, restored, limit);
3428 
3429  // If the input was negative, negate the integer (two's complement).
3430  return (magnitude ^ sign_mask) - sign_mask;
3431 #endif
3432 }
3433 
3434 HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
3435  const Full256<int32_t> di;
3436  return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
3437 }
3438 
3439 
3440 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
3441  const Vec128<float16_t> v) {
3442 #ifdef HWY_DISABLE_F16C
3443  const RebindToSigned<decltype(df32)> di32;
3444  const RebindToUnsigned<decltype(df32)> du32;
3445  // Expand to u32 so we can shift.
3446  const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
3447  const auto sign = ShiftRight<15>(bits16);
3448  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3449  const auto mantissa = bits16 & Set(du32, 0x3FF);
3450  const auto subnormal =
3451  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3452  Set(df32, 1.0f / 16384 / 1024));
3453 
3454  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3455  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3456  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3457  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3458  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3459 #else
3460  (void)df32;
3461  return Vec256<float>{_mm256_cvtph_ps(v.raw)};
3462 #endif
3463 }
3464 
3465 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
3466  const Vec128<bfloat16_t> v) {
3467  const Rebind<uint16_t, decltype(df32)> du16;
3468  const RebindToSigned<decltype(df32)> di32;
3469  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3470 }
3471 
3472 // ================================================== CRYPTO
3473 
3474 #if !defined(HWY_DISABLE_PCLMUL_AES)
3475 
3476 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3477 #ifdef HWY_NATIVE_AES
3478 #undef HWY_NATIVE_AES
3479 #else
3480 #define HWY_NATIVE_AES
3481 #endif
3482 
3483 HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
3484  Vec256<uint8_t> round_key) {
3485 #if HWY_TARGET == HWY_AVX3_DL
3486  return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
3487 #else
3488  const Full256<uint8_t> d;
3489  const Half<decltype(d)> d2;
3490  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3491  AESRound(LowerHalf(state), LowerHalf(round_key)));
3492 #endif
3493 }
3494 
3495 HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
3496 #if HWY_TARGET == HWY_AVX3_DL
3497  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
3498 #else
3499  const Full256<uint64_t> d;
3500  const Half<decltype(d)> d2;
3501  return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
3502  CLMulLower(LowerHalf(a), LowerHalf(b)));
3503 #endif
3504 }
3505 
3506 HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
3507 #if HWY_TARGET == HWY_AVX3_DL
3508  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
3509 #else
3510  const Full256<uint64_t> d;
3511  const Half<decltype(d)> d2;
3512  return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
3513  CLMulUpper(LowerHalf(a), LowerHalf(b)));
3514 #endif
3515 }
3516 
3517 #endif // HWY_DISABLE_PCLMUL_AES
3518 
3519 // ================================================== MISC
3520 
3521 // Returns a vector with lane i=[0, N) set to "first" + i.
3522 template <typename T, typename T2>
3523 HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
3524  HWY_ALIGN T lanes[32 / sizeof(T)];
3525  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
3526  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
3527  }
3528  return Load(d, lanes);
3529 }
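// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE).
// Iota is handy for building index vectors, e.g. for GatherIndex or for
// comparing lane positions against a scalar bound.
namespace usage_sketch {
inline Vec256<int32_t> FirstNMask(int32_t n) {
  const Full256<int32_t> d;
  // Lanes 0..7; lanes whose index is below n become all-ones.
  return VecFromMask(d, Iota(d, 0) < Set(d, n));
}
}  // namespace usage_sketch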
3530 
3531 #if HWY_TARGET <= HWY_AVX3
3532 
3533 // ------------------------------ LoadMaskBits
3534 
3535 // `p` points to at least 8 readable bytes, not all of which need be valid.
3536 template <typename T>
3537 HWY_API Mask256<T> LoadMaskBits(const Full256<T> /* tag */,
3538  const uint8_t* HWY_RESTRICT bits) {
3539  constexpr size_t N = 32 / sizeof(T);
3540  constexpr size_t kNumBytes = (N + 7) / 8;
3541 
3542  uint64_t mask_bits = 0;
3543  CopyBytes<kNumBytes>(bits, &mask_bits);
3544 
3545  if (N < 8) {
3546  mask_bits &= (1ull << N) - 1;
3547  }
3548 
3549  return Mask256<T>::FromBits(mask_bits);
3550 }
3551 
3552 // ------------------------------ StoreMaskBits
3553 
3554 // `p` points to at least 8 writable bytes.
3555 template <typename T>
3556 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
3557  uint8_t* bits) {
3558  constexpr size_t N = 32 / sizeof(T);
3559  constexpr size_t kNumBytes = (N + 7) / 8;
3560 
3561  CopyBytes<kNumBytes>(&mask.raw, bits);
3562 
3563  // Non-full byte, need to clear the undefined upper bits.
3564  if (N < 8) {
3565  const int mask = static_cast<int>((1ull << N) - 1);
3566  bits[0] = static_cast<uint8_t>(bits[0] & mask);
3567  }
3568  return kNumBytes;
3569 }
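// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE on an
// AVX-512 target, matching this branch). Round trip: serialize a mask to
// packed bits (bit i = lane i) and reload it.
namespace usage_sketch {
inline Mask256<float> RoundTripMask(Mask256<float> m) {
  const Full256<float> d;
  uint8_t bits[8] = {0};            // at least 8 writable bytes
  (void)StoreMaskBits(d, m, bits);  // returns the number of bytes written
  return LoadMaskBits(d, bits);
}
}  // namespace usage_sketch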
3570 
3571 // ------------------------------ Mask testing
3572 
3573 template <typename T>
3574 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
3575  return PopCount(static_cast<uint64_t>(mask.raw));
3576 }
3577 
3578 template <typename T>
3579 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
3580  const Mask256<T> mask) {
3581  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
3582 }
3583 
3584 // Beware: the suffix indicates the number of mask bits, not lane size!
3585 
3586 namespace detail {
3587 
3588 template <typename T>
3589 HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
3590 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3591  return _kortestz_mask32_u8(mask.raw, mask.raw);
3592 #else
3593  return mask.raw == 0;
3594 #endif
3595 }
3596 template <typename T>
3597 HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
3598 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3599  return _kortestz_mask16_u8(mask.raw, mask.raw);
3600 #else
3601  return mask.raw == 0;
3602 #endif
3603 }
3604 template <typename T>
3605 HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
3606 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3607  return _kortestz_mask8_u8(mask.raw, mask.raw);
3608 #else
3609  return mask.raw == 0;
3610 #endif
3611 }
3612 template <typename T>
3613 HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
3614  return (uint64_t{mask.raw} & 0xF) == 0;
3615 }
3616 
3617 } // namespace detail
3618 
3619 template <typename T>
3620 HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
3621  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
3622 }
3623 
3624 namespace detail {
3625 
3626 template <typename T>
3627 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
3628 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3629  return _kortestc_mask32_u8(mask.raw, mask.raw);
3630 #else
3631  return mask.raw == 0xFFFFFFFFu;
3632 #endif
3633 }
3634 template <typename T>
3635 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
3636 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3637  return _kortestc_mask16_u8(mask.raw, mask.raw);
3638 #else
3639  return mask.raw == 0xFFFFu;
3640 #endif
3641 }
3642 template <typename T>
3643 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
3644 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3645  return _kortestc_mask8_u8(mask.raw, mask.raw);
3646 #else
3647  return mask.raw == 0xFFu;
3648 #endif
3649 }
3650 template <typename T>
3651 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
3652  // Cannot use _kortestc because we have less than 8 mask bits.
3653  return mask.raw == 0xFu;
3654 }
3655 
3656 } // namespace detail
3657 
3658 template <typename T>
3659 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
3660  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
3661 }
3662 
3663 // ------------------------------ Compress
3664 
3665 // 16-bit is defined in x86_512 so we can use 512-bit vectors.
3666 
3667 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3668 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
3669  return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
3670 }
3671 
3672 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3673 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
3674  return Vec256<T>{_mm256_maskz_compress_epi64(mask.raw, v.raw)};
3675 }
3676 
3678  return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
3679 }
3680 
3682  return Vec256<double>{_mm256_maskz_compress_pd(mask.raw, v.raw)};
3683 }
3684 
3685 // ------------------------------ CompressBits (LoadMaskBits)
3686 
3687 template <typename T>
3688 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
3689  return Compress(v, LoadMaskBits(Full256<T>(), bits));
3690 }
3691 
3692 // ------------------------------ CompressStore
3693 
3694 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3695 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
3696  T* HWY_RESTRICT unaligned) {
3697  const Rebind<uint16_t, decltype(d)> du;
3698  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3699 
3700  const uint64_t mask_bits{mask.raw};
3701 
3702 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3703  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
3704 #else
3705  // Split into halves to keep the table size manageable.
3706  const Half<decltype(du)> duh;
3707  const auto vL = LowerHalf(duh, vu);
3708  const auto vH = UpperHalf(duh, vu);
3709 
3710  const uint64_t mask_bitsL = mask_bits & 0xFF;
3711  const uint64_t mask_bitsH = mask_bits >> 8;
3712 
3713  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
3714  const auto idxH = detail::IndicesForCompress16(mask_bitsH);
3715 
3716  // Compress the two 128-bit halves separately, then store them contiguously.
3717  const Vec128<uint16_t> cL{_mm_permutexvar_epi16(idxL.raw, vL.raw)};
3718  const Vec128<uint16_t> cH{_mm_permutexvar_epi16(idxH.raw, vH.raw)};
3719  const Half<decltype(d)> dh;
3720  StoreU(BitCast(dh, cL), dh, unaligned);
3721  StoreU(BitCast(dh, cH), dh, unaligned + PopCount(mask_bitsL));
3722 #endif // HWY_TARGET == HWY_AVX3_DL
3723 
3724  return PopCount(mask_bits);
3725 }
3726 
3727 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3728 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
3729  T* HWY_RESTRICT unaligned) {
3730  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3731  return PopCount(uint64_t{mask.raw});
3732 }
3733 
3734 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3735 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
3736  T* HWY_RESTRICT unaligned) {
3737  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3738  return PopCount(uint64_t{mask.raw} & 0xFull);
3739 }
3740 
3742  Full256<float> /* tag */,
3743  float* HWY_RESTRICT unaligned) {
3744  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
3745  return PopCount(uint64_t{mask.raw});
3746 }
3747 
3749  Full256<double> /* tag */,
3750  double* HWY_RESTRICT unaligned) {
3751  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
3752  return PopCount(uint64_t{mask.raw} & 0xFull);
3753 }
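// --- Editor's note: illustrative usage sketch, not part of the original
// header (hypothetical name; assumes <hwy/highway.h> and HWY_NAMESPACE on an
// AVX-512 target, matching this branch). CompressStore left-packs the lanes
// selected by the mask into the output buffer and returns how many lanes were
// written, which is the classic filter/stream-compaction step.
namespace usage_sketch {
inline size_t AppendPositive(Vec256<float> v, float* HWY_RESTRICT out) {
  const Full256<float> d;
  const auto positive = v > Zero(d);
  return CompressStore(v, positive, d, out);
}
}  // namespace usage_sketch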
3754 
3755 // ------------------------------ CompressBitsStore (LoadMaskBits)
3756 
3757 template <typename T>
3758 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
3759  Full256<T> d, T* HWY_RESTRICT unaligned) {
3760  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
3761 }
3762 
3763 #else // AVX2
3764 
3765 // ------------------------------ LoadMaskBits (TestBit)
3766 
3767 namespace detail {
3768 
3769 // 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there.
3770 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3771 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
3772  const RebindToUnsigned<decltype(d)> du;
3773  const Repartition<uint32_t, decltype(d)> du32;
3774  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
3775 
3776  // Replicate bytes 8x such that each byte contains the bit that governs it.
3777  const Repartition<uint64_t, decltype(d)> du64;
3778  alignas(32) constexpr uint64_t kRep8[4] = {
3779  0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
3780  0x0303030303030303ull};
3781  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
3782 
3783  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
3784  1, 2, 4, 8, 16, 32, 64, 128};
3785  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
3786 }
3787 
3788 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3789 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
3790  const RebindToUnsigned<decltype(d)> du;
3791  alignas(32) constexpr uint16_t kBit[16] = {
3792  1, 2, 4, 8, 16, 32, 64, 128,
3793  0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3794  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
3795  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
3796 }
3797 
3798 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3799 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
3800  const RebindToUnsigned<decltype(d)> du;
3801  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3802  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
3803  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
3804 }
3805 
3806 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3807 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
3808  const RebindToUnsigned<decltype(d)> du;
3809  alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
3810  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
3811 }
3812 
3813 } // namespace detail
3814 
3815 // `bits` points to at least 8 readable bytes, not all of which need be valid.
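// Example: with T = uint32_t (N = 8), kNumBytes is 1, so a single byte 0x0F
// at `bits` yields a mask whose lower four lanes are true.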
3816 template <typename T>
3817 HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
3818  const uint8_t* HWY_RESTRICT bits) {
3819  constexpr size_t N = 32 / sizeof(T);
3820  constexpr size_t kNumBytes = (N + 7) / 8;
3821 
3822  uint64_t mask_bits = 0;
3823  CopyBytes<kNumBytes>(bits, &mask_bits);
3824 
3825  if (N < 8) {
3826  mask_bits &= (1ull << N) - 1;
3827  }
3828 
3829  return detail::LoadMaskBits256(d, mask_bits);
3830 }
3831 
3832 // ------------------------------ StoreMaskBits
3833 
3834 namespace detail {
3835 
3836 template <typename T>
3837 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
3838  const Mask256<T> mask) {
3839  const Full256<T> d;
3840  const Full256<uint8_t> d8;
3841  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
3842  // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
3843  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
3844 }
3845 
3846 template <typename T>
3847 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
3848  const Mask256<T> mask) {
3849 #if HWY_ARCH_X86_64
3850  const uint64_t sign_bits8 = BitsFromMask(hwy::SizeTag<1>(), mask);
3851  // Skip the bits from the lower byte of each u16 (better not to use the
3852  // same packs_epi16 as SSE4, because that requires an extra swizzle here).
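 // _pext_u64 gathers the bits selected by 0xAA..AA (the odd positions), so
 // bit 2*i+1 of sign_bits8 becomes bit i of the result; e.g. (illustrative)
 // _pext_u64(0b1100, 0b1010) == 0b10.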
3853  return _pext_u64(sign_bits8, 0xAAAAAAAAull);
3854 #else
3855  // Slow workaround for 32-bit builds, which lack _pext_u64.
3856  // Remove useless lower half of each u16 while preserving the sign bit.
3857  // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
3858  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
3859  // Move odd qwords (value zero) to top so they don't affect the mask value.
3860  const auto compressed =
3861  _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
3862  return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
3863 #endif // HWY_ARCH_X86_64
3864 }
3865 
3866 template <typename T>
3867 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
3868  const Mask256<T> mask) {
3869  const Full256<T> d;
3870  const Full256<float> df;
3871  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
3872  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
3873 }
3874 
3875 template <typename T>
3876 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
3877  const Mask256<T> mask) {
3878  const Full256<T> d;
3879  const Full256<double> df;
3880  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
3881  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
3882 }
3883 
3884 template <typename T>
3885 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
3886  return BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask);
3887 }
3888 
3889 } // namespace detail
3890 
3891 // `bits` points to at least 8 writable bytes.
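// Example: for T = uint64_t (N = 4), one byte is written and only its low
// 4 bits are meaningful; for T = uint8_t (N = 32), 4 bytes are written.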
3892 template <typename T>
3893 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
3894  uint8_t* bits) {
3895  constexpr size_t N = 32 / sizeof(T);
3896  constexpr size_t kNumBytes = (N + 7) / 8;
3897 
3898  const uint64_t mask_bits = detail::BitsFromMask(mask);
3899  CopyBytes<kNumBytes>(&mask_bits, bits);
3900  return kNumBytes;
3901 }
3902 
3903 // ------------------------------ Mask testing
3904 
3905 template <typename T>
3906 HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
3907  // Cheaper than PTEST, which is 2 uop / 3L.
3908  return detail::BitsFromMask(mask) == 0;
3909 }
3910 
3911 template <typename T>
3912 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
3913  constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
3914  return detail::BitsFromMask(mask) == kAllBits;
3915 }
3916 
3917 template <typename T>
3918 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
3919  return PopCount(detail::BitsFromMask(mask));
3920 }
3921 
3922 template <typename T>
3923 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
3924  const Mask256<T> mask) {
3925  const uint64_t mask_bits = detail::BitsFromMask(mask);
3926  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
3927 }
3928 
3929 // ------------------------------ Compress, CompressBits
3930 
3931 namespace detail {
3932 
3933 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3934 HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d,
3935  uint64_t mask_bits) {
3936  const RebindToUnsigned<decltype(d)> d32;
3937  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
3938  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
3939  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
3940  // and unavailable in 32-bit builds. We instead compress each index into 4
3941  // bits, for a total of 1 KiB.
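 // Example (illustrative): mask_bits = 0b00000110 (lanes 1 and 2) selects
 // entry 6 below, 0x00000021; after the per-lane shifts, lane 0 reads index 1
 // and lane 1 reads index 2, moving those two lanes to the front.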
3942  alignas(16) constexpr uint32_t packed_array[256] = {
3943  0x00000000, 0x00000000, 0x00000001, 0x00000010, 0x00000002, 0x00000020,
3944  0x00000021, 0x00000210, 0x00000003, 0x00000030, 0x00000031, 0x00000310,
3945  0x00000032, 0x00000320, 0x00000321, 0x00003210, 0x00000004, 0x00000040,
3946  0x00000041, 0x00000410, 0x00000042, 0x00000420, 0x00000421, 0x00004210,
3947  0x00000043, 0x00000430, 0x00000431, 0x00004310, 0x00000432, 0x00004320,
3948  0x00004321, 0x00043210, 0x00000005, 0x00000050, 0x00000051, 0x00000510,
3949  0x00000052, 0x00000520, 0x00000521, 0x00005210, 0x00000053, 0x00000530,
3950  0x00000531, 0x00005310, 0x00000532, 0x00005320, 0x00005321, 0x00053210,
3951  0x00000054, 0x00000540, 0x00000541, 0x00005410, 0x00000542, 0x00005420,
3952  0x00005421, 0x00054210, 0x00000543, 0x00005430, 0x00005431, 0x00054310,
3953  0x00005432, 0x00054320, 0x00054321, 0x00543210, 0x00000006, 0x00000060,
3954  0x00000061, 0x00000610, 0x00000062, 0x00000620, 0x00000621, 0x00006210,
3955  0x00000063, 0x00000630, 0x00000631, 0x00006310, 0x00000632, 0x00006320,
3956  0x00006321, 0x00063210, 0x00000064, 0x00000640, 0x00000641, 0x00006410,
3957  0x00000642, 0x00006420, 0x00006421, 0x00064210, 0x00000643, 0x00006430,
3958  0x00006431, 0x00064310, 0x00006432, 0x00064320, 0x00064321, 0x00643210,
3959  0x00000065, 0x00000650, 0x00000651, 0x00006510, 0x00000652, 0x00006520,
3960  0x00006521, 0x00065210, 0x00000653, 0x00006530, 0x00006531, 0x00065310,
3961  0x00006532, 0x00065320, 0x00065321, 0x00653210, 0x00000654, 0x00006540,
3962  0x00006541, 0x00065410, 0x00006542, 0x00065420, 0x00065421, 0x00654210,
3963  0x00006543, 0x00065430, 0x00065431, 0x00654310, 0x00065432, 0x00654320,
3964  0x00654321, 0x06543210, 0x00000007, 0x00000070, 0x00000071, 0x00000710,
3965  0x00000072, 0x00000720, 0x00000721, 0x00007210, 0x00000073, 0x00000730,
3966  0x00000731, 0x00007310, 0x00000732, 0x00007320, 0x00007321, 0x00073210,
3967  0x00000074, 0x00000740, 0x00000741, 0x00007410, 0x00000742, 0x00007420,
3968  0x00007421, 0x00074210, 0x00000743, 0x00007430, 0x00007431, 0x00074310,
3969  0x00007432, 0x00074320, 0x00074321, 0x00743210, 0x00000075, 0x00000750,
3970  0x00000751, 0x00007510, 0x00000752, 0x00007520, 0x00007521, 0x00075210,
3971  0x00000753, 0x00007530, 0x00007531, 0x00075310, 0x00007532, 0x00075320,
3972  0x00075321, 0x00753210, 0x00000754, 0x00007540, 0x00007541, 0x00075410,
3973  0x00007542, 0x00075420, 0x00075421, 0x00754210, 0x00007543, 0x00075430,
3974  0x00075431, 0x00754310, 0x00075432, 0x00754320, 0x00754321, 0x07543210,
3975  0x00000076, 0x00000760, 0x00000761, 0x00007610, 0x00000762, 0x00007620,
3976  0x00007621, 0x00076210, 0x00000763, 0x00007630, 0x00007631, 0x00076310,
3977  0x00007632, 0x00076320, 0x00076321, 0x00763210, 0x00000764, 0x00007640,
3978  0x00007641, 0x00076410, 0x00007642, 0x00076420, 0x00076421, 0x00764210,
3979  0x00007643, 0x00076430, 0x00076431, 0x00764310, 0x00076432, 0x00764320,
3980  0x00764321, 0x07643210, 0x00000765, 0x00007650, 0x00007651, 0x00076510,
3981  0x00007652, 0x00076520, 0x00076521, 0x00765210, 0x00007653, 0x00076530,
3982  0x00076531, 0x00765310, 0x00076532, 0x00765320, 0x00765321, 0x07653210,
3983  0x00007654, 0x00076540, 0x00076541, 0x00765410, 0x00076542, 0x00765420,
3984  0x00765421, 0x07654210, 0x00076543, 0x00765430, 0x00765431, 0x07654310,
3985  0x00765432, 0x07654320, 0x07654321, 0x76543210};
3986 
3987  // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
3988  // Just shift each copy of the 32-bit LUT entry to extract its 4-bit fields.
3989  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
3990  // latency, it may be faster to use LoadDup128 and PSHUFB.
3991  const auto packed = Set(d32, packed_array[mask_bits]);
3992  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
3993  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
3994 }
3995 
3996 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3997 HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 4> d,
3998  uint64_t mask_bits) {
3999  const Repartition<uint32_t, decltype(d)> d32;
4000 
4001  // For 64-bit, we still need 32-bit indices because there is no 64-bit
4002  // permutevar, but there are only 4 lanes, so we can afford to skip the
4003  // unpacking and load the entire index vector directly.
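 // Example (illustrative): mask_bits = 0b0101 (64-bit lanes 0 and 2) selects
 // the row {0, 1, 4, 5, 0, 1, 0, 1}, i.e. the 32-bit halves of lanes 0 and 2
 // come first.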
4004  alignas(32) constexpr uint32_t packed_array[128] = {
4005  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, //
4006  2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 0, 1, //
4007  4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 0, 1, 0, 1, //
4008  2, 3, 4, 5, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 0, 1, //
4009  6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 0, 1, //
4010  2, 3, 6, 7, 0, 1, 0, 1, 0, 1, 2, 3, 6, 7, 0, 1, //
4011  4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 4, 5, 6, 7, 0, 1,
4012  2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
4013  return Indices256<uint32_t>{Load(d32, packed_array + 8 * mask_bits).raw};
4014 }
4015 
4016 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4017 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4018  const Full256<T> d;
4019  const Repartition<uint32_t, decltype(d)> du32;
4020 
4021  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
4022  const auto indices = IndicesFromBits(d, mask_bits);
4023  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
4024 }
4025 
4026 // LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using
4027 // the native Compress is probably more efficient than 2 LUTs.
4028 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4029 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4030  using D = Full256<T>;
4031  const Rebind<uint16_t, D> du;
4032  const Repartition<int32_t, D> dw;
4033  const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
4034  const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
4035  const auto promoted1 = PromoteTo(dw, UpperHalf(Half<decltype(du)>(), vu16));
4036 
4037  const uint64_t mask_bits0 = mask_bits & 0xFF;
4038  const uint64_t mask_bits1 = mask_bits >> 8;
4039  const auto compressed0 = Compress(promoted0, mask_bits0);
4040  const auto compressed1 = Compress(promoted1, mask_bits1);
4041 
4042  const Half<decltype(du)> dh;
4043  const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
4044  const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
4045 
4046  const size_t count0 = PopCount(mask_bits0);
4047  // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with
4048  // VPERMD for shifting at 4 byte granularity.
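 // Example (illustrative): for count0 == 3, VPERMD shifts demoted1 up by one
 // u32 lane (two u16 lanes); the conditional 2-byte shift below supplies the
 // remaining odd lane.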
4049  alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0,
4050  0, 1, 2, 3, 4, 5, 6, 7};
4051  const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2);
4052  const auto shift1_multiple4 =
4053  BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices));
4054 
4055  // Whole-register unconditional shift by 2 bytes.
4056  // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead?
4057  const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw,
4058  shift1_multiple4.raw, 0x08);
4059  const auto shift1_multiple2 =
4060  Vec256<uint16_t>{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)};
4061 
4062  // Make the shift conditional on the lower bit of count0.
4063  const auto m_odd =
4064  TestBit(Set(du, static_cast<uint16_t>(count0)), Set(du, 1));
4065  const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4);
4066 
4067  // Blend the lower and shifted upper parts.
4068  constexpr uint16_t on = 0xFFFF;
4069  alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on),
4070  HWY_REP4(on), HWY_REP4(on)};
4071  const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0));
4072  return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1));
4073 }
4074 
4075 } // namespace detail
4076 
4077 template <typename T>
4078 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
4079  const uint64_t mask_bits = detail::BitsFromMask(m);
4080  return detail::Compress(v, mask_bits);
4081 }
4082 
4083 template <typename T>
4084 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4085  constexpr size_t N = 32 / sizeof(T);
4086  constexpr size_t kNumBytes = (N + 7) / 8;
4087 
4088  uint64_t mask_bits = 0;
4089  CopyBytes<kNumBytes>(bits, &mask_bits);
4090 
4091  if (N < 8) {
4092  mask_bits &= (1ull << N) - 1;
4093  }
4094 
4095  return detail::Compress(v, mask_bits);
4096 }
4097 
4098 // ------------------------------ CompressStore, CompressBitsStore
4099 
4100 template <typename T>
4101 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4102  T* HWY_RESTRICT unaligned) {
4103  const uint64_t mask_bits = detail::BitsFromMask(m);
4104  StoreU(detail::Compress(v, mask_bits), d, unaligned);
4105  return PopCount(mask_bits);
4106 }
4107 
4108 template <typename T>
4109 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
4110  Full256<T> d, T* HWY_RESTRICT unaligned) {
4111  constexpr size_t N = 32 / sizeof(T);
4112  constexpr size_t kNumBytes = (N + 7) / 8;
4113 
4114  uint64_t mask_bits = 0;
4115  CopyBytes<kNumBytes>(bits, &mask_bits);
4116 
4117  if (N < 8) {
4118  mask_bits &= (1ull << N) - 1;
4119  }
4120 
4121  StoreU(detail::Compress(v, mask_bits), d, unaligned);
4122  return PopCount(mask_bits);
4123 }
4124 
4125 #endif // HWY_TARGET <= HWY_AVX3
4126 
4127 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
4128 // TableLookupBytes, ConcatUpperLower)
4129 
4130 HWY_API void StoreInterleaved3(const Vec256<uint8_t> v0,
4131  const Vec256<uint8_t> v1,
4132  const Vec256<uint8_t> v2, Full256<uint8_t> d,
4133  uint8_t* HWY_RESTRICT unaligned) {
4134  const auto k5 = Set(d, 5);
4135  const auto k6 = Set(d, 6);
4136 
4137  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
4138  // 0x80 so lanes to be filled from other vectors are 0 for blending.
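 // Example (illustrative): within each 128-bit block, the 16 bytes of
 // interleaved_10_00 end up as r0 g0 b0 r1 g1 b1 ... r4 g4 b4 r5, where
 // r/g/b denote bytes of v0/v1/v2.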
4139  alignas(16) static constexpr uint8_t tbl_r0[16] = {
4140  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
4141  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
4142  alignas(16) static constexpr uint8_t tbl_g0[16] = {
4143  0x80, 0, 0x80, 0x80, 1, 0x80, //
4144  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
4145  const auto shuf_r0 = LoadDup128(d, tbl_r0);
4146  const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
4147  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
4148  const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
4149  const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
4150  const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
4151  const auto interleaved_10_00 = r0 | g0 | b0;
4152 
4153  // Second vector: g10,r10, bgr[9:6], b5,g5
4154  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
4155  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
4156  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
4157  const auto r1 = TableLookupBytes(v0, shuf_r1);
4158  const auto g1 = TableLookupBytes(v1, shuf_g1);
4159  const auto b1 = TableLookupBytes(v2, shuf_b1);
4160  const auto interleaved_15_05 = r1 | g1 | b1;
4161 
4162  // We want to write the lower halves of the interleaved vectors, then the
4163  // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but
4164  // that would require two unaligned stores. For the lower halves, we can
4165  // merge two 128-bit stores for the same swizzling cost:
4166  const auto out0 = ConcatLowerLower(d, interleaved_15_05, interleaved_10_00);
4167  StoreU(out0, d, unaligned + 0 * 32);
4168 
4169  // Third vector: bgr[15:11], b10
4170  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
4171  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
4172  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
4173  const auto r2 = TableLookupBytes(v0, shuf_r2);
4174  const auto g2 = TableLookupBytes(v1, shuf_g2);
4175  const auto b2 = TableLookupBytes(v2, shuf_b2);
4176  const auto interleaved_1A_0A = r2 | g2 | b2;
4177 
4178  const auto out1 = ConcatUpperLower(d, interleaved_10_00, interleaved_1A_0A);
4179  StoreU(out1, d, unaligned + 1 * 32);
4180 
4181  const auto out2 = ConcatUpperUpper(d, interleaved_1A_0A, interleaved_15_05);
4182  StoreU(out2, d, unaligned + 2 * 32);
4183 }
4184 
4185 // ------------------------------ StoreInterleaved4
4186 
4187 HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
4188  const Vec256<uint8_t> v1,
4189  const Vec256<uint8_t> v2,
4190  const Vec256<uint8_t> v3, Full256<uint8_t> d8,
4191  uint8_t* HWY_RESTRICT unaligned) {
4192  const RepartitionToWide<decltype(d8)> d16;
4193  const RepartitionToWide<decltype(d16)> d32;
4194  // let a,b,c,d denote v0..3.
4195  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
4196  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
4197  const auto ba8 = ZipUpper(d16, v0, v1);
4198  const auto dc8 = ZipUpper(d16, v2, v3);
4199  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a13 d..a10 | d..a03 d..a00
4200  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a17 d..a14 | d..a07 d..a04
4201  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..a1B d..a18 | d..a0B d..a08
4202  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C
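 // Example (illustrative): out0 below begins in memory as
 // a0 b0 c0 d0 a1 b1 c1 d1 ..., one byte from each of v0..v3 per 4-byte group.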
4203  // Write lower halves, then upper. vperm2i128 is slow on Zen1 but we can
4204  // efficiently combine two lower halves into 256 bits:
4205  const auto out0 = BitCast(d8, ConcatLowerLower(d32, dcba_4, dcba_0));
4206  const auto out1 = BitCast(d8, ConcatLowerLower(d32, dcba_C, dcba_8));
4207  StoreU(out0, d8, unaligned + 0 * 32);
4208  StoreU(out1, d8, unaligned + 1 * 32);
4209  const auto out2 = BitCast(d8, ConcatUpperUpper(d32, dcba_4, dcba_0));
4210  const auto out3 = BitCast(d8, ConcatUpperUpper(d32, dcba_C, dcba_8));
4211  StoreU(out2, d8, unaligned + 2 * 32);
4212  StoreU(out3, d8, unaligned + 3 * 32);
4213 }
4214 
4215 // ------------------------------ Reductions
4216 
4217 namespace detail {
4218 
4219 // Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block.
4220 // Same logic as x86/128.h, but with Vec256 arguments.
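// Example (32-bit lanes, illustrative): a block (1, 2, 3, 4) is added to its
// swap (3, 4, 1, 2) to give (4, 6, 4, 6); adding the rotation (6, 4, 6, 4)
// yields 10 in every lane.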
4221 template <typename T>
4222 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
4223  const Vec256<T> v3210) {
4224  const auto v1032 = Shuffle1032(v3210);
4225  const auto v31_20_31_20 = v3210 + v1032;
4226  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
4227  return v20_31_20_31 + v31_20_31_20;
4228 }
4229 template <typename T>
4230 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4231  const Vec256<T> v3210) {
4232  const auto v1032 = Shuffle1032(v3210);
4233  const auto v31_20_31_20 = Min(v3210, v1032);
4234  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
4235  return Min(v20_31_20_31, v31_20_31_20);
4236 }
4237 template <typename T>
4238 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4239  const Vec256<T> v3210) {
4240  const auto v1032 = Shuffle1032(v3210);
4241  const auto v31_20_31_20 = Max(v3210, v1032);
4242  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
4243  return Max(v20_31_20_31, v31_20_31_20);
4244 }
4245 
4246 template <typename T>
4247 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
4248  const Vec256<T> v10) {
4249  const auto v01 = Shuffle01(v10);
4250  return v10 + v01;
4251 }
4252 template <typename T>
4253 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4254  const Vec256<T> v10) {
4255  const auto v01 = Shuffle01(v10);
4256  return Min(v10, v01);
4257 }
4258 template <typename T>
4259 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4260  const Vec256<T> v10) {
4261  const auto v01 = Shuffle01(v10);
4262  return Max(v10, v01);
4263 }
4264 
4265 } // namespace detail
4266 
4267 // Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
4268 template <typename T>
4269 HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
4270  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
4271  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
4272 }
4273 template <typename T>
4274 HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
4275  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
4276  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
4277 }
4278 template <typename T>
4279 HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
4280  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
4281  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
4282 }
4283 
4284 // ================================================== DEPRECATED
4285 
4286 template <typename T>
4287 HWY_API size_t StoreMaskBits(const Mask256<T> mask, uint8_t* bits) {
4288  return StoreMaskBits(Full256<T>(), mask, bits);
4289 }
4290 
4291 template <typename T>
4292 HWY_API bool AllTrue(const Mask256<T> mask) {
4293  return AllTrue(Full256<T>(), mask);
4294 }
4295 
4296 template <typename T>
4297 HWY_API bool AllFalse(const Mask256<T> mask) {
4298  return AllFalse(Full256<T>(), mask);
4299 }
4300 
4301 template <typename T>
4302 HWY_API size_t CountTrue(const Mask256<T> mask) {
4303  return CountTrue(Full256<T>(), mask);
4304 }
4305 
4306 template <typename T>
4307 HWY_API Vec256<T> SumOfLanes(const Vec256<T> vHL) {
4308  return SumOfLanes(Full256<T>(), vHL);
4309 }
4310 template <typename T>
4311 HWY_API Vec256<T> MinOfLanes(const Vec256<T> vHL) {
4312  return MinOfLanes(Full256<T>(), vHL);
4313 }
4314 template <typename T>
4315 HWY_API Vec256<T> MaxOfLanes(const Vec256<T> vHL) {
4316  return MaxOfLanes(Full256<T>(), vHL);
4317 }
4318 
4319 template <typename T>
4320 HWY_API Vec128<T> UpperHalf(Vec256<T> v) {
4321  return UpperHalf(Full128<T>(), v);
4322 }
4323 
4324 template <int kBytes, typename T>
4325 HWY_API Vec256<T> ShiftRightBytes(const Vec256<T> v) {
4326  return ShiftRightBytes<kBytes>(Full256<T>(), v);
4327 }
4328 
4329 template <int kLanes, typename T>
4330 HWY_API Vec256<T> ShiftRightLanes(const Vec256<T> v) {
4331  return ShiftRightLanes<kLanes>(Full256<T>(), v);
4332 }
4333 
4334 template <size_t kBytes, typename T>
4335 HWY_API Vec256<T> CombineShiftRightBytes(Vec256<T> hi, Vec256<T> lo) {
4336  return CombineShiftRightBytes<kBytes>(Full256<T>(), hi, lo);
4337 }
4338 
4339 template <typename T>
4340 HWY_API Vec256<T> InterleaveUpper(Vec256<T> a, Vec256<T> b) {
4341  return InterleaveUpper(Full256<T>(), a, b);
4342 }
4343 
4344 template <typename T>
4345 HWY_API Vec256<MakeWide<T>> ZipUpper(Vec256<T> a, Vec256<T> b) {
4346  return InterleaveUpper(Full256<MakeWide<T>>(), a, b);
4347 }
4348 
4349 template <typename T>
4350 HWY_API Vec256<T> Combine(Vec128<T> hi, Vec128<T> lo) {
4351  return Combine(Full256<T>(), hi, lo);
4352 }
4353 
4354 template <typename T>
4355 HWY_API Vec256<T> ZeroExtendVector(Vec128<T> lo) {
4356  return ZeroExtendVector(Full256<T>(), lo);
4357 }
4358 
4359 template <typename T>
4360 HWY_API Vec256<T> ConcatLowerLower(Vec256<T> hi, Vec256<T> lo) {
4361  return ConcatLowerLower(Full256<T>(), hi, lo);
4362 }
4363 
4364 template <typename T>
4365 HWY_API Vec256<T> ConcatLowerUpper(Vec256<T> hi, Vec256<T> lo) {
4366  return ConcatLowerUpper(Full256<T>(), hi, lo);
4367 }
4368 
4369 template <typename T>
4370 HWY_API Vec256<T> ConcatUpperLower(Vec256<T> hi, Vec256<T> lo) {
4371  return ConcatUpperLower(Full256<T>(), hi, lo);
4372 }
4373 
4374 template <typename T>
4375 HWY_API Vec256<T> ConcatUpperUpper(Vec256<T> hi, Vec256<T> lo) {
4376  return ConcatUpperUpper(Full256<T>(), hi, lo);
4377 }
4378 
4379 // NOLINTNEXTLINE(google-readability-namespace-comments)
4380 } // namespace HWY_NAMESPACE
4381 } // namespace hwy