x86_256-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
17 // compiling for that target.
18 // External include guard in highway.h - see comment there.
19 
20 // WARNING: most operations do not cross 128-bit block boundaries. In
21 // particular, "Broadcast", pack and zip behavior may be surprising.
22 
23 #include <immintrin.h> // AVX2+
24 
25 #include "hwy/base.h"
26 #if defined(_MSC_VER) && defined(__clang__)
27 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
28 // including these headers when _MSC_VER is defined, like when using clang-cl.
29 // Include these directly here.
30 #include <avxintrin.h>
31 // avxintrin defines __m256i and must come before avx2intrin.
32 #include <avx2intrin.h>
33 #include <bmi2intrin.h> // _pext_u64
34 #include <f16cintrin.h>
35 #include <fmaintrin.h>
36 #include <smmintrin.h>
37 #endif
38 
39 #include <stddef.h>
40 #include <stdint.h>
41 
42 // For half-width vectors. Already includes base.h and shared-inl.h.
43 #include "hwy/ops/x86_128-inl.h"
44 
45 HWY_BEFORE_NAMESPACE();
46 namespace hwy {
47 namespace HWY_NAMESPACE {
48 namespace detail {
49 
50 template <typename T>
51 struct Raw256 {
52  using type = __m256i;
53 };
54 template <>
55 struct Raw256<float> {
56  using type = __m256;
57 };
58 template <>
59 struct Raw256<double> {
60  using type = __m256d;
61 };
62 
63 } // namespace detail
64 
65 template <typename T>
66 class Vec256 {
67  using Raw = typename detail::Raw256<T>::type;
68 
69  public:
70  // Compound assignment. Only usable if there is a corresponding non-member
71  // binary operator overload. For example, only f32 and f64 support division.
72  HWY_INLINE Vec256& operator*=(const Vec256 other) {
73  return *this = (*this * other);
74  }
75  HWY_INLINE Vec256& operator/=(const Vec256 other) {
76  return *this = (*this / other);
77  }
78  HWY_INLINE Vec256& operator+=(const Vec256 other) {
79  return *this = (*this + other);
80  }
81  HWY_INLINE Vec256& operator-=(const Vec256 other) {
82  return *this = (*this - other);
83  }
84  HWY_INLINE Vec256& operator&=(const Vec256 other) {
85  return *this = (*this & other);
86  }
87  HWY_INLINE Vec256& operator|=(const Vec256 other) {
88  return *this = (*this | other);
89  }
90  HWY_INLINE Vec256& operator^=(const Vec256 other) {
91  return *this = (*this ^ other);
92  }
93 
94  Raw raw;
95 };
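// Usage sketch (illustrative, not part of the original header; assumes the
// ops target is active and d = Full256<float>()): the compound assignments
// above simply forward to the non-member operators defined later in this file.
//   Vec256<float> v = Set(d, 1.5f);
//   v += Set(d, 2.0f);  // forwards to operator+(Vec256, Vec256)
//   v /= Set(d, 4.0f);  // division only compiles for f32/f64 lanes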
96 
97 #if HWY_TARGET <= HWY_AVX3
98 
99 namespace detail {
100 
101 // Template arg: sizeof(lane type)
102 template <size_t size>
103 struct RawMask256 {};
104 template <>
105 struct RawMask256<1> {
106  using type = __mmask32;
107 };
108 template <>
109 struct RawMask256<2> {
110  using type = __mmask16;
111 };
112 template <>
113 struct RawMask256<4> {
114  using type = __mmask8;
115 };
116 template <>
117 struct RawMask256<8> {
118  using type = __mmask8;
119 };
120 
121 } // namespace detail
122 
123 template <typename T>
124 struct Mask256 {
125  using Raw = typename detail::RawMask256<sizeof(T)>::type;
126 
127  static Mask256<T> FromBits(uint64_t mask_bits) {
128  return Mask256<T>{static_cast<Raw>(mask_bits)};
129  }
130 
131  Raw raw;
132 };
133 
134 #else // AVX2
135 
136 // FF..FF or 0.
137 template <typename T>
138 struct Mask256 {
139  typename detail::Raw256<T>::type raw;
140 };
141 
142 #endif // HWY_TARGET <= HWY_AVX3
143 
144 // ------------------------------ BitCast
145 
146 namespace detail {
147 
148 HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
149 HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
150 HWY_INLINE __m256i BitCastToInteger(__m256d v) {
151  return _mm256_castpd_si256(v);
152 }
153 
154 template <typename T>
155 HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
156  return Vec256<uint8_t>{BitCastToInteger(v.raw)};
157 }
158 
159 // Cannot rely on function overloading because return types differ.
160 template <typename T>
161 struct BitCastFromInteger256 {
162  HWY_INLINE __m256i operator()(__m256i v) { return v; }
163 };
164 template <>
165 struct BitCastFromInteger256<float> {
166  HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
167 };
168 template <>
169 struct BitCastFromInteger256<double> {
170  HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
171 };
172 
173 template <typename T>
174 HWY_INLINE Vec256<T> BitCastFromByte(Full256<T> /* tag */, Vec256<uint8_t> v) {
175  return Vec256<T>{BitCastFromInteger256<T>()(v.raw)};
176 }
177 
178 } // namespace detail
179 
180 template <typename T, typename FromT>
181 HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
182  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
183 }
184 
185 // ------------------------------ Set
186 
187 // Returns an all-zero vector.
188 template <typename T>
189 HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
190  return Vec256<T>{_mm256_setzero_si256()};
191 }
192 HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
193  return Vec256<float>{_mm256_setzero_ps()};
194 }
195 HWY_API Vec256<double> Zero(Full256<double> /* tag */) {
196  return Vec256<double>{_mm256_setzero_pd()};
197 }
198 
199 // Returns a vector with all lanes set to "t".
200 HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
201  return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
202 }
203 HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
204  return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
205 }
206 HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
207  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
208 }
209 HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
210  return Vec256<uint64_t>{
211  _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
212 }
213 HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
214  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
215 }
216 HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
217  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
218 }
219 HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
220  return Vec256<int32_t>{_mm256_set1_epi32(t)};
221 }
222 HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
223  return Vec256<int64_t>{
224  _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
225 }
226 HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
227  return Vec256<float>{_mm256_set1_ps(t)};
228 }
229 HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
230  return Vec256<double>{_mm256_set1_pd(t)};
231 }
232 
233 HWY_DIAGNOSTICS(push)
234 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
235 
236 // Returns a vector with uninitialized elements.
237 template <typename T>
238 HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
239  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
240  // generate an XOR instruction.
241  return Vec256<T>{_mm256_undefined_si256()};
242 }
243 HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
244  return Vec256<float>{_mm256_undefined_ps()};
245 }
246 HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
247  return Vec256<double>{_mm256_undefined_pd()};
248 }
249 
250 HWY_DIAGNOSTICS(pop)
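// Usage sketch (illustrative only): Zero, Set and Undefined are the basic
// constructors for Vec256. For example, with d = Full256<int32_t>():
//   const auto zeros = Zero(d);    // all 8 lanes are 0
//   const auto ones = Set(d, 1);   // all 8 lanes are 1
//   auto scratch = Undefined(d);   // lane contents unspecified until assigned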
251 
252 // ================================================== LOGICAL
253 
254 // ------------------------------ And
255 
256 template <typename T>
257 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
258  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
259 }
260 
261 HWY_API Vec256<float> And(const Vec256<float> a, const Vec256<float> b) {
262  return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
263 }
264 HWY_API Vec256<double> And(const Vec256<double> a, const Vec256<double> b) {
265  return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
266 }
267 
268 // ------------------------------ AndNot
269 
270 // Returns ~not_mask & mask.
271 template <typename T>
272 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
273  return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
274 }
275 HWY_API Vec256<float> AndNot(const Vec256<float> not_mask,
276  const Vec256<float> mask) {
277  return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
278 }
279 HWY_API Vec256<double> AndNot(const Vec256<double> not_mask,
280  const Vec256<double> mask) {
281  return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
282 }
283 
284 // ------------------------------ Or
285 
286 template <typename T>
287 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
288  return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
289 }
290 
291 HWY_API Vec256<float> Or(const Vec256<float> a, const Vec256<float> b) {
292  return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
293 }
294 HWY_API Vec256<double> Or(const Vec256<double> a, const Vec256<double> b) {
295  return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
296 }
297 
298 // ------------------------------ Xor
299 
300 template <typename T>
301 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
302  return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
303 }
304 
305 HWY_API Vec256<float> Xor(const Vec256<float> a, const Vec256<float> b) {
306  return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
307 }
308 HWY_API Vec256<double> Xor(const Vec256<double> a, const Vec256<double> b) {
309  return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
310 }
311 
312 // ------------------------------ Not
313 
314 template <typename T>
315 HWY_API Vec256<T> Not(const Vec256<T> v) {
316  using TU = MakeUnsigned<T>;
317 #if HWY_TARGET <= HWY_AVX3
318  const __m256i vu = BitCast(Full256<TU>(), v).raw;
319  return BitCast(Full256<T>(),
320  Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
321 #else
322  return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
323 #endif
324 }
325 
326 // ------------------------------ OrAnd
327 
328 template <typename T>
329 HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
330 #if HWY_TARGET <= HWY_AVX3
331  const Full256<T> d;
332  const RebindToUnsigned<decltype(d)> du;
333  using VU = VFromD<decltype(du)>;
334  const __m256i ret = _mm256_ternarylogic_epi64(
335  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
336  return BitCast(d, VU{ret});
337 #else
338  return Or(o, And(a1, a2));
339 #endif
340 }
341 
342 // ------------------------------ IfVecThenElse
343 
344 template <typename T>
345 HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
346 #if HWY_TARGET <= HWY_AVX3
347  const Full256<T> d;
348  const RebindToUnsigned<decltype(d)> du;
349  using VU = VFromD<decltype(du)>;
350  return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
351  BitCast(du, yes).raw,
352  BitCast(du, no).raw, 0xCA)});
353 #else
354  return IfThenElse(MaskFromVec(mask), yes, no);
355 #endif
356 }
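// Note on the _mm256_ternarylogic_* immediates used above (a property of the
// intrinsic, not stated in the original comments): the imm8 is a 3-input truth
// table indexed by the bits of (A, B, C). 0x55 with all three operands equal
// to v yields ~v (Not); 0xF8 yields A | (B & C) (OrAnd); 0xCA yields
// (A & B) | (~A & C), i.e. mask ? yes : no (IfVecThenElse).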
357 
358 // ------------------------------ Operator overloads (internal-only if float)
359 
360 template <typename T>
361 HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
362  return And(a, b);
363 }
364 
365 template <typename T>
366 HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
367  return Or(a, b);
368 }
369 
370 template <typename T>
371 HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
372  return Xor(a, b);
373 }
374 
375 // ------------------------------ PopulationCount
376 
377 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
378 #if HWY_TARGET == HWY_AVX3_DL
379 
380 #ifdef HWY_NATIVE_POPCNT
381 #undef HWY_NATIVE_POPCNT
382 #else
383 #define HWY_NATIVE_POPCNT
384 #endif
385 
386 namespace detail {
387 
388 template <typename T>
389 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
390  return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
391 }
392 template <typename T>
393 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
394  return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
395 }
396 template <typename T>
397 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
398  return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
399 }
400 template <typename T>
401 HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
402  return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
403 }
404 
405 } // namespace detail
406 
407 template <typename T>
408 HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
409  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
410 }
411 
412 #endif // HWY_TARGET == HWY_AVX3_DL
413 
414 // ================================================== SIGN
415 
416 // ------------------------------ CopySign
417 
418 template <typename T>
419 HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
420  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
421 
422  const Full256<T> d;
423  const auto msb = SignBit(d);
424 
425 #if HWY_TARGET <= HWY_AVX3
426  const Rebind<MakeUnsigned<T>, decltype(d)> du;
427  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
428  // 0 0 0 | 0
429  // 0 0 1 | 0
430  // 0 1 0 | 1
431  // 0 1 1 | 1
432  // 1 0 0 | 0
433  // 1 0 1 | 1
434  // 1 1 0 | 0
435  // 1 1 1 | 1
436  // The lane size does not matter because we are not using predication.
437  const __m256i out = _mm256_ternarylogic_epi32(
438  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
439  return BitCast(d, decltype(Zero(du)){out});
440 #else
441  return Or(AndNot(msb, magn), And(msb, sign));
442 #endif
443 }
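// Usage sketch (illustrative only, d = Full256<float>()):
//   const auto magn = Set(d, 2.0f);
//   const auto sign = Set(d, -1.0f);
//   const auto r = CopySign(magn, sign);  // every lane is -2.0f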
444 
445 template <typename T>
446 HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
447 #if HWY_TARGET <= HWY_AVX3
448  // AVX3 can also handle abs < 0, so no extra action needed.
449  return CopySign(abs, sign);
450 #else
451  return Or(abs, And(SignBit(Full256<T>()), sign));
452 #endif
453 }
454 
455 // ================================================== MASK
456 
457 #if HWY_TARGET <= HWY_AVX3
458 
459 // ------------------------------ IfThenElse
460 
461 // Returns mask ? b : a.
462 
463 namespace detail {
464 
465 // Templates for signed/unsigned integer of a particular size.
466 template <typename T>
467 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
468  Vec256<T> yes, Vec256<T> no) {
469  return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
470 }
471 template <typename T>
472 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
473  Vec256<T> yes, Vec256<T> no) {
474  return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
475 }
476 template <typename T>
477 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
478  Vec256<T> yes, Vec256<T> no) {
479  return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
480 }
481 template <typename T>
482 HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
483  Vec256<T> yes, Vec256<T> no) {
484  return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
485 }
486 
487 } // namespace detail
488 
489 template <typename T>
490 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
491  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
492 }
493 HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
494  Vec256<float> no) {
495  return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)};
496 }
497 HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
498  Vec256<double> no) {
499  return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)};
500 }
501 
502 namespace detail {
503 
504 template <typename T>
505 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
506  Vec256<T> yes) {
507  return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
508 }
509 template <typename T>
510 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
511  Vec256<T> yes) {
512  return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
513 }
514 template <typename T>
515 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
516  Vec256<T> yes) {
517  return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
518 }
519 template <typename T>
520 HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
521  Vec256<T> yes) {
522  return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
523 }
524 
525 } // namespace detail
526 
527 template <typename T>
528 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
529  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
530 }
531 HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) {
532  return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
533 }
534 HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask,
535  Vec256<double> yes) {
536  return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
537 }
538 
539 namespace detail {
540 
541 template <typename T>
542 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
543  Vec256<T> no) {
544  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
545  return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
546 }
547 template <typename T>
548 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
549  Vec256<T> no) {
550  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
551 }
552 template <typename T>
553 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
554  Vec256<T> no) {
555  return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
556 }
557 template <typename T>
558 HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
559  Vec256<T> no) {
560  return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
561 }
562 
563 } // namespace detail
564 
565 template <typename T>
566 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
567  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
568 }
569 HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) {
570  return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
571 }
572 HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
573  return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
574 }
575 
576 template <typename T, HWY_IF_FLOAT(T)>
577 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
578  // AVX3 MaskFromVec only looks at the MSB
579  return IfThenZeroElse(MaskFromVec(v), v);
580 }
581 
582 // ------------------------------ Mask logical
583 
584 namespace detail {
585 
586 template <typename T>
587 HWY_INLINE Mask256<T> And(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
588  const Mask256<T> b) {
589 #if HWY_COMPILER_HAS_MASK_INTRINSICS
590  return Mask256<T>{_kand_mask32(a.raw, b.raw)};
591 #else
592  return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)};
593 #endif
594 }
595 template <typename T>
596 HWY_INLINE Mask256<T> And(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
597  const Mask256<T> b) {
598 #if HWY_COMPILER_HAS_MASK_INTRINSICS
599  return Mask256<T>{_kand_mask16(a.raw, b.raw)};
600 #else
601  return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)};
602 #endif
603 }
604 template <typename T>
605 HWY_INLINE Mask256<T> And(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
606  const Mask256<T> b) {
607 #if HWY_COMPILER_HAS_MASK_INTRINSICS
608  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
609 #else
610  return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
611 #endif
612 }
613 template <typename T>
614 HWY_INLINE Mask256<T> And(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
615  const Mask256<T> b) {
616 #if HWY_COMPILER_HAS_MASK_INTRINSICS
617  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
618 #else
619  return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
620 #endif
621 }
622 
623 template <typename T>
624 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
625  const Mask256<T> b) {
626 #if HWY_COMPILER_HAS_MASK_INTRINSICS
627  return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
628 #else
629  return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)};
630 #endif
631 }
632 template <typename T>
633 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
634  const Mask256<T> b) {
635 #if HWY_COMPILER_HAS_MASK_INTRINSICS
636  return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
637 #else
638  return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)};
639 #endif
640 }
641 template <typename T>
642 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
643  const Mask256<T> b) {
644 #if HWY_COMPILER_HAS_MASK_INTRINSICS
645  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
646 #else
647  return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
648 #endif
649 }
650 template <typename T>
651 HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
652  const Mask256<T> b) {
653 #if HWY_COMPILER_HAS_MASK_INTRINSICS
654  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
655 #else
656  return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
657 #endif
658 }
659 
660 template <typename T>
661 HWY_INLINE Mask256<T> Or(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
662  const Mask256<T> b) {
663 #if HWY_COMPILER_HAS_MASK_INTRINSICS
664  return Mask256<T>{_kor_mask32(a.raw, b.raw)};
665 #else
666  return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)};
667 #endif
668 }
669 template <typename T>
670 HWY_INLINE Mask256<T> Or(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
671  const Mask256<T> b) {
672 #if HWY_COMPILER_HAS_MASK_INTRINSICS
673  return Mask256<T>{_kor_mask16(a.raw, b.raw)};
674 #else
675  return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)};
676 #endif
677 }
678 template <typename T>
679 HWY_INLINE Mask256<T> Or(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
680  const Mask256<T> b) {
681 #if HWY_COMPILER_HAS_MASK_INTRINSICS
682  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
683 #else
684  return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
685 #endif
686 }
687 template <typename T>
688 HWY_INLINE Mask256<T> Or(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
689  const Mask256<T> b) {
690 #if HWY_COMPILER_HAS_MASK_INTRINSICS
691  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
692 #else
693  return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
694 #endif
695 }
696 
697 template <typename T>
698 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
699  const Mask256<T> b) {
700 #if HWY_COMPILER_HAS_MASK_INTRINSICS
701  return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
702 #else
703  return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)};
704 #endif
705 }
706 template <typename T>
707 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
708  const Mask256<T> b) {
709 #if HWY_COMPILER_HAS_MASK_INTRINSICS
710  return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
711 #else
712  return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)};
713 #endif
714 }
715 template <typename T>
716 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
717  const Mask256<T> b) {
718 #if HWY_COMPILER_HAS_MASK_INTRINSICS
719  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
720 #else
721  return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
722 #endif
723 }
724 template <typename T>
725 HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
726  const Mask256<T> b) {
727 #if HWY_COMPILER_HAS_MASK_INTRINSICS
728  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
729 #else
730  return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
731 #endif
732 }
733 
734 } // namespace detail
735 
736 template <typename T>
737 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
738  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
739 }
740 
741 template <typename T>
742 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
743  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
744 }
745 
746 template <typename T>
747 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
748  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
749 }
750 
751 template <typename T>
752 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
753  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
754 }
755 
756 template <typename T>
757 HWY_API Mask256<T> Not(const Mask256<T> m) {
758  // Flip only the valid bits.
759  constexpr size_t N = 32 / sizeof(T);
760  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
761 }
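// Worked example (illustrative only): for T = int64_t there are N = 4 lanes,
// so only the low 4 bits of the __mmask8 are valid and Not() XORs with
// (1 << 4) - 1 = 0x0F rather than 0xFF, leaving the unused high bits clear.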
762 
763 #else // AVX2
764 
765 // ------------------------------ Mask
766 
767 // Mask and Vec are the same (true = FF..FF).
768 template <typename T>
769 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
770  return Mask256<T>{v.raw};
771 }
772 
773 template <typename T>
774 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
775  return Vec256<T>{v.raw};
776 }
777 
778 template <typename T>
779 HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
780  return Vec256<T>{v.raw};
781 }
782 
783 // ------------------------------ IfThenElse
784 
785 // mask ? yes : no
786 template <typename T>
787 HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
788  const Vec256<T> no) {
789  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
790 }
791 HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
792  const Vec256<float> yes,
793  const Vec256<float> no) {
794  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
795 }
796 HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
797  const Vec256<double> yes,
798  const Vec256<double> no) {
799  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
800 }
801 
802 // mask ? yes : 0
803 template <typename T>
804 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
805  return yes & VecFromMask(Full256<T>(), mask);
806 }
807 
808 // mask ? 0 : no
809 template <typename T>
810 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
811  return AndNot(VecFromMask(Full256<T>(), mask), no);
812 }
813 
814 template <typename T, HWY_IF_FLOAT(T)>
815 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
816  const auto zero = Zero(Full256<T>());
817  // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
818  return IfThenElse(MaskFromVec(v), zero, v);
819 }
820 
821 // ------------------------------ Mask logical
822 
823 template <typename T>
824 HWY_API Mask256<T> Not(const Mask256<T> m) {
825  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
826 }
827 
828 template <typename T>
829 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
830  const Full256<T> d;
831  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
832 }
833 
834 template <typename T>
835 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
836  const Full256<T> d;
837  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
838 }
839 
840 template <typename T>
841 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
842  const Full256<T> d;
843  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
844 }
845 
846 template <typename T>
847 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
848  const Full256<T> d;
849  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
850 }
851 
852 #endif // HWY_TARGET <= HWY_AVX3
853 
854 // ================================================== COMPARE
855 
856 #if HWY_TARGET <= HWY_AVX3
857 
858 // Comparisons set a mask bit to 1 if the condition is true, else 0.
859 
860 template <typename TFrom, typename TTo>
861 HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
862  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
863  return Mask256<TTo>{m.raw};
864 }
865 
866 namespace detail {
867 
868 template <typename T>
869 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec256<T> v,
870  const Vec256<T> bit) {
871  return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
872 }
873 template <typename T>
874 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec256<T> v,
875  const Vec256<T> bit) {
876  return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
877 }
878 template <typename T>
879 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec256<T> v,
880  const Vec256<T> bit) {
881  return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
882 }
883 template <typename T>
884 HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec256<T> v,
885  const Vec256<T> bit) {
886  return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
887 }
888 
889 } // namespace detail
890 
891 template <typename T>
892 HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
893  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
894  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
895 }
896 
897 // ------------------------------ Equality
898 
899 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
900 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
901  return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
902 }
903 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
904 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
905  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
906 }
907 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
908 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
909  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
910 }
911 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
912 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
913  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
914 }
915 
916 HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
917  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
918 }
919 
920 HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
921  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
922 }
923 
924 // ------------------------------ Inequality
925 
926 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
927 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
928  return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
929 }
930 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
931 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
932  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
933 }
934 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
935 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
936  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
937 }
938 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
939 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
940  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
941 }
942 
943 HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
944  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
945 }
946 
947 HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
948  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
949 }
950 
951 // ------------------------------ Strict inequality
952 
953 HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
954  return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
955 }
956 HWY_API Mask256<int16_t> operator>(Vec256<int16_t> a, Vec256<int16_t> b) {
957  return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
958 }
959 HWY_API Mask256<int32_t> operator>(Vec256<int32_t> a, Vec256<int32_t> b) {
960  return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
961 }
962 HWY_API Mask256<int64_t> operator>(Vec256<int64_t> a, Vec256<int64_t> b) {
963  return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
964 }
965 
966 HWY_API Mask256<uint8_t> operator>(Vec256<uint8_t> a, Vec256<uint8_t> b) {
967  return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
968 }
969 HWY_API Mask256<uint16_t> operator>(const Vec256<uint16_t> a,
970  const Vec256<uint16_t> b) {
971  return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
972 }
973 HWY_API Mask256<uint32_t> operator>(const Vec256<uint32_t> a,
974  const Vec256<uint32_t> b) {
975  return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
976 }
977 HWY_API Mask256<uint64_t> operator>(const Vec256<uint64_t> a,
978  const Vec256<uint64_t> b) {
979  return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
980 }
981 
982 HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
983  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
984 }
985 HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
986  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
987 }
988 
989 // ------------------------------ Weak inequality
990 
991 HWY_API Mask256<float> operator>=(Vec256<float> a, Vec256<float> b) {
992  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
993 }
994 HWY_API Mask256<double> operator>=(Vec256<double> a, Vec256<double> b) {
995  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
996 }
997 
998 // ------------------------------ Mask
999 
1000 namespace detail {
1001 
1002 template <typename T>
1003 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256<T> v) {
1004  return Mask256<T>{_mm256_movepi8_mask(v.raw)};
1005 }
1006 template <typename T>
1007 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256<T> v) {
1008  return Mask256<T>{_mm256_movepi16_mask(v.raw)};
1009 }
1010 template <typename T>
1011 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256<T> v) {
1012  return Mask256<T>{_mm256_movepi32_mask(v.raw)};
1013 }
1014 template <typename T>
1015 HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256<T> v) {
1016  return Mask256<T>{_mm256_movepi64_mask(v.raw)};
1017 }
1018 
1019 } // namespace detail
1020 
1021 template <typename T>
1022 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1023  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1024 }
1025 // There do not seem to be native floating-point versions of these instructions.
1026 HWY_API Mask256<float> MaskFromVec(const Vec256<float> v) {
1027  return Mask256<float>{MaskFromVec(BitCast(Full256<int32_t>(), v)).raw};
1028 }
1029 HWY_API Mask256<double> MaskFromVec(const Vec256<double> v) {
1030  return Mask256<double>{MaskFromVec(BitCast(Full256<int64_t>(), v)).raw};
1031 }
1032 
1033 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1034 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1035  return Vec256<T>{_mm256_movm_epi8(v.raw)};
1036 }
1037 
1038 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1039 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1040  return Vec256<T>{_mm256_movm_epi16(v.raw)};
1041 }
1042 
1043 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1044 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1045  return Vec256<T>{_mm256_movm_epi32(v.raw)};
1046 }
1047 
1048 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1049 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1050  return Vec256<T>{_mm256_movm_epi64(v.raw)};
1051 }
1052 
1053 HWY_API Vec256<float> VecFromMask(const Mask256<float> v) {
1054  return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
1055 }
1056 
1057 HWY_API Vec256<double> VecFromMask(const Mask256<double> v) {
1058  return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
1059 }
1060 
1061 template <typename T>
1062 HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
1063  return VecFromMask(v);
1064 }
1065 
1066 #else // AVX2
1067 
1068 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1069 
1070 template <typename TFrom, typename TTo>
1071 HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
1072  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1073  return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
1074 }
1075 
1076 template <typename T>
1077 HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1078  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1079  return (v & bit) == bit;
1080 }
1081 
1082 // ------------------------------ Equality
1083 
1084 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1085 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1086  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1087 }
1088 
1089 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1090 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1091  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1092 }
1093 
1094 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1095 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1096  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1097 }
1098 
1099 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1100 HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1101  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1102 }
1103 
1104 HWY_API Mask256<float> operator==(const Vec256<float> a,
1105  const Vec256<float> b) {
1106  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1107 }
1108 
1109 HWY_API Mask256<double> operator==(const Vec256<double> a,
1110  const Vec256<double> b) {
1111  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1112 }
1113 
1114 // ------------------------------ Inequality
1115 
1116 template <typename T, HWY_IF_NOT_FLOAT(T)>
1117 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1118  return Not(a == b);
1119 }
1120 
1121 HWY_API Mask256<float> operator!=(const Vec256<float> a,
1122  const Vec256<float> b) {
1123  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1124 }
1125 HWY_API Mask256<double> operator!=(const Vec256<double> a,
1126  const Vec256<double> b) {
1127  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1128 }
1129 
1130 // ------------------------------ Strict inequality
1131 
1132 // Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
1133 // to perform an unsigned comparison instead of the intended signed. Workaround
1134 // is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
1135 #if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1136 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1137 #else
1138 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1139 #endif
1140 
1141 HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
1142 #if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1143  using i8x32 = signed char __attribute__((__vector_size__(32)));
1144  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
1145  reinterpret_cast<i8x32>(b.raw))};
1146 #else
1147  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1148 #endif
1149 }
1150 HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
1151  const Vec256<int16_t> b) {
1152  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1153 }
1154 HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
1155  const Vec256<int32_t> b) {
1156  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1157 }
1158 HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
1159  const Vec256<int64_t> b) {
1160  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1161 }
1162 
1163 template <typename T, HWY_IF_UNSIGNED(T)>
1164 HWY_API Mask256<T> operator>(const Vec256<T> a, const Vec256<T> b) {
1165  const Full256<T> du;
1166  const RebindToSigned<decltype(du)> di;
1167  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1168  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1169 }
1170 
1171 HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
1172  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1173 }
1174 HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1175  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1176 }
1177 
1178 // ------------------------------ Weak inequality
1179 
1180 HWY_API Mask256<float> operator>=(const Vec256<float> a,
1181  const Vec256<float> b) {
1182  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1183 }
1184 HWY_API Mask256<double> operator>=(const Vec256<double> a,
1185  const Vec256<double> b) {
1186  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1187 }
1188 
1189 #endif // HWY_TARGET <= HWY_AVX3
1190 
1191 // ------------------------------ Reversed comparisons
1192 
1193 template <typename T>
1194 HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
1195  return b > a;
1196 }
1197 
1198 template <typename T>
1199 HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
1200  return b >= a;
1201 }
1202 
1203 // ------------------------------ Min (Gt, IfThenElse)
1204 
1205 // Unsigned
1206 HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1207  return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1208 }
1209 HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
1210  const Vec256<uint16_t> b) {
1211  return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1212 }
1213 HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
1214  const Vec256<uint32_t> b) {
1215  return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1216 }
1217 HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
1218  const Vec256<uint64_t> b) {
1219 #if HWY_TARGET <= HWY_AVX3
1220  return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1221 #else
1222  const Full256<uint64_t> du;
1223  const Full256<int64_t> di;
1224  const auto msb = Set(du, 1ull << 63);
1225  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1226  return IfThenElse(gt, b, a);
1227 #endif
1228 }
1229 
1230 // Signed
1231 HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1232  return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1233 }
1234 HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1235  return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1236 }
1237 HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1238  return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1239 }
1240 HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1241 #if HWY_TARGET <= HWY_AVX3
1242  return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1243 #else
1244  return IfThenElse(a < b, a, b);
1245 #endif
1246 }
1247 
1248 // Float
1249 HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
1250  return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1251 }
1252 HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
1253  return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
1254 }
1255 
1256 // ------------------------------ Max (Gt, IfThenElse)
1257 
1258 // Unsigned
1259 HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1260  return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1261 }
1262 HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
1263  const Vec256<uint16_t> b) {
1264  return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1265 }
1266 HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
1267  const Vec256<uint32_t> b) {
1268  return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1269 }
1270 HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
1271  const Vec256<uint64_t> b) {
1272 #if HWY_TARGET <= HWY_AVX3
1273  return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1274 #else
1275  const Full256<uint64_t> du;
1276  const Full256<int64_t> di;
1277  const auto msb = Set(du, 1ull << 63);
1278  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1279  return IfThenElse(gt, a, b);
1280 #endif
1281 }
1282 
1283 // Signed
1284 HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1285  return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1286 }
1287 HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1288  return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1289 }
1290 HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1291  return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1292 }
1293 HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1294 #if HWY_TARGET <= HWY_AVX3
1295  return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1296 #else
1297  return IfThenElse(a < b, b, a);
1298 #endif
1299 }
1300 
1301 // Float
1302 HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
1303  return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1304 }
1305 HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
1306  return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
1307 }
1308 
1309 // ------------------------------ FirstN (Iota, Lt)
1310 
1311 template <typename T>
1312 HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
1313 #if HWY_TARGET <= HWY_AVX3
1314  (void)d;
1315  constexpr size_t N = 32 / sizeof(T);
1316 #if HWY_ARCH_X86_64
1317  const uint64_t all = (1ull << N) - 1;
1318  // BZHI only looks at the lower 8 bits of n!
1319  return Mask256<T>::FromBits((n > 255) ? all : _bzhi_u64(all, n));
1320 #else
1321  const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
1322  // BZHI only looks at the lower 8 bits of n!
1323  return Mask256<T>::FromBits(
1324  (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
1325 #endif // HWY_ARCH_X86_64
1326 #else
1327  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1328  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
1329 #endif
1330 }
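// Usage sketch (illustrative only, d = Full256<float>(), i.e. 8 lanes):
//   const auto m = FirstN(d, 3);                     // lanes 0..2 true, 3..7 false
//   const auto v = IfThenElseZero(m, Set(d, 1.0f));  // 1,1,1,0,0,0,0,0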
1331 
1332 // ================================================== ARITHMETIC
1333 
1334 // ------------------------------ Addition
1335 
1336 // Unsigned
1337 HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
1338  const Vec256<uint8_t> b) {
1339  return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1340 }
1341 HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
1342  const Vec256<uint16_t> b) {
1343  return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
1344 }
1345 HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
1346  const Vec256<uint32_t> b) {
1347  return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
1348 }
1349 HWY_API Vec256<uint64_t> operator+(const Vec256<uint64_t> a,
1350  const Vec256<uint64_t> b) {
1351  return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)};
1352 }
1353 
1354 // Signed
1355 HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
1356  const Vec256<int8_t> b) {
1357  return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1358 }
1359 HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
1360  const Vec256<int16_t> b) {
1361  return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1362 }
1363 HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
1364  const Vec256<int32_t> b) {
1365  return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1366 }
1367 HWY_API Vec256<int64_t> operator+(const Vec256<int64_t> a,
1368  const Vec256<int64_t> b) {
1369  return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
1370 }
1371 
1372 // Float
1373 HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
1374  return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1375 }
1376 HWY_API Vec256<double> operator+(const Vec256<double> a,
1377  const Vec256<double> b) {
1378  return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
1379 }
1380 
1381 // ------------------------------ Subtraction
1382 
1383 // Unsigned
1384 HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
1385  const Vec256<uint8_t> b) {
1386  return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1387 }
1388 HWY_API Vec256<uint16_t> operator-(const Vec256<uint16_t> a,
1389  const Vec256<uint16_t> b) {
1390  return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1391 }
1392 HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
1393  const Vec256<uint32_t> b) {
1394  return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1395 }
1396 HWY_API Vec256<uint64_t> operator-(const Vec256<uint64_t> a,
1397  const Vec256<uint64_t> b) {
1398  return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1399 }
1400 
1401 // Signed
1402 HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
1403  const Vec256<int8_t> b) {
1404  return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1405 }
1406 HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
1407  const Vec256<int16_t> b) {
1408  return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1409 }
1410 HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
1411  const Vec256<int32_t> b) {
1412  return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1413 }
1414 HWY_API Vec256<int64_t> operator-(const Vec256<int64_t> a,
1415  const Vec256<int64_t> b) {
1416  return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1417 }
1418 
1419 // Float
1420 HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
1421  return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1422 }
1423 HWY_API Vec256<double> operator-(const Vec256<double> a,
1424  const Vec256<double> b) {
1425  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
1426 }
1427 
1428 // ------------------------------ SumsOf8
1429 HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
1430  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
1431 }
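// SumsOf8 relies on _mm256_sad_epu8 against zero: the sum of absolute
// differences of each 8-byte group versus 0 is just the sum of those bytes,
// written to the corresponding u64 lane. Worked example (illustrative only):
// if every input byte is 1, each of the four u64 lanes holds 8.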
1432 
1433 // ------------------------------ SaturatedAdd
1434 
1435 // Returns a + b clamped to the destination range.
1436 
1437 // Unsigned
1438 HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
1439  const Vec256<uint8_t> b) {
1440  return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
1441 }
1442 HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
1443  const Vec256<uint16_t> b) {
1444  return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
1445 }
1446 
1447 // Signed
1448 HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
1449  const Vec256<int8_t> b) {
1450  return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
1451 }
1452 HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
1453  const Vec256<int16_t> b) {
1454  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1455 }
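// Worked examples (illustrative only, du8 = Full256<uint8_t>(),
// di8 = Full256<int8_t>()):
//   SaturatedAdd(Set(du8, 200), Set(du8, 100));  // 255 in every lane, no wrap
//   SaturatedAdd(Set(di8, 120), Set(di8, 20));   // 127 (INT8_MAX) in every lane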
1456 
1457 // ------------------------------ SaturatedSub
1458 
1459 // Returns a - b clamped to the destination range.
1460 
1461 // Unsigned
1462 HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
1463  const Vec256<uint8_t> b) {
1464  return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
1465 }
1466 HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
1467  const Vec256<uint16_t> b) {
1468  return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
1469 }
1470 
1471 // Signed
1472 HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
1473  const Vec256<int8_t> b) {
1474  return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
1475 }
1476 HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
1477  const Vec256<int16_t> b) {
1478  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
1479 }
1480 
1481 // ------------------------------ Average
1482 
1483 // Returns (a + b + 1) / 2
1484 
1485 // Unsigned
1486 HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
1487  const Vec256<uint8_t> b) {
1488  return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
1489 }
1490 HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
1491  const Vec256<uint16_t> b) {
1492  return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
1493 }
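// Worked example (illustrative only, d = Full256<uint8_t>()): the +1 makes
// AverageRound round up on ties, e.g. AverageRound(Set(d, 1), Set(d, 2))
// yields (1 + 2 + 1) / 2 = 2 in every lane.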
1494 
1495 // ------------------------------ Abs (Sub)
1496 
1497 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1498 HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
1499 #if HWY_COMPILER_MSVC
1500  // Workaround for incorrect codegen? (wrong result)
1501  const auto zero = Zero(Full256<int8_t>());
1502  return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
1503 #else
1504  return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
1505 #endif
1506 }
1507 HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
1508  return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
1509 }
1510 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
1511  return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
1512 }
1513 // i64 is implemented after BroadcastSignBit.
1514 
1515 HWY_API Vec256<float> Abs(const Vec256<float> v) {
1516  const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
1517  return v & BitCast(Full256<float>(), mask);
1518 }
1519 HWY_API Vec256<double> Abs(const Vec256<double> v) {
1520  const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
1521  return v & BitCast(Full256<double>(), mask);
1522 }
1523 
1524 // ------------------------------ Integer multiplication
1525 
1526 // Unsigned
1527 HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1528  return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1529 }
1530 HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1531  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1532 }
1533 
1534 // Signed
1535 HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
1536  return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1537 }
1538 HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
1539  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1540 }
1541 
1542 // Returns the upper 16 bits of a * b in each lane.
1543 HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1544  return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
1545 }
1546 HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
1547  return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
1548 }
1549 
1550 HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
1551  return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
1552 }
1553 
1554 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
1555 // even and the upper half into its odd neighbor lane.
1556 HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
1557  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
1558 }
1559 HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1560  return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
1561 }
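// Worked examples (illustrative only, d16 = Full256<uint16_t>()):
//   MulHigh(Set(d16, 0x4000), Set(d16, 4));  // 0x4000 * 4 = 0x10000, upper 16 bits = 1
// MulEven multiplies lanes 0, 2, 4, 6 of the 32-bit inputs and stores each
// full 64-bit product in the u64/i64 lane covering that even/odd lane pair.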
1562 
1563 // ------------------------------ ShiftLeft
1564 
1565 template <int kBits>
1566 HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
1567  return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
1568 }
1569 
1570 template <int kBits>
1571 HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
1572  return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
1573 }
1574 
1575 template <int kBits>
1576 HWY_API Vec256<uint64_t> ShiftLeft(const Vec256<uint64_t> v) {
1577  return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
1578 }
1579 
1580 template <int kBits>
1581 HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
1582  return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
1583 }
1584 
1585 template <int kBits>
1586 HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
1587  return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
1588 }
1589 
1590 template <int kBits>
1591 HWY_API Vec256<int64_t> ShiftLeft(const Vec256<int64_t> v) {
1592  return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
1593 }
1594 
1595 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
1596 HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
1597  const Full256<T> d8;
1598  const RepartitionToWide<decltype(d8)> d16;
1599  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
1600  return kBits == 1
1601  ? (v + v)
1602  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1603 }
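// The 8-bit ShiftLeft works on 16-bit lanes and then clears the bits that
// crossed a byte boundary via (0xFF << kBits) & 0xFF. Worked example
// (illustrative only, d8 = Full256<uint8_t>()): ShiftLeft<3>(Set(d8, 0x81))
// yields 0x08 in every lane, because the per-byte mask 0xF8 discards the bits
// shifted into the neighboring byte.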
1604 
1605 // ------------------------------ ShiftRight
1606 
1607 template <int kBits>
1608 HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
1609  return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
1610 }
1611 
1612 template <int kBits>
1613 HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
1614  return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
1615 }
1616 
1617 template <int kBits>
1618 HWY_API Vec256<uint64_t> ShiftRight(const Vec256<uint64_t> v) {
1619  return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
1620 }
1621 
1622 template <int kBits>
1623 HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
1624  const Full256<uint8_t> d8;
1625  // Use raw instead of BitCast to support N=1.
1626  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
1627  return shifted & Set(d8, 0xFF >> kBits);
1628 }
1629 
1630 template <int kBits>
1631 HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
1632  return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
1633 }
1634 
1635 template <int kBits>
1636 HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
1637  return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
1638 }
1639 
1640 template <int kBits>
1641 HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
1642  const Full256<int8_t> di;
1643  const Full256<uint8_t> du;
1644  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1645  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1646  return (shifted ^ shifted_sign) - shifted_sign;
1647 }
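// The signed 8-bit ShiftRight uses the xor-and-subtract sign-extension trick:
// after a logical shift, 0x80 >> kBits marks where the sign bit landed, and
// (shifted ^ shifted_sign) - shifted_sign propagates it. Worked example
// (illustrative only): kBits = 2 on lane value -4 (0xFC): the logical shift
// gives 0x3F, then (0x3F ^ 0x20) - 0x20 = 0x1F - 0x20 = 0xFF = -1, as expected.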
1648 
1649 // i64 is implemented after BroadcastSignBit.
1650 
1651 // ------------------------------ RotateRight
1652 
1653 template <int kBits>
1654 HWY_API Vec256<uint32_t> RotateRight(const Vec256<uint32_t> v) {
1655  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1656 #if HWY_TARGET <= HWY_AVX3
1657  return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)};
1658 #else
1659  if (kBits == 0) return v;
1660  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
1661 #endif
1662 }
1663 
1664 template <int kBits>
1665 HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
1666  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1667 #if HWY_TARGET <= HWY_AVX3
1668  return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)};
1669 #else
1670  if (kBits == 0) return v;
1671  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
1672 #endif
1673 }
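// Worked example (illustrative only): without AVX3, RotateRight<8> on u32
// lanes expands to Or(ShiftRight<8>(v), ShiftLeft<24>(v)), so a lane holding
// 0x11223344 becomes 0x44112233.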
1674 
1675 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1676 
1677 HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
1678  return VecFromMask(v < Zero(Full256<int8_t>()));
1679 }
1680 
1681 HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
1682  return ShiftRight<15>(v);
1683 }
1684 
1685 HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
1686  return ShiftRight<31>(v);
1687 }
1688 
1689 HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
1690 #if HWY_TARGET == HWY_AVX2
1691  return VecFromMask(v < Zero(Full256<int64_t>()));
1692 #else
1693  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
1694 #endif
1695 }
1696 
1697 template <int kBits>
1698 HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
1699 #if HWY_TARGET <= HWY_AVX3
1700  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, kBits)};
1701 #else
1702  const Full256<int64_t> di;
1703  const Full256<uint64_t> du;
1704  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1705  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
1706  return right | sign;
1707 #endif
1708 }
1709 
1710 HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
1711 #if HWY_TARGET <= HWY_AVX3
1712  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
1713 #else
1714  const auto zero = Zero(Full256<int64_t>());
1715  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1716 #endif
1717 }
1718 
1719 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1720 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
1721  Vec256<int8_t> no) {
1722  // int8: AVX2 IfThenElse only looks at the MSB.
1723  return IfThenElse(MaskFromVec(v), yes, no);
1724 }
1725 
1726 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1727 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1728  static_assert(IsSigned<T>(), "Only works for signed/float");
1729  const Full256<T> d;
1730  const RebindToSigned<decltype(d)> di;
1731 
1732  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
1733  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
1734  return IfThenElse(MaskFromVec(v), yes, no);
1735 }
1736 
1737 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
1738 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1739  static_assert(IsSigned<T>(), "Only works for signed/float");
1740  const Full256<T> d;
1741  const RebindToFloat<decltype(d)> df;
1742 
1743  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
1744  const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
1745  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
1746 }
1747 
1748 // ------------------------------ ShiftLeftSame
1749 
1750 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
1751  const int bits) {
1752  return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1753 }
1754 HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
1755  const int bits) {
1756  return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1757 }
1758 HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v,
1759  const int bits) {
1760  return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1761 }
1762 
1763 HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
1764  return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1765 }
1766 
1767 HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
1768  return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1769 }
1770 
1771 HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) {
1772  return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1773 }
1774 
1775 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1776 HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
1777  const Full256<T> d8;
1778  const RepartitionToWide<decltype(d8)> d16;
1779  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
1780  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
1781 }
1782 
1783 // ------------------------------ ShiftRightSame (BroadcastSignBit)
1784 
1785 HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
1786  const int bits) {
1787  return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1788 }
1789 HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
1790  const int bits) {
1791  return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1792 }
1793 HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
1794  const int bits) {
1795  return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1796 }
1797 
1798 HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
1799  const Full256<uint8_t> d8;
1800  const RepartitionToWide<decltype(d8)> d16;
1801  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1802  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1803 }
1804 
1805 HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
1806  const int bits) {
1807  return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1808 }
1809 
1810 HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
1811  const int bits) {
1812  return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1813 }
1814 HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
1815  const int bits) {
1816 #if HWY_TARGET <= HWY_AVX3
1817  return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1818 #else
1819  const Full256<int64_t> di;
1820  const Full256<uint64_t> du;
1821  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1822  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
1823  return right | sign;
1824 #endif
1825 }
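// Worked example of the AVX2 fallback above: for v = -8 (0xFFFFFFFFFFFFFFF8)
// and bits = 2, the unsigned shift yields 0x3FFFFFFFFFFFFFFE;
// BroadcastSignBit(v) is all-ones, and shifting it left by 64 - 2 = 62 gives
// 0xC000000000000000. OR-ing the two restores the sign bits:
// 0xFFFFFFFFFFFFFFFE = -2, which matches an arithmetic shift right by 2.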
1826 
1827 HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
1828  const Full256<int8_t> di;
1829  const Full256<uint8_t> du;
1830  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1831  const auto shifted_sign =
1832  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1833  return (shifted ^ shifted_sign) - shifted_sign;
1834 }
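// Worked example of the sign-extension trick above: for v = -4 (0xFC) and
// bits = 2, the unsigned shift gives 0x3F and shifted_sign = 0x80 >> 2 = 0x20.
// Then (0x3F ^ 0x20) - 0x20 = 0x1F - 0x20 = -1 = 0xFF, i.e. an arithmetic
// shift. Non-negative inputs are unchanged because the XOR sets that bit and
// the subtraction clears it again.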
1835 
1836 // ------------------------------ Neg (Xor, Sub)
1837 
1838 template <typename T, HWY_IF_FLOAT(T)>
1839 HWY_API Vec256<T> Neg(const Vec256<T> v) {
1840  return Xor(v, SignBit(Full256<T>()));
1841 }
1842 
1843 template <typename T, HWY_IF_NOT_FLOAT(T)>
1844 HWY_API Vec256<T> Neg(const Vec256<T> v) {
1845  return Zero(Full256<T>()) - v;
1846 }
1847 
1848 // ------------------------------ Floating-point mul / div
1849 
1850 HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
1851  return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
1852 }
1853 HWY_API Vec256<double> operator*(const Vec256<double> a,
1854  const Vec256<double> b) {
1855  return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
1856 }
1857 
1858 HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
1859  return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
1860 }
1861 HWY_API Vec256<double> operator/(const Vec256<double> a,
1862  const Vec256<double> b) {
1863  return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
1864 }
1865 
1866 // Approximate reciprocal
1867 HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
1868  return Vec256<float>{_mm256_rcp_ps(v.raw)};
1869 }
1870 
1871 // Absolute value of difference.
1872 HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
1873  return Abs(a - b);
1874 }
1875 
1876 // ------------------------------ Floating-point multiply-add variants
1877 
1878 // Returns mul * x + add
1879 HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
1880  const Vec256<float> add) {
1881 #ifdef HWY_DISABLE_BMI2_FMA
1882  return mul * x + add;
1883 #else
1884  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
1885 #endif
1886 }
1887 HWY_API Vec256<double> MulAdd(const Vec256<double> mul, const Vec256<double> x,
1888  const Vec256<double> add) {
1889 #ifdef HWY_DISABLE_BMI2_FMA
1890  return mul * x + add;
1891 #else
1892  return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
1893 #endif
1894 }
1895 
1896 // Returns add - mul * x
1897 HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
1898  const Vec256<float> add) {
1899 #ifdef HWY_DISABLE_BMI2_FMA
1900  return add - mul * x;
1901 #else
1902  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
1903 #endif
1904 }
1905 HWY_API Vec256<double> NegMulAdd(const Vec256<double> mul,
1906  const Vec256<double> x,
1907  const Vec256<double> add) {
1908 #ifdef HWY_DISABLE_BMI2_FMA
1909  return add - mul * x;
1910 #else
1911  return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
1912 #endif
1913 }
1914 
1915 // Returns mul * x - sub
1916 HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
1917  const Vec256<float> sub) {
1918 #ifdef HWY_DISABLE_BMI2_FMA
1919  return mul * x - sub;
1920 #else
1921  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
1922 #endif
1923 }
1924 HWY_API Vec256<double> MulSub(const Vec256<double> mul, const Vec256<double> x,
1925  const Vec256<double> sub) {
1926 #ifdef HWY_DISABLE_BMI2_FMA
1927  return mul * x - sub;
1928 #else
1929  return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
1930 #endif
1931 }
1932 
1933 // Returns -mul * x - sub
1934 HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
1935  const Vec256<float> sub) {
1936 #ifdef HWY_DISABLE_BMI2_FMA
1937  return Neg(mul * x) - sub;
1938 #else
1939  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1940 #endif
1941 }
1942 HWY_API Vec256<double> NegMulSub(const Vec256<double> mul,
1943  const Vec256<double> x,
1944  const Vec256<double> sub) {
1945 #ifdef HWY_DISABLE_BMI2_FMA
1946  return Neg(mul * x) - sub;
1947 #else
1948  return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1949 #endif
1950 }
1951 
1952 // ------------------------------ Floating-point square root
1953 
1954 // Full precision square root
1955 HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
1956  return Vec256<float>{_mm256_sqrt_ps(v.raw)};
1957 }
1958 HWY_API Vec256<double> Sqrt(const Vec256<double> v) {
1959  return Vec256<double>{_mm256_sqrt_pd(v.raw)};
1960 }
1961 
1962 // Approximate reciprocal square root
1963 HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
1964  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
1965 }
1966 
1967 // ------------------------------ Floating-point rounding
1968 
1969 // Toward nearest integer, tie to even
1970 HWY_API Vec256<float> Round(const Vec256<float> v) {
1971  return Vec256<float>{
1972  _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1973 }
1974 HWY_API Vec256<double> Round(const Vec256<double> v) {
1975  return Vec256<double>{
1976  _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1977 }
1978 
1979 // Toward zero, aka truncate
1980 HWY_API Vec256<float> Trunc(const Vec256<float> v) {
1981  return Vec256<float>{
1982  _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1983 }
1984 HWY_API Vec256<double> Trunc(const Vec256<double> v) {
1985  return Vec256<double>{
1986  _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1987 }
1988 
1989 // Toward +infinity, aka ceiling
1990 HWY_API Vec256<float> Ceil(const Vec256<float> v) {
1991  return Vec256<float>{
1992  _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1993 }
1994 HWY_API Vec256<double> Ceil(const Vec256<double> v) {
1995  return Vec256<double>{
1996  _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1997 }
1998 
1999 // Toward -infinity, aka floor
2000 HWY_API Vec256<float> Floor(const Vec256<float> v) {
2001  return Vec256<float>{
2002  _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2003 }
2004 HWY_API Vec256<double> Floor(const Vec256<double> v) {
2005  return Vec256<double>{
2006  _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2007 }
2008 
2009 // ================================================== MEMORY
2010 
2011 // ------------------------------ Load
2012 
2013 template <typename T>
2014 HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
2015  return Vec256<T>{
2016  _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
2017 }
2018 HWY_API Vec256<float> Load(Full256<float> /* tag */,
2019  const float* HWY_RESTRICT aligned) {
2020  return Vec256<float>{_mm256_load_ps(aligned)};
2021 }
2022 HWY_API Vec256<double> Load(Full256<double> /* tag */,
2023  const double* HWY_RESTRICT aligned) {
2024  return Vec256<double>{_mm256_load_pd(aligned)};
2025 }
2026 
2027 template <typename T>
2028 HWY_API Vec256<T> LoadU(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2029  return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
2030 }
2031 HWY_API Vec256<float> LoadU(Full256<float> /* tag */,
2032  const float* HWY_RESTRICT p) {
2033  return Vec256<float>{_mm256_loadu_ps(p)};
2034 }
2035 HWY_API Vec256<double> LoadU(Full256<double> /* tag */,
2036  const double* HWY_RESTRICT p) {
2037  return Vec256<double>{_mm256_loadu_pd(p)};
2038 }
2039 
2040 // ------------------------------ MaskedLoad
2041 
2042 #if HWY_TARGET <= HWY_AVX3
2043 
2044 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2045 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2046  const T* HWY_RESTRICT p) {
2047  return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, p)};
2048 }
2049 
2050 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2051 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2052  const T* HWY_RESTRICT p) {
2053  return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
2054 }
2055 
2056 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2057 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2058  const T* HWY_RESTRICT p) {
2059  return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
2060 }
2061 
2062 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2063 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2064  const T* HWY_RESTRICT p) {
2065  return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
2066 }
2067 
2068 HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> /* tag */,
2069  const float* HWY_RESTRICT p) {
2070  return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)};
2071 }
2072 
2073 HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> /* tag */,
2074  const double* HWY_RESTRICT p) {
2075  return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)};
2076 }
2077 
2078 #else // AVX2
2079 
2080 // There is no maskload_epi8/16, so blend instead.
2081 template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2082 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
2083  const T* HWY_RESTRICT p) {
2084  return IfThenElseZero(m, LoadU(d, p));
2085 }
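// Usage sketch (illustrative; `count` and `p` are assumptions of this
// example): loading the first `count` int16_t lanes, remaining lanes zero.
//   const Full256<int16_t> d;
//   const Vec256<int16_t> v = MaskedLoad(FirstN(d, count), d, p);
// Note that on this AVX2 path the full 32 bytes at p are read (LoadU above),
// so that memory must be accessible even for lanes the mask disables.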
2086 
2087 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2088 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2089  const T* HWY_RESTRICT p) {
2090  auto pi = reinterpret_cast<const int*>(p); // NOLINT
2091  return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
2092 }
2093 
2094 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2095 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2096  const T* HWY_RESTRICT p) {
2097  auto pi = reinterpret_cast<const long long*>(p); // NOLINT
2098  return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
2099 }
2100 
2101 HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> d,
2102  const float* HWY_RESTRICT p) {
2103  const Vec256<int32_t> mi =
2104  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2105  return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
2106 }
2107 
2108 HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> d,
2109  const double* HWY_RESTRICT p) {
2110  const Vec256<int64_t> mi =
2111  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2112  return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
2113 }
2114 
2115 #endif
2116 
2117 // ------------------------------ LoadDup128
2118 
2119 // Loads 128 bits and duplicates them into both 128-bit halves. This avoids the
2120 // 3-cycle cost of moving data between 128-bit halves and also avoids port 5.
2121 template <typename T>
2122 HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2123 #if HWY_LOADDUP_ASM
2124  __m256i out;
2125  asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
2126  return Vec256<T>{out};
2127 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2128  // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
2129  // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
2130  // upper half undefined) is fine because we're overwriting that anyway.
2131  const __m128i v128 = LoadU(Full128<T>(), p).raw;
2132  return Vec256<T>{
2133  _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
2134 #else
2135  return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
2136 #endif
2137 }
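// Usage sketch (illustrative; kTbl and idx are assumptions of this example):
// broadcasting a 16-byte lookup table so TableLookupBytes (defined below)
// sees the same table in both 128-bit blocks.
//   alignas(16) static constexpr uint8_t kTbl[16] = {/* 16 table bytes */};
//   const Full256<uint8_t> d8;
//   const Vec256<uint8_t> tbl = LoadDup128(d8, kTbl);
//   const Vec256<uint8_t> out = TableLookupBytes(tbl, idx);  // per-block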
2138 HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
2139  const float* const HWY_RESTRICT p) {
2140 #if HWY_LOADDUP_ASM
2141  __m256 out;
2142  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
2143  return Vec256<float>{out};
2144 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2145  const __m128 v128 = LoadU(Full128<float>(), p).raw;
2146  return Vec256<float>{
2147  _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
2148 #else
2149  return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
2150 #endif
2151 }
2152 HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
2153  const double* const HWY_RESTRICT p) {
2154 #if HWY_LOADDUP_ASM
2155  __m256d out;
2156  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
2157  return Vec256<double>{out};
2158 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2159  const __m128d v128 = LoadU(Full128<double>(), p).raw;
2160  return Vec256<double>{
2161  _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2162 #else
2163  return Vec256<double>{
2164  _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
2165 #endif
2166 }
2167 
2168 // ------------------------------ Store
2169 
2170 template <typename T>
2171 HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
2172  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2173 }
2174 HWY_API void Store(const Vec256<float> v, Full256<float> /* tag */,
2175  float* HWY_RESTRICT aligned) {
2176  _mm256_store_ps(aligned, v.raw);
2177 }
2178 HWY_API void Store(const Vec256<double> v, Full256<double> /* tag */,
2179  double* HWY_RESTRICT aligned) {
2180  _mm256_store_pd(aligned, v.raw);
2181 }
2182 
2183 template <typename T>
2184 HWY_API void StoreU(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT p) {
2185  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
2186 }
2187 HWY_API void StoreU(const Vec256<float> v, Full256<float> /* tag */,
2188  float* HWY_RESTRICT p) {
2189  _mm256_storeu_ps(p, v.raw);
2190 }
2191 HWY_API void StoreU(const Vec256<double> v, Full256<double> /* tag */,
2192  double* HWY_RESTRICT p) {
2193  _mm256_storeu_pd(p, v.raw);
2194 }
2195 
2196 // ------------------------------ BlendedStore
2197 
2198 #if HWY_TARGET <= HWY_AVX3
2199 
2200 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2201 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2202  T* HWY_RESTRICT p) {
2203  _mm256_mask_storeu_epi8(p, m.raw, v.raw);
2204 }
2205 
2206 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2207 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2208  T* HWY_RESTRICT p) {
2209  _mm256_mask_storeu_epi16(p, m.raw, v.raw);
2210 }
2211 
2212 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2213 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2214  T* HWY_RESTRICT p) {
2215  _mm256_mask_storeu_epi32(p, m.raw, v.raw);
2216 }
2217 
2218 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2219 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2220  T* HWY_RESTRICT p) {
2221  _mm256_mask_storeu_epi64(p, m.raw, v.raw);
2222 }
2223 
2224 HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m,
2225  Full256<float> /* tag */, float* HWY_RESTRICT p) {
2226  _mm256_mask_storeu_ps(p, m.raw, v.raw);
2227 }
2228 
2229 HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2230  Full256<double> /* tag */, double* HWY_RESTRICT p) {
2231  _mm256_mask_storeu_pd(p, m.raw, v.raw);
2232 }
2233 
2234 #else // AVX2
2235 
2236 // Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
2237 // allows AC# if "Alignment checking enabled and: 256-bit memory operand not
2238 // 32-byte aligned". Fortunately AC# is not enabled by default and requires both
2239 // OS support (CR0) and the application to set rflags.AC. We assume these remain
2240 // disabled because x86/x64 code and compiler output often contain misaligned
2241 // scalar accesses, which would also fault.
2242 //
2243 // Caveat: these are slow on AMD Jaguar/Bulldozer.
2244 
2245 template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2246 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2247  T* HWY_RESTRICT p) {
2248  // There is no maskload_epi8/16. Blending is also unsafe because loading a
2249  // full vector that crosses the array end causes asan faults. Resort to scalar
2250  // code; the caller should instead use memcpy, assuming m is FirstN(d, n).
2251  const RebindToUnsigned<decltype(d)> du;
2252  using TU = TFromD<decltype(du)>;
2253  alignas(32) TU buf[32 / sizeof(T)];
2254  alignas(32) TU mask[32 / sizeof(T)];
2255  Store(BitCast(du, v), du, buf);
2256  Store(BitCast(du, VecFromMask(d, m)), du, mask);
2257  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
2258  if (mask[i]) {
2259  CopyBytes<sizeof(T)>(buf + i, p + i);
2260  }
2261  }
2262 }
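// Usage sketch (illustrative; `count`, `p` and `v` are assumptions of this
// example): storing only the first `count` lanes of a uint16_t vector.
//   const Full256<uint16_t> d;
//   BlendedStore(v, FirstN(d, count), d, p);
// As noted above, for 1/2-byte lanes on AVX2 this falls back to a scalar
// loop, so Store into a buffer plus memcpy of count * sizeof(T) bytes may be
// preferable.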
2263 
2264 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2265 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2266  T* HWY_RESTRICT p) {
2267  auto pi = reinterpret_cast<int*>(p); // NOLINT
2268  _mm256_maskstore_epi32(pi, m.raw, v.raw);
2269 }
2270 
2271 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2272 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2273  T* HWY_RESTRICT p) {
2274  auto pi = reinterpret_cast<long long*>(p); // NOLINT
2275  _mm256_maskstore_epi64(pi, m.raw, v.raw);
2276 }
2277 
2278 HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, Full256<float> d,
2279  float* HWY_RESTRICT p) {
2280  const Vec256<int32_t> mi =
2281  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2282  _mm256_maskstore_ps(p, mi.raw, v.raw);
2283 }
2284 
2285 HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2286  Full256<double> d, double* HWY_RESTRICT p) {
2287  const Vec256<int64_t> mi =
2288  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2289  _mm256_maskstore_pd(p, mi.raw, v.raw);
2290 }
2291 
2292 #endif
2293 
2294 // ------------------------------ Non-temporal stores
2295 
2296 template <typename T>
2297 HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
2298  T* HWY_RESTRICT aligned) {
2299  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2300 }
2301 HWY_API void Stream(const Vec256<float> v, Full256<float> /* tag */,
2302  float* HWY_RESTRICT aligned) {
2303  _mm256_stream_ps(aligned, v.raw);
2304 }
2305 HWY_API void Stream(const Vec256<double> v, Full256<double> /* tag */,
2306  double* HWY_RESTRICT aligned) {
2307  _mm256_stream_pd(aligned, v.raw);
2308 }
2309 
2310 // ------------------------------ Scatter
2311 
2312 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2313 HWY_DIAGNOSTICS(push)
2314 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2315 
2316 #if HWY_TARGET <= HWY_AVX3
2317 namespace detail {
2318 
2319 template <typename T>
2320 HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
2321  Full256<T> /* tag */, T* HWY_RESTRICT base,
2322  const Vec256<int32_t> offset) {
2323  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
2324 }
2325 template <typename T>
2326 HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
2327  Full256<T> /* tag */, T* HWY_RESTRICT base,
2328  const Vec256<int32_t> index) {
2329  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
2330 }
2331 
2332 template <typename T>
2333 HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
2334  Full256<T> /* tag */, T* HWY_RESTRICT base,
2335  const Vec256<int64_t> offset) {
2336  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
2337 }
2338 template <typename T>
2339 HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
2340  Full256<T> /* tag */, T* HWY_RESTRICT base,
2341  const Vec256<int64_t> index) {
2342  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
2343 }
2344 
2345 } // namespace detail
2346 
2347 template <typename T, typename Offset>
2348 HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2349  const Vec256<Offset> offset) {
2350  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2351  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2352 }
2353 template <typename T, typename Index>
2354 HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2355  const Vec256<Index> index) {
2356  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2357  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2358 }
2359 
2360 HWY_API void ScatterOffset(Vec256<float> v, Full256<float> /* tag */,
2361  float* HWY_RESTRICT base,
2362  const Vec256<int32_t> offset) {
2363  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
2364 }
2365 HWY_API void ScatterIndex(Vec256<float> v, Full256<float> /* tag */,
2366  float* HWY_RESTRICT base,
2367  const Vec256<int32_t> index) {
2368  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
2369 }
2370 
2372  double* HWY_RESTRICT base,
2373  const Vec256<int64_t> offset) {
2374  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
2375 }
2377  double* HWY_RESTRICT base,
2378  const Vec256<int64_t> index) {
2379  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
2380 }
2381 
2382 #else
2383 
2384 template <typename T, typename Offset>
2385 HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2386  const Vec256<Offset> offset) {
2387  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2388 
2389  constexpr size_t N = 32 / sizeof(T);
2390  alignas(32) T lanes[N];
2391  Store(v, d, lanes);
2392 
2393  alignas(32) Offset offset_lanes[N];
2394  Store(offset, Full256<Offset>(), offset_lanes);
2395 
2396  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2397  for (size_t i = 0; i < N; ++i) {
2398  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2399  }
2400 }
2401 
2402 template <typename T, typename Index>
2403 HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2404  const Vec256<Index> index) {
2405  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2406 
2407  constexpr size_t N = 32 / sizeof(T);
2408  alignas(32) T lanes[N];
2409  Store(v, d, lanes);
2410 
2411  alignas(32) Index index_lanes[N];
2412  Store(index, Full256<Index>(), index_lanes);
2413 
2414  for (size_t i = 0; i < N; ++i) {
2415  base[index_lanes[i]] = lanes[i];
2416  }
2417 }
2418 
2419 #endif
2420 
2421 // ------------------------------ Gather
2422 
2423 namespace detail {
2424 
2425 template <typename T>
2426 HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<4> /* tag */,
2427  Full256<T> /* tag */,
2428  const T* HWY_RESTRICT base,
2429  const Vec256<int32_t> offset) {
2430  return Vec256<T>{_mm256_i32gather_epi32(
2431  reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
2432 }
2433 template <typename T>
2434 HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<4> /* tag */,
2435  Full256<T> /* tag */,
2436  const T* HWY_RESTRICT base,
2437  const Vec256<int32_t> index) {
2438  return Vec256<T>{_mm256_i32gather_epi32(
2439  reinterpret_cast<const int32_t*>(base), index.raw, 4)};
2440 }
2441 
2442 template <typename T>
2443 HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<8> /* tag */,
2444  Full256<T> /* tag */,
2445  const T* HWY_RESTRICT base,
2446  const Vec256<int64_t> offset) {
2447  return Vec256<T>{_mm256_i64gather_epi64(
2448  reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
2449 }
2450 template <typename T>
2451 HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<8> /* tag */,
2452  Full256<T> /* tag */,
2453  const T* HWY_RESTRICT base,
2454  const Vec256<int64_t> index) {
2455  return Vec256<T>{_mm256_i64gather_epi64(
2456  reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
2457 }
2458 
2459 } // namespace detail
2460 
2461 template <typename T, typename Offset>
2462 HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
2463  const Vec256<Offset> offset) {
2464  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2465  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2466 }
2467 template <typename T, typename Index>
2468 HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
2469  const Vec256<Index> index) {
2470  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2471  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2472 }
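// Usage sketch (illustrative; `base` is an assumption of this example):
// gathering eight floats by element index; the index type must be int32_t to
// satisfy the sizeof(T) == sizeof(Index) requirement above.
//   const Full256<float> df;
//   const Full256<int32_t> di;
//   alignas(32) constexpr int32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
//   const Vec256<float> v = GatherIndex(df, base, Load(di, kIdx));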
2473 
2475  const float* HWY_RESTRICT base,
2476  const Vec256<int32_t> offset) {
2477  return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
2478 }
2480  const float* HWY_RESTRICT base,
2481  const Vec256<int32_t> index) {
2482  return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
2483 }
2484 
2486  const double* HWY_RESTRICT base,
2487  const Vec256<int64_t> offset) {
2488  return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
2489 }
2491  const double* HWY_RESTRICT base,
2492  const Vec256<int64_t> index) {
2493  return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
2494 }
2495 
2496 HWY_DIAGNOSTICS(pop)
2497 
2498 // ================================================== SWIZZLE
2499 
2500 // ------------------------------ LowerHalf
2501 
2502 template <typename T>
2503 HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
2504  return Vec128<T>{_mm256_castsi256_si128(v.raw)};
2505 }
2506 HWY_API Vec128<float> LowerHalf(Full128<float> /* tag */, Vec256<float> v) {
2507  return Vec128<float>{_mm256_castps256_ps128(v.raw)};
2508 }
2509 HWY_API Vec128<double> LowerHalf(Full128<double> /* tag */, Vec256<double> v) {
2510  return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
2511 }
2512 
2513 template <typename T>
2514 HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
2515  return LowerHalf(Full128<T>(), v);
2516 }
2517 
2518 // ------------------------------ UpperHalf
2519 
2520 template <typename T>
2521 HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
2522  return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
2523 }
2524 HWY_API Vec128<float> UpperHalf(Full128<float> /* tag */, Vec256<float> v) {
2525  return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)};
2526 }
2527 HWY_API Vec128<double> UpperHalf(Full128<double> /* tag */, Vec256<double> v) {
2528  return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)};
2529 }
2530 
2531 // ------------------------------ GetLane (LowerHalf)
2532 template <typename T>
2533 HWY_API T GetLane(const Vec256<T> v) {
2534  return GetLane(LowerHalf(v));
2535 }
2536 
2537 // ------------------------------ ZeroExtendVector
2538 
2539 // Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
2540 // bits undefined. Although it makes sense for them to be zero (VEX encoded
2541 // 128-bit instructions zero the upper lanes to avoid large penalties), a
2542 // compiler could decide to optimize out code that relies on this.
2543 //
2544 // The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
2545 // zeroing, but it is not available on MSVC, nor on GCC before 10.1. For older
2546 // GCC, we can still obtain the desired code thanks to pattern recognition; note
2547 // the expensive insert instruction is not actually generated, see
2548 // https://gcc.godbolt.org/z/1MKGaP.
2549 
2550 template <typename T>
2551 HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
2552 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2553  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2554 #else
2555  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2556 #endif
2557 }
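// Usage sketch (illustrative; `from` is an assumption of this example):
// widening a 128-bit result so subsequent 256-bit ops see zeros, not
// undefined bits, in the upper block.
//   const Full128<int32_t> d128;
//   const Full256<int32_t> d256;
//   const Vec128<int32_t> lo = Load(d128, from);
//   const Vec256<int32_t> v = ZeroExtendVector(d256, lo);  // upper half = 0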
2558 HWY_API Vec256<float> ZeroExtendVector(Full256<float> /* tag */,
2559  Vec128<float> lo) {
2560 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2561  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
2562 #else
2563  return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
2564 #endif
2565 }
2566 HWY_API Vec256<double> ZeroExtendVector(Full256<double> /* tag */,
2567  Vec128<double> lo) {
2568 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2569  return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
2570 #else
2571  return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
2572 #endif
2573 }
2574 
2575 // ------------------------------ Combine
2576 
2577 template <typename T>
2578 HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
2579  const auto lo256 = ZeroExtendVector(d, lo);
2580  return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2581 }
2582 HWY_API Vec256<float> Combine(Full256<float> d, Vec128<float> hi,
2583  Vec128<float> lo) {
2584  const auto lo256 = ZeroExtendVector(d, lo);
2585  return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
2586 }
2587 HWY_API Vec256<double> Combine(Full256<double> d, Vec128<double> hi,
2588  Vec128<double> lo) {
2589  const auto lo256 = ZeroExtendVector(d, lo);
2590  return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
2591 }
2592 
2593 // ------------------------------ ShiftLeftBytes
2594 
2595 template <int kBytes, typename T>
2596 HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
2597  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2598  // This is the same operation as _mm256_bslli_epi128.
2599  return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
2600 }
2601 
2602 template <int kBytes, typename T>
2603 HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
2604  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
2605 }
2606 
2607 // ------------------------------ ShiftLeftLanes
2608 
2609 template <int kLanes, typename T>
2610 HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
2611  const Repartition<uint8_t, decltype(d)> d8;
2612  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2613 }
2614 
2615 template <int kLanes, typename T>
2616 HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
2617  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
2618 }
2619 
2620 // ------------------------------ ShiftRightBytes
2621 
2622 template <int kBytes, typename T>
2623 HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
2624  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2625  // This is the same operation as _mm256_bsrli_epi128.
2626  return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
2627 }
2628 
2629 // ------------------------------ ShiftRightLanes
2630 template <int kLanes, typename T>
2631 HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
2632  const Repartition<uint8_t, decltype(d)> d8;
2633  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2634 }
2635 
2636 // ------------------------------ CombineShiftRightBytes
2637 
2638 // Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
2639 template <int kBytes, typename T, class V = Vec256<T>>
2640 HWY_API V CombineShiftRightBytes(Full256<T> d, V hi, V lo) {
2641  const Repartition<uint8_t, decltype(d)> d8;
2642  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
2643  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2644 }
2645 
2646 // ------------------------------ Broadcast/splat any lane
2647 
2648 // Unsigned
2649 template <int kLane>
2650 HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
2651  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2652  if (kLane < 4) {
2653  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2654  return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
2655  } else {
2656  const __m256i hi =
2657  _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2658  return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
2659  }
2660 }
2661 template <int kLane>
2662 HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
2663  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2664  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2665 }
2666 template <int kLane>
2667 HWY_API Vec256<uint64_t> Broadcast(const Vec256<uint64_t> v) {
2668  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2669  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2670 }
2671 
2672 // Signed
2673 template <int kLane>
2674 HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
2675  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2676  if (kLane < 4) {
2677  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2678  return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
2679  } else {
2680  const __m256i hi =
2681  _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2682  return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
2683  }
2684 }
2685 template <int kLane>
2686 HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
2687  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2688  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2689 }
2690 template <int kLane>
2691 HWY_API Vec256<int64_t> Broadcast(const Vec256<int64_t> v) {
2692  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2693  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2694 }
2695 
2696 // Float
2697 template <int kLane>
2698 HWY_API Vec256<float> Broadcast(Vec256<float> v) {
2699  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2700  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
2701 }
2702 template <int kLane>
2703 HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
2704  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2705  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
2706 }
2707 
2708 // ------------------------------ Hard-coded shuffles
2709 
2710 // Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2711 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2712 // right (the previous least-significant lane is now most-significant =>
2713 // 47650321). These could also be implemented via CombineShiftRightBytes but
2714 // the shuffle_abcd notation is more convenient.
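// For example, starting from lanes 7,6,5,4,3,2,1,0: Shuffle2301 gives
// 6,7,4,5,2,3,0,1; Shuffle1032 gives 5,4,7,6,1,0,3,2; Shuffle0321 gives
// 4,7,6,5,0,3,2,1; and Shuffle0123 gives 4,5,6,7,0,1,2,3 (each 128-bit block
// is permuted independently).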
2715 
2716 // Swap 32-bit halves in 64-bit halves.
2717 HWY_API Vec256<uint32_t> Shuffle2301(const Vec256<uint32_t> v) {
2718  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2719 }
2720 HWY_API Vec256<int32_t> Shuffle2301(const Vec256<int32_t> v) {
2721  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2722 }
2723 HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
2724  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
2725 }
2726 
2727 // Swap 64-bit halves
2728 HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
2729  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2730 }
2731 HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
2732  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2733 }
2734 HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
2735  // Shorter encoding than _mm256_permute_ps.
2736  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
2737 }
2738 HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
2739  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2740 }
2741 HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
2742  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2743 }
2744 HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
2745  // Shorter encoding than _mm256_permute_pd.
2746  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
2747 }
2748 
2749 // Rotate right 32 bits
2750 HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
2751  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2752 }
2753 HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
2754  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2755 }
2756 HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
2757  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
2758 }
2759 // Rotate left 32 bits
2760 HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
2761  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2762 }
2763 HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
2764  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2765 }
2766 HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
2767  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
2768 }
2769 
2770 // Reverse
2771 HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
2772  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2773 }
2774 HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
2775  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2776 }
2777 HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
2778  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
2779 }
2780 
2781 // ------------------------------ TableLookupLanes
2782 
2783 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2784 template <typename T>
2785 struct Indices256 {
2786  __m256i raw;
2787 };
2788 
2789 // Native 8x32 instruction: indices remain unchanged
2790 template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
2791 HWY_API Indices256<T> IndicesFromVec(Full256<T> /* tag */, Vec256<TI> vec) {
2792  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2793 #if HWY_IS_DEBUG_BUILD
2794  const Full256<TI> di;
2795  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2796  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));
2797 #endif
2798  return Indices256<T>{vec.raw};
2799 }
2800 
2801 // 64-bit lanes: convert indices to 8x32 unless AVX3 is available
2802 template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
2803 HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> idx64) {
2804  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2805  const Rebind<TI, decltype(d)> di;
2806  (void)di; // potentially unused
2807 #if HWY_IS_DEBUG_BUILD
2808  HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) &&
2809  AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
2810 #endif
2811 
2812 #if HWY_TARGET <= HWY_AVX3
2813  (void)d;
2814  return Indices256<T>{idx64.raw};
2815 #else
2816  const Repartition<float, decltype(d)> df; // 32-bit!
2817  // Replicate 64-bit index into upper 32 bits
2818  const Vec256<TI> dup =
2819  BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
2820  // For each idx64 i, idx32 are 2*i and 2*i+1.
2821  const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
2822  return Indices256<T>{idx32.raw};
2823 #endif
2824 }
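// Worked example of the AVX2 index conversion above: a 64-bit index i = 3 is
// duplicated into both 32-bit halves (3,3), doubled to (6,6), and adding
// 1 << 32 increments the upper half only, giving (2*i+1, 2*i) = (7,6). The
// 8x32 permute then moves both 32-bit halves of source lane 3 as a unit.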
2825 
2826 template <typename T, typename TI>
2827 HWY_API Indices256<T> SetTableIndices(const Full256<T> d, const TI* idx) {
2828  const Rebind<TI, decltype(d)> di;
2829  return IndicesFromVec(d, LoadU(di, idx));
2830 }
2831 
2832 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2833 HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2834  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2835 }
2836 
2837 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2838 HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2839 #if HWY_TARGET <= HWY_AVX3
2840  return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
2841 #else
2842  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2843 #endif
2844 }
2845 
2846 HWY_API Vec256<float> TableLookupLanes(const Vec256<float> v,
2847  const Indices256<float> idx) {
2848  return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
2849 }
2850 
2851 HWY_API Vec256<double> TableLookupLanes(const Vec256<double> v,
2852  const Indices256<double> idx) {
2853 #if HWY_TARGET <= HWY_AVX3
2854  return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)};
2855 #else
2856  const Full256<double> df;
2857  const Full256<uint64_t> du;
2858  return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32(
2859  BitCast(du, v).raw, idx.raw)});
2860 #endif
2861 }
2862 
2863 // ------------------------------ Reverse (RotateRight)
2864 
2865 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2866 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
2867  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2868  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2869 }
2870 
2871 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2872 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
2873  alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
2874  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2875 }
2876 
2877 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2878 HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
2879 #if HWY_TARGET <= HWY_AVX3
2880  const RebindToSigned<decltype(d)> di;
2881  alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2882  7, 6, 5, 4, 3, 2, 1, 0};
2883  const Vec256<int16_t> idx = Load(di, kReverse);
2884  return BitCast(d, Vec256<int16_t>{
2885  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2886 #else
2887  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
2888  const Vec256<uint32_t> rev32 = Reverse(du32, BitCast(du32, v));
2889  return BitCast(d, RotateRight<16>(rev32));
2890 #endif
2891 }
2892 
2893 // ------------------------------ Reverse2
2894 
2895 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2896 HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
2897  const Full256<uint32_t> du32;
2898  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2899 }
2900 
2901 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2902 HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
2903  return Shuffle2301(v);
2904 }
2905 
2906 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2907 HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
2908  return Shuffle01(v);
2909 }
2910 
2911 // ------------------------------ Reverse4
2912 
2913 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2914 HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
2915 #if HWY_TARGET <= HWY_AVX3
2916  const RebindToSigned<decltype(d)> di;
2917  alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
2918  11, 10, 9, 8, 15, 14, 13, 12};
2919  const Vec256<int16_t> idx = Load(di, kReverse4);
2920  return BitCast(d, Vec256<int16_t>{
2921  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2922 #else
2923  const RepartitionToWide<decltype(d)> dw;
2924  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
2925 #endif
2926 }
2927 
2928 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2929 HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
2930  return Shuffle0123(v);
2931 }
2932 
2933 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2934 HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
2935  return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2936 }
2937 HWY_API Vec256<double> Reverse4(Full256<double> /* tag */, const Vec256<double> v) {
2938  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2939 }
2940 
2941 // ------------------------------ Reverse8
2942 
2943 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2944 HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
2945 #if HWY_TARGET <= HWY_AVX3
2946  const RebindToSigned<decltype(d)> di;
2947  alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2948  15, 14, 13, 12, 11, 10, 9, 8};
2949  const Vec256<int16_t> idx = Load(di, kReverse8);
2950  return BitCast(d, Vec256<int16_t>{
2951  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2952 #else
2953  const RepartitionToWide<decltype(d)> dw;
2954  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
2955 #endif
2956 }
2957 
2958 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2959 HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
2960  return Reverse(d, v);
2961 }
2962 
2963 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2964 HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
2965  HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes
2966 }
2967 
2968 // ------------------------------ InterleaveLower
2969 
2970 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2971 // the least-significant lane) and "b". To concatenate two half-width integers
2972 // into one, use ZipLower/Upper instead (also works with scalar).
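// For example, with uint32_t lanes a = 7a,6a,5a,4a,3a,2a,1a,0a and
// b = 7b,...,0b, InterleaveLower yields 5b,5a,4b,4a,1b,1a,0b,0a: only the
// lower half of each 128-bit block participates, and lanes alternate b,a.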
2973 
2974 HWY_API Vec256<uint8_t> InterleaveLower(const Vec256<uint8_t> a,
2975  const Vec256<uint8_t> b) {
2976  return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
2977 }
2978 HWY_API Vec256<uint16_t> InterleaveLower(const Vec256<uint16_t> a,
2979  const Vec256<uint16_t> b) {
2980  return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
2981 }
2982 HWY_API Vec256<uint32_t> InterleaveLower(const Vec256<uint32_t> a,
2983  const Vec256<uint32_t> b) {
2984  return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
2985 }
2986 HWY_API Vec256<uint64_t> InterleaveLower(const Vec256<uint64_t> a,
2987  const Vec256<uint64_t> b) {
2988  return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
2989 }
2990 
2991 HWY_API Vec256<int8_t> InterleaveLower(const Vec256<int8_t> a,
2992  const Vec256<int8_t> b) {
2993  return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
2994 }
2995 HWY_API Vec256<int16_t> InterleaveLower(const Vec256<int16_t> a,
2996  const Vec256<int16_t> b) {
2997  return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
2998 }
2999 HWY_API Vec256<int32_t> InterleaveLower(const Vec256<int32_t> a,
3000  const Vec256<int32_t> b) {
3001  return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3002 }
3003 HWY_API Vec256<int64_t> InterleaveLower(const Vec256<int64_t> a,
3004  const Vec256<int64_t> b) {
3005  return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3006 }
3007 
3008 HWY_API Vec256<float> InterleaveLower(const Vec256<float> a,
3009  const Vec256<float> b) {
3010  return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
3011 }
3012 HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
3013  const Vec256<double> b) {
3014  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
3015 }
3016 
3017 // ------------------------------ InterleaveUpper
3018 
3019 // All functions inside detail lack the required D parameter.
3020 namespace detail {
3021 
3022 HWY_API Vec256<uint8_t> InterleaveUpper(const Vec256<uint8_t> a,
3023  const Vec256<uint8_t> b) {
3024  return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3025 }
3026 HWY_API Vec256<uint16_t> InterleaveUpper(const Vec256<uint16_t> a,
3027  const Vec256<uint16_t> b) {
3028  return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3029 }
3030 HWY_API Vec256<uint32_t> InterleaveUpper(const Vec256<uint32_t> a,
3031  const Vec256<uint32_t> b) {
3032  return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3033 }
3034 HWY_API Vec256<uint64_t> InterleaveUpper(const Vec256<uint64_t> a,
3035  const Vec256<uint64_t> b) {
3036  return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3037 }
3038 
3039 HWY_API Vec256<int8_t> InterleaveUpper(const Vec256<int8_t> a,
3040  const Vec256<int8_t> b) {
3041  return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3042 }
3043 HWY_API Vec256<int16_t> InterleaveUpper(const Vec256<int16_t> a,
3044  const Vec256<int16_t> b) {
3045  return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3046 }
3047 HWY_API Vec256<int32_t> InterleaveUpper(const Vec256<int32_t> a,
3048  const Vec256<int32_t> b) {
3049  return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3050 }
3051 HWY_API Vec256<int64_t> InterleaveUpper(const Vec256<int64_t> a,
3052  const Vec256<int64_t> b) {
3053  return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3054 }
3055 
3056 HWY_API Vec256<float> InterleaveUpper(const Vec256<float> a,
3057  const Vec256<float> b) {
3058  return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
3059 }
3060 HWY_API Vec256<double> InterleaveUpper(const Vec256<double> a,
3061  const Vec256<double> b) {
3062  return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)};
3063 }
3064 
3065 } // namespace detail
3066 
3067 template <typename T, class V = Vec256<T>>
3068 HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
3069  return detail::InterleaveUpper(a, b);
3070 }
3071 
3072 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3073 
3074 // Same as Interleave*, except that the return lanes are double-width integers;
3075 // this is necessary because the single-lane scalar cannot return two values.
3076 template <typename T, typename TW = MakeWide<T>>
3077 HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
3078  return BitCast(Full256<TW>(), InterleaveLower(a, b));
3079 }
3080 template <typename T, typename TW = MakeWide<T>>
3081 HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3082  return BitCast(dw, InterleaveLower(a, b));
3083 }
3084 
3085 template <typename T, typename TW = MakeWide<T>>
3086 HWY_API Vec256<TW> ZipUpper(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3087  return BitCast(dw, InterleaveUpper(Full256<T>(), a, b));
3088 }
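// For example, zipping uint8_t vectors a and b with ZipLower produces
// uint16_t lanes whose low byte comes from a and high byte from b: if
// a[0] = 0x11 and b[0] = 0x22, the first uint16_t lane is 0x2211.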
3089 
3090 // ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
3091 
3092 // _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is
3093 // slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and for
3094 // UpperUpper at the cost of one extra cycle/instruction.
3095 
3096 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3097 template <typename T>
3098 HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
3099  const Vec256<T> lo) {
3100  const Half<decltype(d)> d2;
3101  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
3102 }
3103 HWY_API Vec256<float> ConcatLowerLower(Full256<float> d, const Vec256<float> hi,
3104  const Vec256<float> lo) {
3105  const Half<decltype(d)> d2;
3106  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
3107 }
3108 HWY_API Vec256<double> ConcatLowerLower(Full256<double> d,
3109  const Vec256<double> hi,
3110  const Vec256<double> lo) {
3111  const Half<decltype(d)> d2;
3112  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
3113 }
3114 
3115 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
3116 template <typename T>
3117 HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
3118  const Vec256<T> lo) {
3119  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
3120 }
3121 HWY_API Vec256<float> ConcatLowerUpper(Full256<float> /* tag */,
3122  const Vec256<float> hi,
3123  const Vec256<float> lo) {
3124  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
3125 }
3126 HWY_API Vec256<double> ConcatLowerUpper(Full256<double> /* tag */,
3127  const Vec256<double> hi,
3128  const Vec256<double> lo) {
3129  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
3130 }
3131 
3132 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3133 template <typename T>
3134 HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
3135  const Vec256<T> lo) {
3136  return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
3137 }
3138 HWY_API Vec256<float> ConcatUpperLower(Full256<float> /* tag */,
3139  const Vec256<float> hi,
3140  const Vec256<float> lo) {
3141  return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
3142 }
3143 HWY_API Vec256<double> ConcatUpperLower(Full256<double> /* tag */,
3144  const Vec256<double> hi,
3145  const Vec256<double> lo) {
3146  return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
3147 }
3148 
3149 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3150 template <typename T>
3151 HWY_API Vec256<T> ConcatUpperUpper(Full256<T> d, const Vec256<T> hi,
3152  const Vec256<T> lo) {
3153  const Half<decltype(d)> d2;
3154  return ConcatUpperLower(d, hi, ZeroExtendVector(d, UpperHalf(d2, lo)));
3155 }
3156 
3157 // ------------------------------ ConcatOdd
3158 
3159 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3160 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3161  const RebindToUnsigned<decltype(d)> du;
3162 #if HWY_TARGET <= HWY_AVX3
3163  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3164  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3165  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3166  BitCast(du, hi).raw)});
3167 #else
3168  const RebindToFloat<decltype(d)> df;
3169  const Vec256<float> v3131{_mm256_shuffle_ps(
3170  BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
3171  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
3172  _MM_SHUFFLE(3, 1, 2, 0))};
3173 #endif
3174 }
3175 
3176 HWY_API Vec256<float> ConcatOdd(Full256<float> d, Vec256<float> hi,
3177  Vec256<float> lo) {
3178  const RebindToUnsigned<decltype(d)> du;
3179 #if HWY_TARGET <= HWY_AVX3
3180  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3181  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3182  __mmask8{0xFF}, hi.raw)};
3183 #else
3184  const Vec256<float> v3131{
3185  _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3186  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3187  BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3188 #endif
3189 }
3190 
3191 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3192 HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3193  const RebindToUnsigned<decltype(d)> du;
3194 #if HWY_TARGET <= HWY_AVX3
3195  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3196  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3197  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3198  BitCast(du, hi).raw)});
3199 #else
3200  const RebindToFloat<decltype(d)> df;
3201  const Vec256<double> v31{
3202  _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
3203  return Vec256<T>{
3204  _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3205 #endif
3206 }
3207 
3208 HWY_API Vec256<double> ConcatOdd(Full256<double> d, Vec256<double> hi,
3209  Vec256<double> lo) {
3210 #if HWY_TARGET <= HWY_AVX3
3211  const RebindToUnsigned<decltype(d)> du;
3212  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3213  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3214  __mmask8{0xFF}, hi.raw)};
3215 #else
3216  (void)d;
3217  const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
3218  return Vec256<double>{
3219  _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3220 #endif
3221 }
3222 
3223 // ------------------------------ ConcatEven
3224 
3225 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3226 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3227  const RebindToUnsigned<decltype(d)> du;
3228 #if HWY_TARGET <= HWY_AVX3
3229  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3230  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3231  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3232  BitCast(du, hi).raw)});
3233 #else
3234  const RebindToFloat<decltype(d)> df;
3235  const Vec256<float> v2020{_mm256_shuffle_ps(
3236  BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
3237  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
3238  _MM_SHUFFLE(3, 1, 2, 0))};
3239 
3240 #endif
3241 }
3242 
3243 HWY_API Vec256<float> ConcatEven(Full256<float> d, Vec256<float> hi,
3244  Vec256<float> lo) {
3245  const RebindToUnsigned<decltype(d)> du;
3246 #if HWY_TARGET <= HWY_AVX3
3247  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3248  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3249  __mmask8{0xFF}, hi.raw)};
3250 #else
3251  const Vec256<float> v2020{
3252  _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3253  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3254  BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3255 
3256 #endif
3257 }
3258 
3259 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3260 HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3261  const RebindToUnsigned<decltype(d)> du;
3262 #if HWY_TARGET <= HWY_AVX3
3263  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3264  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3265  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3266  BitCast(du, hi).raw)});
3267 #else
3268  const RebindToFloat<decltype(d)> df;
3269  const Vec256<double> v20{
3270  _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
3271  return Vec256<T>{
3272  _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3273 
3274 #endif
3275 }
3276 
3277 HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
3278  Vec256<double> lo) {
3279 #if HWY_TARGET <= HWY_AVX3
3280  const RebindToUnsigned<decltype(d)> du;
3281  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3282  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3283  __mmask8{0xFF}, hi.raw)};
3284 #else
3285  (void)d;
3286  const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
3287  return Vec256<double>{
3288  _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3289 #endif
3290 }
3291 
3292 // ------------------------------ DupEven (InterleaveLower)
3293 
3294 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3295 HWY_API Vec256<T> DupEven(Vec256<T> v) {
3296  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3297 }
3298 HWY_API Vec256<float> DupEven(Vec256<float> v) {
3299  return Vec256<float>{
3300  _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3301 }
3302 
3303 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3304 HWY_API Vec256<T> DupEven(const Vec256<T> v) {
3305  return InterleaveLower(Full256<T>(), v, v);
3306 }
3307 
3308 // ------------------------------ DupOdd (InterleaveUpper)
3309 
3310 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3311 HWY_API Vec256<T> DupOdd(Vec256<T> v) {
3312  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3313 }
3314 HWY_API Vec256<float> DupOdd(Vec256<float> v) {
3315  return Vec256<float>{
3316  _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3317 }
3318 
3319 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3320 HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
3321  return InterleaveUpper(Full256<T>(), v, v);
3322 }
3323 
3324 // ------------------------------ OddEven
3325 
3326 namespace detail {
3327 
3328 template <typename T>
3329 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
3330  const Vec256<T> b) {
3331  const Full256<T> d;
3332  const Full256<uint8_t> d8;
3333  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3334  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3335  return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
3336 }
3337 template <typename T>
3338 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
3339  const Vec256<T> b) {
3340  return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
3341 }
3342 template <typename T>
3343 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
3344  const Vec256<T> b) {
3345  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
3346 }
3347 template <typename T>
3348 HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
3349  const Vec256<T> b) {
3350  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
3351 }
3352 
3353 } // namespace detail
3354 
3355 template <typename T>
3356 HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
3357  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3358 }
3359 HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
3360  return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
3361 }
3362 
3363 HWY_API Vec256<double> OddEven(const Vec256<double> a, const Vec256<double> b) {
3364  return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
3365 }
3366 
3367 // ------------------------------ OddEvenBlocks
3368 
3369 template <typename T>
3370 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
3371  return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
3372 }
3373 
3374 HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
3375  return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
3376 }
3377 
3378 HWY_API Vec256<double> OddEvenBlocks(Vec256<double> odd, Vec256<double> even) {
3379  return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
3380 }
3381 
3382 // ------------------------------ SwapAdjacentBlocks
3383 
3384 template <typename T>
3385 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
3386  return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
3387 }
3388 
3389 HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
3390  const Full256<float> df;
3391  const Full256<int32_t> di;
3392  // Avoid _mm256_permute2f128_ps - slow on AMD.
3393  return BitCast(df, Vec256<int32_t>{_mm256_permute4x64_epi64(
3394  BitCast(di, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
3395 }
3396 
3397 HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
3398  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
3399 }
3400 
3401 // ------------------------------ ReverseBlocks (ConcatLowerUpper)
3402 
3403 template <typename T>
3404 HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
3405  return ConcatLowerUpper(d, v, v);
3406 }
3407 
3408 // ------------------------------ TableLookupBytes (ZeroExtendVector)
3409 
3410 // Both full
3411 template <typename T, typename TI>
3412 HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
3413  const Vec256<TI> from) {
3414  return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
3415 }
3416 
3417 // Partial index vector
3418 template <typename T, typename TI, size_t NI>
3419 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec256<T> bytes,
3420  const Vec128<TI, NI> from) {
3421  // First expand to full 128, then 256.
3422  const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
3423  const auto tbl_full = TableLookupBytes(bytes, from_256);
3424  // Shrink to 128, then partial.
3425  return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
3426 }
3427 
3428 // Partial table vector
3429 template <typename T, size_t N, typename TI>
3430 HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
3431  const Vec256<TI> from) {
3432  // First expand to full 128, then 256.
3433  const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
3434  return TableLookupBytes(bytes_256, from);
3435 }
3436 
3437 // Partial both are handled by x86_128.
3438 
3439 // ------------------------------ Shl (Mul, ZipLower)
3440 
3441 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
3442 namespace detail {
3443 
3444 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
3445 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3446 HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
3447  const Full256<T> d;
3448  const RepartitionToWide<decltype(d)> dw;
3449  const Rebind<float, decltype(dw)> df;
3450  const auto zero = Zero(d);
3451  // Move into exponent (this u16 will become the upper half of an f32)
3452  const auto exp = ShiftLeft<23 - 16>(v);
3453  const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
3454  // Insert 0 into lower halves for reinterpreting as binary32.
3455  const auto f0 = ZipLower(dw, zero, upper);
3456  const auto f1 = ZipUpper(dw, zero, upper);
3457  // Do not use ConvertTo because it checks for overflow, which is redundant
3458  // because we only care about v in [0, 16).
3459  const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
3460  const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
3461  return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3462 }
3463 
3464 } // namespace detail
3465 #endif // HWY_TARGET > HWY_AVX3
3466 
3467 HWY_API Vec256<uint16_t> operator<<(const Vec256<uint16_t> v,
3468  const Vec256<uint16_t> bits) {
3469 #if HWY_TARGET <= HWY_AVX3
3470  return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
3471 #else
3472  return v * detail::Pow2(bits);
3473 #endif
3474 }
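// The AVX2 fallback above relies on the identity, valid for shift counts in
// [0, 16): uint16_t(v << s) == uint16_t(v * (1u << s)). Illustrative sketch
// (not part of the original header):
//   const Full256<uint16_t> d;
//   const auto v = Set(d, 0x00FFu);
//   const auto s = Set(d, 4);
//   const auto r = v << s;  // every lane is 0x0FF0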
3475 
3476 HWY_API Vec256<uint32_t> operator<<(const Vec256<uint32_t> v,
3477  const Vec256<uint32_t> bits) {
3478  return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
3479 }
3480 
3481 HWY_API Vec256<uint64_t> operator<<(const Vec256<uint64_t> v,
3482  const Vec256<uint64_t> bits) {
3483  return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
3484 }
3485 
3486 // Signed left shift is the same as unsigned.
3487 template <typename T, HWY_IF_SIGNED(T)>
3488 HWY_API Vec256<T> operator<<(const Vec256<T> v, const Vec256<T> bits) {
3489  const Full256<T> di;
3490  const Full256<MakeUnsigned<T>> du;
3491  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
3492 }
3493 
3494 // ------------------------------ Shr (MulHigh, IfThenElse, Not)
3495 
3496 HWY_API Vec256<uint16_t> operator>>(const Vec256<uint16_t> v,
3497  const Vec256<uint16_t> bits) {
3498 #if HWY_TARGET <= HWY_AVX3
3499  return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
3500 #else
3501  const Full256<uint16_t> d;
3502  // For bits=0, we cannot mul by 2^16, so fix the result later.
3503  const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
3504  // Replace output with input where bits == 0.
3505  return IfThenElse(bits == Zero(d), v, out);
3506 #endif
3507 }
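// The AVX2 fallback above uses the identity, valid for shift counts in
// [1, 16): v >> s == MulHigh(v, uint16_t(1u << (16 - s))). s == 0 is handled
// separately via IfThenElse because 2^16 does not fit in a 16-bit lane.
// (Illustrative note, not part of the original header.)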
3508 
3509 HWY_API Vec256<uint32_t> operator>>(const Vec256<uint32_t> v,
3510  const Vec256<uint32_t> bits) {
3511  return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
3512 }
3513 
3514 HWY_API Vec256<uint64_t> operator>>(const Vec256<uint64_t> v,
3515  const Vec256<uint64_t> bits) {
3516  return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
3517 }
3518 
3519 HWY_API Vec256<int16_t> operator>>(const Vec256<int16_t> v,
3520  const Vec256<int16_t> bits) {
3521 #if HWY_TARGET <= HWY_AVX3
3522  return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
3523 #else
3524  return detail::SignedShr(Full256<int16_t>(), v, bits);
3525 #endif
3526 }
3527 
3528 HWY_API Vec256<int32_t> operator>>(const Vec256<int32_t> v,
3529  const Vec256<int32_t> bits) {
3530  return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
3531 }
3532 
3533 HWY_API Vec256<int64_t> operator>>(const Vec256<int64_t> v,
3534  const Vec256<int64_t> bits) {
3535 #if HWY_TARGET <= HWY_AVX3
3536  return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
3537 #else
3538  return detail::SignedShr(Full256<int64_t>(), v, bits);
3539 #endif
3540 }
3541 
3542 HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
3543  const Vec256<uint64_t> b) {
3544  const DFromV<decltype(a)> du64;
3545  const RepartitionToNarrow<decltype(du64)> du32;
3546  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3547  const auto a32 = BitCast(du32, a);
3548  const auto b32 = BitCast(du32, b);
3549  // Inputs for MulEven: we only need the lower 32 bits
3550  const auto aH = Shuffle2301(a32);
3551  const auto bH = Shuffle2301(b32);
3552 
3553  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
3554  // the even (lower 64 bits of every 128-bit block) results. See
3555  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
3556  const auto aLbL = MulEven(a32, b32);
3557  const auto w3 = aLbL & maskL;
3558 
3559  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3560  const auto w2 = t2 & maskL;
3561  const auto w1 = ShiftRight<32>(t2);
3562 
3563  const auto t = MulEven(a32, bH) + w2;
3564  const auto k = ShiftRight<32>(t);
3565 
3566  const auto mulH = MulEven(aH, bH) + w1 + k;
3567  const auto mulL = ShiftLeft<32>(t) + w3;
3568  return InterleaveLower(mulL, mulH);
3569 }
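// Illustrative usage (not part of the original header): MulEven returns the
// full 128-bit product of the even-indexed u64 lanes; within each 128-bit
// block, the even lane holds the low half and the odd lane the high half.
//   const Full256<uint64_t> d;
//   const auto a = Set(d, 0xFFFFFFFFFFFFFFFFull);
//   const auto b = Set(d, 2ull);
//   const auto p = MulEven(a, b);  // per block: lane 0 = 0xFFFFFFFFFFFFFFFE,
//                                  //            lane 1 = 1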
3570 
3571 HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
3572  const Vec256<uint64_t> b) {
3573  const DFromV<decltype(a)> du64;
3574  const RepartitionToNarrow<decltype(du64)> du32;
3575  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3576  const auto a32 = BitCast(du32, a);
3577  const auto b32 = BitCast(du32, b);
3578  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3579  const auto aH = Shuffle2301(a32);
3580  const auto bH = Shuffle2301(b32);
3581 
3582  // Same as above, but we're using the odd results (upper 64 bits per block).
3583  const auto aLbL = MulEven(a32, b32);
3584  const auto w3 = aLbL & maskL;
3585 
3586  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3587  const auto w2 = t2 & maskL;
3588  const auto w1 = ShiftRight<32>(t2);
3589 
3590  const auto t = MulEven(a32, bH) + w2;
3591  const auto k = ShiftRight<32>(t);
3592 
3593  const auto mulH = MulEven(aH, bH) + w1 + k;
3594  const auto mulL = ShiftLeft<32>(t) + w3;
3595  return InterleaveUpper(du64, mulL, mulH);
3596 }
3597 
3598 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3599 
3600 HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
3601  Vec256<bfloat16_t> a,
3602  Vec256<bfloat16_t> b,
3603  const Vec256<float> sum0,
3604  Vec256<float>& sum1) {
3605  // TODO(janwas): _mm256_dpbf16_ps when available
3606  const Repartition<uint16_t, decltype(df32)> du16;
3607  const RebindToUnsigned<decltype(df32)> du32;
3608  const Vec256<uint16_t> zero = Zero(du16);
3609  // Lane order within sum0/1 is undefined, hence we can avoid the
3610  // longer-latency lane-crossing PromoteTo.
3611  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
3612  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3613  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
3614  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3615  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3616  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3617 }
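// Illustrative usage (not part of the original header; `a` and `b` are
// assumed to be Vec256<bfloat16_t>): accumulate a widened dot product into
// two partial sums, whose lane order is unspecified, then reduce.
//   const Full256<float> df32;
//   Vec256<float> sum1 = Zero(df32);
//   const Vec256<float> sum0 =
//       ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
//   const float dot = GetLane(SumOfLanes(df32, sum0 + sum1));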
3618 
3619 // ================================================== CONVERT
3620 
3621 // ------------------------------ Promotions (part w/ narrow lanes -> full)
3622 
3623 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3624  const Vec128<float, 4> v) {
3625  return Vec256<double>{_mm256_cvtps_pd(v.raw)};
3626 }
3627 
3628 HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3629  const Vec128<int32_t, 4> v) {
3630  return Vec256<double>{_mm256_cvtepi32_pd(v.raw)};
3631 }
3632 
3633 // Unsigned: zero-extend.
3634 // Note: these have 3 cycle latency; if inputs are already split across the
3635 // 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
3636 HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
3637  Vec128<uint8_t> v) {
3638  return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
3639 }
3640 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3641  Vec128<uint8_t, 8> v) {
3642  return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)};
3643 }
3644 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3645  Vec128<uint8_t> v) {
3646  return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
3647 }
3648 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3649  Vec128<uint8_t, 8> v) {
3650  return Vec256<int32_t>{_mm256_cvtepu8_epi32(v.raw)};
3651 }
3652 HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3653  Vec128<uint16_t> v) {
3654  return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
3655 }
3656 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3657  Vec128<uint16_t> v) {
3658  return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
3659 }
3660 HWY_API Vec256<uint64_t> PromoteTo(Full256<uint64_t> /* tag */,
3661  Vec128<uint32_t> v) {
3662  return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)};
3663 }
3664 
3665 // Signed: replicate sign bit.
3666 // Note: these have 3 cycle latency; if inputs are already split across the
3667 // 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
3668 // signed shift would be faster.
3669 HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3670  Vec128<int8_t> v) {
3671  return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
3672 }
3673 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3674  Vec128<int8_t, 8> v) {
3675  return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)};
3676 }
3677 HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3678  Vec128<int16_t> v) {
3679  return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};
3680 }
3681 HWY_API Vec256<int64_t> PromoteTo(Full256<int64_t> /* tag */,
3682  Vec128<int32_t> v) {
3683  return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)};
3684 }
3685 
3686 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3687 
3688 HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
3689  const Vec256<int32_t> v) {
3690  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
3691  // Concatenating lower halves of both 128-bit blocks afterward is more
3692  // efficient than an extra input with low block = high block of v.
3693  return Vec128<uint16_t>{
3694  _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
3695 }
3696 
3697 HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
3698  const Vec256<int32_t> v) {
3699  const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
3700  return Vec128<int16_t>{
3701  _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
3702 }
3703 
3704 HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
3705  const Vec256<int32_t> v) {
3706  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
3707  // Concatenate lower 64 bits of each 128-bit block
3708  const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
3709  const __m128i u16 = _mm256_castsi256_si128(u16_concat);
3710  // packus treats the input as signed; we want unsigned. Clear the MSB to get
3711  // unsigned saturation to u8.
3712  const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
3713  return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
3714 }
3715 
3716 HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
3717  const Vec256<int16_t> v) {
3718  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
3719  return Vec128<uint8_t>{
3720  _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
3721 }
3722 
3723 HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
3724  const Vec256<int32_t> v) {
3725  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
3726  // Concatenate lower 64 bits of each 128-bit block
3727  const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
3728  const __m128i i16 = _mm256_castsi256_si128(i16_concat);
3729  return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
3730 }
3731 
3732 HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
3733  const Vec256<int16_t> v) {
3734  const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
3735  return Vec128<int8_t>{
3736  _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
3737 }
3738 
3739  // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
3740  // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
3741 HWY_DIAGNOSTICS(push)
3742 HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
3743 
3744 HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> df16,
3745  const Vec256<float> v) {
3746 #ifdef HWY_DISABLE_F16C
3747  const RebindToUnsigned<decltype(df16)> du16;
3748  const Rebind<uint32_t, decltype(df16)> du;
3749  const RebindToSigned<decltype(du)> di;
3750  const auto bits32 = BitCast(du, v);
3751  const auto sign = ShiftRight<31>(bits32);
3752  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3753  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3754 
3755  const auto k15 = Set(di, 15);
3756  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3757  const auto is_tiny = exp < Set(di, -24);
3758 
3759  const auto is_subnormal = exp < Set(di, -14);
3760  const auto biased_exp16 =
3761  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3762  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3763  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3764  (mantissa32 >> (Set(du, 13) + sub_exp));
3765  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3766  ShiftRight<13>(mantissa32)); // <1024
3767 
3768  const auto sign16 = ShiftLeft<15>(sign);
3769  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3770  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3771  return BitCast(df16, DemoteTo(du16, bits16));
3772 #else
3773  (void)df16;
3774  return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3775 #endif
3776 }
3777 
3778 HWY_DIAGNOSTICS(pop)
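// Worked example for the HWY_DISABLE_F16C path above (illustrative only):
// v = 1.5f has bits 0x3FC00000, i.e. sign = 0, biased_exp32 = 127 and
// mantissa32 = 0x400000. Then exp = 0, biased_exp16 = 15 and
// mantissa16 = 0x400000 >> 13 = 0x200, so the result is
// (15 << 10) | 0x200 = 0x3E00, the binary16 encoding of 1.5.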
3779 
3780 HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
3781  const Vec256<float> v) {
3782  // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
3783  const Rebind<int32_t, decltype(dbf16)> di32;
3784  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3785  const Rebind<uint16_t, decltype(dbf16)> du16;
3786  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3787  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3788 }
3789 
3790 HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
3791  Vec256<float> a, Vec256<float> b) {
3792  // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
3793  const RebindToUnsigned<decltype(dbf16)> du16;
3794  const Repartition<uint32_t, decltype(dbf16)> du32;
3795  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
3796  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3797 }
3798 
3799 HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
3800  const Vec256<double> v) {
3801  return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
3802 }
3803 
3804 HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
3805  const Vec256<double> v) {
3806  const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
3807  return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
3808 }
3809 
3810 // For already range-limited input [0, 255].
3811 HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
3812  const Full256<uint32_t> d32;
3813  alignas(32) static constexpr uint32_t k8From32[8] = {
3814  0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
3815  // Place first four bytes in lo[0], remaining 4 in hi[1].
3816  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
3817  // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
3818  const auto lo = LowerHalf(quad);
3819  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
3820  const auto pair = LowerHalf(lo | hi);
3821  return BitCast(Full64<uint8_t>(), pair);
3822 }
3823 
3824 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
3825 
3826 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
3827  const Vec256<int32_t> v) {
3828  return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
3829 }
3830 
3831 HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
3832 #if HWY_TARGET <= HWY_AVX3
3833  (void)dd;
3834  return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
3835 #else
3836  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
3837  const Repartition<uint32_t, decltype(dd)> d32;
3838  const Repartition<uint64_t, decltype(dd)> d64;
3839 
3840  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
3841  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
3842  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
3843 
3844  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
3845  const auto k52 = Set(d32, 0x43300000);
3846  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
3847 
3848  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
3849  return (v_upper - k84_63_52) + v_lower; // order matters!
3850 #endif
3851 }
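// Worked example for the AVX2 path above (illustrative only): for v = 1,
// v_upper has bits 0x4530000080000000 = 2^84 + 2^63 (the upper 32 bits of v
// are zero) and v_lower has bits 0x4330000000000001 = 2^52 + 1. Subtracting
// k84_63_52 = 2^84 + 2^63 + 2^52 and adding v_lower yields exactly 1.0.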
3852 
3853 // Truncates (rounds toward zero).
3854 HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
3855  return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
3856 }
3857 
3858 HWY_API Vec256<int64_t> ConvertTo(Full256<int64_t> di, const Vec256<double> v) {
3859 #if HWY_TARGET <= HWY_AVX3
3860  return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
3861 #else
3862  using VI = decltype(Zero(di));
3863  const VI k0 = Zero(di);
3864  const VI k1 = Set(di, 1);
3865  const VI k51 = Set(di, 51);
3866 
3867  // Exponent indicates whether the number can be represented as int64_t.
3868  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
3869  const VI exp = biased_exp - Set(di, 0x3FF);
3870  const auto in_range = exp < Set(di, 63);
3871 
3872  // If we were to cap the exponent at 51 and add 2^52, the number would be in
3873  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
3874  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
3875  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
3876  // manually shift the mantissa into place (we already have many of the
3877  // inputs anyway).
3878  const VI shift_mnt = Max(k51 - exp, k0);
3879  const VI shift_int = Max(exp - k51, k0);
3880  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
3881  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
3882  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
3883  // For inputs larger than 2^52, insert zeros at the bottom.
3884  const VI shifted = int52 << shift_int;
3885  // Restore the one bit lost when shifting in the implicit 1-bit.
3886  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
3887 
3888  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
3889  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
3890  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
3891  const VI magnitude = IfThenElse(in_range, restored, limit);
3892 
3893  // If the input was negative, negate the integer (two's complement).
3894  return (magnitude ^ sign_mask) - sign_mask;
3895 #endif
3896 }
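// Worked example for the AVX2 path above (illustrative only): for v = 3.0,
// biased_exp = 0x400 so exp = 1 and mantissa = 0x8000000000000. Thus
// shift_mnt = 50, shift_int = 0 and int52 = (mantissa | 2^52) >> 51 = 3;
// sign_mask is zero, so the result is 3.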
3897 
3898 HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
3899  const Full256<int32_t> di;
3900  return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
3901 }
3902 
3903 
3904 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
3905  const Vec128<float16_t> v) {
3906 #ifdef HWY_DISABLE_F16C
3907  const RebindToSigned<decltype(df32)> di32;
3908  const RebindToUnsigned<decltype(df32)> du32;
3909  // Expand to u32 so we can shift.
3910  const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
3911  const auto sign = ShiftRight<15>(bits16);
3912  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
3913  const auto mantissa = bits16 & Set(du32, 0x3FF);
3914  const auto subnormal =
3915  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
3916  Set(df32, 1.0f / 16384 / 1024));
3917 
3918  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
3919  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
3920  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3921  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
3922  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3923 #else
3924  (void)df32;
3925  return Vec256<float>{_mm256_cvtph_ps(v.raw)};
3926 #endif
3927 }
3928 
3929 HWY_API Vec256<float> PromoteTo(Full256<float> df32,
3930  const Vec128<bfloat16_t> v) {
3931  const Rebind<uint16_t, decltype(df32)> du16;
3932  const RebindToSigned<decltype(df32)> di32;
3933  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3934 }
3935 
3936 // ================================================== CRYPTO
3937 
3938 #if !defined(HWY_DISABLE_PCLMUL_AES)
3939 
3940 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3941 #ifdef HWY_NATIVE_AES
3942 #undef HWY_NATIVE_AES
3943 #else
3944 #define HWY_NATIVE_AES
3945 #endif
3946 
3947 HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
3948  Vec256<uint8_t> round_key) {
3949 #if HWY_TARGET == HWY_AVX3_DL
3950  return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
3951 #else
3952  const Full256<uint8_t> d;
3953  const Half<decltype(d)> d2;
3954  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3955  AESRound(LowerHalf(state), LowerHalf(round_key)));
3956 #endif
3957 }
3958 
3959 HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
3960  Vec256<uint8_t> round_key) {
3961 #if HWY_TARGET == HWY_AVX3_DL
3962  return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
3963 #else
3964  const Full256<uint8_t> d;
3965  const Half<decltype(d)> d2;
3966  return Combine(d,
3967  AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3968  AESLastRound(LowerHalf(state), LowerHalf(round_key)));
3969 #endif
3970 }
3971 
3972 HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
3973 #if HWY_TARGET == HWY_AVX3_DL
3974  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
3975 #else
3976  const Full256<uint64_t> d;
3977  const Half<decltype(d)> d2;
3978  return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
3979  CLMulLower(LowerHalf(a), LowerHalf(b)));
3980 #endif
3981 }
3982 
3983 HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
3984 #if HWY_TARGET == HWY_AVX3_DL
3985  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
3986 #else
3987  const Full256<uint64_t> d;
3988  const Half<decltype(d)> d2;
3989  return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
3990  CLMulUpper(LowerHalf(a), LowerHalf(b)));
3991 #endif
3992 }
3993 
3994 #endif // HWY_DISABLE_PCLMUL_AES
3995 
3996 // ================================================== MISC
3997 
3998 // Returns a vector with lane i=[0, N) set to "first" + i.
3999 template <typename T, typename T2>
4000 HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
4001  HWY_ALIGN T lanes[32 / sizeof(T)];
4002  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
4003  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
4004  }
4005  return Load(d, lanes);
4006 }
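// Illustrative usage (not part of the original header):
//   const Full256<int32_t> d;
//   const auto v = Iota(d, 10);  // lanes: 10, 11, ..., 17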
4007 
4008 #if HWY_TARGET <= HWY_AVX3
4009 
4010 // ------------------------------ LoadMaskBits
4011 
4012 // `p` points to at least 8 readable bytes, not all of which need be valid.
4013 template <typename T>
4014 HWY_API Mask256<T> LoadMaskBits(const Full256<T> /* tag */,
4015  const uint8_t* HWY_RESTRICT bits) {
4016  constexpr size_t N = 32 / sizeof(T);
4017  constexpr size_t kNumBytes = (N + 7) / 8;
4018 
4019  uint64_t mask_bits = 0;
4020  CopyBytes<kNumBytes>(bits, &mask_bits);
4021 
4022  if (N < 8) {
4023  mask_bits &= (1ull << N) - 1;
4024  }
4025 
4026  return Mask256<T>::FromBits(mask_bits);
4027 }
4028 
4029 // ------------------------------ StoreMaskBits
4030 
4031 // `p` points to at least 8 writable bytes.
4032 template <typename T>
4033 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4034  uint8_t* bits) {
4035  constexpr size_t N = 32 / sizeof(T);
4036  constexpr size_t kNumBytes = (N + 7) / 8;
4037 
4038  CopyBytes<kNumBytes>(&mask.raw, bits);
4039 
4040  // Non-full byte, need to clear the undefined upper bits.
4041  if (N < 8) {
4042  const int mask = static_cast<int>((1ull << N) - 1);
4043  bits[0] = static_cast<uint8_t>(bits[0] & mask);
4044  }
4045  return kNumBytes;
4046 }
4047 
4048 // ------------------------------ Mask testing
4049 
4050 template <typename T>
4051 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4052  return PopCount(static_cast<uint64_t>(mask.raw));
4053 }
4054 
4055 template <typename T>
4056 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4057  const Mask256<T> mask) {
4058  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
4059 }
4060 
4061 // Beware: the suffix indicates the number of mask bits, not lane size!
4062 
4063 namespace detail {
4064 
4065 template <typename T>
4066 HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
4067 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4068  return _kortestz_mask32_u8(mask.raw, mask.raw);
4069 #else
4070  return mask.raw == 0;
4071 #endif
4072 }
4073 template <typename T>
4074 HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
4075 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4076  return _kortestz_mask16_u8(mask.raw, mask.raw);
4077 #else
4078  return mask.raw == 0;
4079 #endif
4080 }
4081 template <typename T>
4082 HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
4083 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4084  return _kortestz_mask8_u8(mask.raw, mask.raw);
4085 #else
4086  return mask.raw == 0;
4087 #endif
4088 }
4089 template <typename T>
4090 HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
4091  return (uint64_t{mask.raw} & 0xF) == 0;
4092 }
4093 
4094 } // namespace detail
4095 
4096 template <typename T>
4097 HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4098  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
4099 }
4100 
4101 namespace detail {
4102 
4103 template <typename T>
4104 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
4105 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4106  return _kortestc_mask32_u8(mask.raw, mask.raw);
4107 #else
4108  return mask.raw == 0xFFFFFFFFu;
4109 #endif
4110 }
4111 template <typename T>
4112 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
4113 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4114  return _kortestc_mask16_u8(mask.raw, mask.raw);
4115 #else
4116  return mask.raw == 0xFFFFu;
4117 #endif
4118 }
4119 template <typename T>
4120 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
4121 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4122  return _kortestc_mask8_u8(mask.raw, mask.raw);
4123 #else
4124  return mask.raw == 0xFFu;
4125 #endif
4126 }
4127 template <typename T>
4128 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
4129  // Cannot use _kortestc because we have less than 8 mask bits.
4130  return mask.raw == 0xFu;
4131 }
4132 
4133 } // namespace detail
4134 
4135 template <typename T>
4136 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4137  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
4138 }
4139 
4140 // ------------------------------ Compress
4141 
4142 // 16-bit is defined in x86_512 so we can use 512-bit vectors.
4143 
4144 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4145 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4146  return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
4147 }
4148 
4149 HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) {
4150  return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
4151 }
4152 
4153 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4154 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4155  // See CompressIsPartition.
4156  alignas(16) constexpr uint64_t packed_array[16] = {
4157  0x3210, 0x3210, 0x3201, 0x3210, 0x3102, 0x3120, 0x3021, 0x3210,
4158  0x2103, 0x2130, 0x2031, 0x2310, 0x1032, 0x1320, 0x0321, 0x3210};
4159 
4160  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4161  // _mm256_permutexvar_epi64 will ignore the upper bits.
4162  const Full256<T> d;
4163  const RebindToUnsigned<decltype(d)> du64;
4164  const auto packed = Set(du64, packed_array[mask.raw]);
4165  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4166  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4167  return TableLookupLanes(v, indices);
4168 }
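// Example of the nibble encoding above (illustrative only): for mask bits
// 0b0101 (lanes 0 and 2 true), packed_array[5] = 0x3120 decodes to indices
// {0, 2, 1, 3}, so TableLookupLanes moves the true lanes to the front and the
// false lanes after them, i.e. a partition (see CompressIsPartition).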
4169 
4170 // ------------------------------ CompressBits (LoadMaskBits)
4171 
4172 template <typename T>
4173 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4174  return Compress(v, LoadMaskBits(Full256<T>(), bits));
4175 }
4176 
4177 // ------------------------------ CompressStore
4178 
4179 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4180 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
4181  T* HWY_RESTRICT unaligned) {
4182  const Rebind<uint16_t, decltype(d)> du;
4183  const auto vu = BitCast(du, v); // (required for float16_t inputs)
4184 
4185  const uint64_t mask_bits{mask.raw};
4186 
4187 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
4188  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
4189 #else
4190  // Split into halves to keep the table size manageable.
4191  const Half<decltype(du)> duh;
4192  const auto vL = LowerHalf(duh, vu);
4193  const auto vH = UpperHalf(duh, vu);
4194 
4195  const uint64_t mask_bitsL = mask_bits & 0xFF;
4196  const uint64_t mask_bitsH = mask_bits >> 8;
4197 
4198  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
4199  const auto idxH = detail::IndicesForCompress16(mask_bitsH);
4200 
4201  // Compress the two 128-bit halves.
4202  const Vec128<uint16_t> cL{_mm_permutexvar_epi16(idxL.raw, vL.raw)};
4203  const Vec128<uint16_t> cH{_mm_permutexvar_epi16(idxH.raw, vH.raw)};
4204  const Half<decltype(d)> dh;
4205  StoreU(BitCast(dh, cL), dh, unaligned);
4206  StoreU(BitCast(dh, cH), dh, unaligned + PopCount(mask_bitsL));
4207 #endif // HWY_TARGET == HWY_AVX3_DL
4208 
4209  return PopCount(mask_bits);
4210 }
4211 
4212 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4213 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4214  T* HWY_RESTRICT unaligned) {
4215  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4216  const size_t count = PopCount(uint64_t{mask.raw});
4217  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
4218 #if HWY_IS_MSAN
4219  __msan_unpoison(unaligned, count * sizeof(T));
4220 #endif
4221  return count;
4222 }
4223 
4224 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4225 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4226  T* HWY_RESTRICT unaligned) {
4227  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4228  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4229  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
4230 #if HWY_IS_MSAN
4231  __msan_unpoison(unaligned, count * sizeof(T));
4232 #endif
4233  return count;
4234 }
4235 
4236 HWY_API size_t CompressStore(Vec256<float> v, Mask256<float> mask,
4237  Full256<float> /* tag */,
4238  float* HWY_RESTRICT unaligned) {
4239  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4240  const size_t count = PopCount(uint64_t{mask.raw});
4241  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
4242 #if HWY_IS_MSAN
4243  __msan_unpoison(unaligned, count * sizeof(float));
4244 #endif
4245  return count;
4246 }
4247 
4248 HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
4249  Full256<double> /* tag */,
4250  double* HWY_RESTRICT unaligned) {
4251  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4252  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4253  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
4254 #if HWY_IS_MSAN
4255  __msan_unpoison(unaligned, count * sizeof(double));
4256 #endif
4257  return count;
4258 }
4259 
4260 // ------------------------------ CompressBlendedStore (CompressStore)
4261 
4262 #if HWY_TARGET <= HWY_AVX3
4263 
4264 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4265 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4266  T* HWY_RESTRICT unaligned) {
4267  // Native (32 or 64-bit) AVX-512 instruction already does the blending at no
4268  // extra cost (latency 11, rthroughput 2 - same as compress plus store).
4269  return CompressStore(v, m, d, unaligned);
4270 }
4271 
4272 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4273 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4274  T* HWY_RESTRICT unaligned) {
4275 #if HWY_TARGET <= HWY_AVX3_DL
4276  return CompressStore(v, m, d, unaligned); // also native
4277 #else
4278  const size_t count = CountTrue(d, m);
4279  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
4280  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
4281 #if HWY_IS_MSAN
4282  __msan_unpoison(unaligned, count * sizeof(T));
4283 #endif
4284  return count;
4285 #endif
4286 }
4287 
4288 #else // AVX2
4289 
4290 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4291 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4292  T* HWY_RESTRICT unaligned) {
4293  const size_t count = CountTrue(d, m);
4294  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
4295  return count;
4296 }
4297 
4298 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4299 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4300  T* HWY_RESTRICT unaligned) {
4301  const size_t count = CountTrue(d, m);
4302  const Vec256<T> compressed = Compress(v, m);
4303 #if HWY_MEM_OPS_MIGHT_FAULT
4304  // BlendedStore tests the mask for each lane, but we know that the mask is
4305  // FirstN, so we can just copy the first `count` lanes.
4306  alignas(32) T buf[16];
4307  Store(compressed, d, buf);
4308  memcpy(unaligned, buf, count * sizeof(T));
4309 #else
4310  BlendedStore(compressed, FirstN(d, count), d, unaligned);
4311 #endif
4312  return count;
4313 }
4314 
4315 #endif // AVX2
4316 
4317 // ------------------------------ CompressBitsStore (LoadMaskBits)
4318 
4319 template <typename T>
4320 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
4321  Full256<T> d, T* HWY_RESTRICT unaligned) {
4322  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4323 }
4324 
4325 #else // AVX2
4326 
4327 // ------------------------------ LoadMaskBits (TestBit)
4328 
4329 namespace detail {
4330 
4331 // 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there.
4332 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4333 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4334  const RebindToUnsigned<decltype(d)> du;
4335  const Repartition<uint32_t, decltype(d)> du32;
4336  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
4337 
4338  // Replicate bytes 8x such that each byte contains the bit that governs it.
4339  const Repartition<uint64_t, decltype(d)> du64;
4340  alignas(32) constexpr uint64_t kRep8[4] = {
4341  0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
4342  0x0303030303030303ull};
4343  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
4344 
4345  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4346  1, 2, 4, 8, 16, 32, 64, 128};
4347  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4348 }
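// Illustrative example (not part of the original header): mask_bits = 0x05
// places 5 in byte 0 of vbits; the kRep8 shuffle copies it into the first
// eight byte lanes, and TestBit against {1, 2, 4, ..., 128} then yields true
// for lanes 0 and 2 only.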
4349 
4350 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4351 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4352  const RebindToUnsigned<decltype(d)> du;
4353  alignas(32) constexpr uint16_t kBit[16] = {
4354  1, 2, 4, 8, 16, 32, 64, 128,
4355  0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
4356  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4357  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4358 }
4359 
4360 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4361 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4362  const RebindToUnsigned<decltype(d)> du;
4363  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4364  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4365  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4366 }
4367 
4368 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4369 HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4370  const RebindToUnsigned<decltype(d)> du;
4371  alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
4372  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4373 }
4374 
4375 } // namespace detail
4376 
4377 // `p` points to at least 8 readable bytes, not all of which need be valid.
4378 template <typename T>
4379 HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
4380  const uint8_t* HWY_RESTRICT bits) {
4381  constexpr size_t N = 32 / sizeof(T);
4382  constexpr size_t kNumBytes = (N + 7) / 8;
4383 
4384  uint64_t mask_bits = 0;
4385  CopyBytes<kNumBytes>(bits, &mask_bits);
4386 
4387  if (N < 8) {
4388  mask_bits &= (1ull << N) - 1;
4389  }
4390 
4391  return detail::LoadMaskBits256(d, mask_bits);
4392 }
4393 
4394 // ------------------------------ StoreMaskBits
4395 
4396 namespace detail {
4397 
4398 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4399 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4400  const Full256<T> d;
4401  const Full256<uint8_t> d8;
4402  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
4403  // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
4404  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
4405 }
4406 
4407 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4408 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4409 #if HWY_ARCH_X86_64
4410  const Full256<T> d;
4411  const Full256<uint8_t> d8;
4412  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4413  const uint64_t sign_bits8 = BitsFromMask(mask8);
4414  // Skip the bits from the lower byte of each u16 (better not to use the
4415  // same packs_epi16 as SSE4, because that requires an extra swizzle here).
4416  return _pext_u64(sign_bits8, 0xAAAAAAAAull);
4417 #else
4418  // Slow workaround for 32-bit builds, which lack _pext_u64.
4419  // Remove useless lower half of each u16 while preserving the sign bit.
4420  // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
4421  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
4422  // Move odd qwords (value zero) to top so they don't affect the mask value.
4423  const auto compressed =
4424  _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
4425  return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
4426 #endif // HWY_ARCH_X86_64
4427 }
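// Illustrative example for the x86-64 path above (not part of the original
// header): if only lane 0 is true, sign_bits8 = 0x00000003 (both bytes of
// that u16 are 0xFF); _pext_u64 with mask 0xAAAAAAAA keeps only the upper
// byte's sign bit of each u16, giving mask_bits = 0x0001.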
4428 
4429 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4430 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4431  const Full256<T> d;
4432  const Full256<float> df;
4433  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4434  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
4435 }
4436 
4437 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4438 HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4439  const Full256<T> d;
4440  const Full256<double> df;
4441  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4442  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
4443 }
4444 
4445 } // namespace detail
4446 
4447 // `p` points to at least 8 writable bytes.
4448 template <typename T>
4449 HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4450  uint8_t* bits) {
4451  constexpr size_t N = 32 / sizeof(T);
4452  constexpr size_t kNumBytes = (N + 7) / 8;
4453 
4454  const uint64_t mask_bits = detail::BitsFromMask(mask);
4455  CopyBytes<kNumBytes>(&mask_bits, bits);
4456  return kNumBytes;
4457 }
4458 
4459 // ------------------------------ Mask testing
4460 
4461 // Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
4462 // lane is 0 or ~0.
4463 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4464 HWY_API bool AllFalse(const Full256<T> d, const Mask256<T> mask) {
4465  const Repartition<uint8_t, decltype(d)> d8;
4466  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4467  return detail::BitsFromMask(mask8) == 0;
4468 }
4469 
4470 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4471 HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4472  // Cheaper than PTEST, which is 2 uop / 3L.
4473  return detail::BitsFromMask(mask) == 0;
4474 }
4475 
4476 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4477 HWY_API bool AllTrue(const Full256<T> d, const Mask256<T> mask) {
4478  const Repartition<uint8_t, decltype(d)> d8;
4479  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4480  return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
4481 }
4482 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4483 HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4484  constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
4485  return detail::BitsFromMask(mask) == kAllBits;
4486 }
4487 
4488 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4489 HWY_API size_t CountTrue(const Full256<T> d, const Mask256<T> mask) {
4490  const Repartition<uint8_t, decltype(d)> d8;
4491  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4492  return PopCount(detail::BitsFromMask(mask8)) >> 1;
4493 }
4494 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4495 HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4496  return PopCount(detail::BitsFromMask(mask));
4497 }
4498 
4499 template <typename T>
4500 HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4501  const Mask256<T> mask) {
4502  const uint64_t mask_bits = detail::BitsFromMask(mask);
4503  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
4504 }
4505 
4506 // ------------------------------ Compress, CompressBits
4507 
4508 namespace detail {
4509 
4510 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4511 HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
4512  uint64_t mask_bits) {
4513  const RebindToUnsigned<decltype(d)> d32;
4514  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4515  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
4516  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
4517  // and unavailable in 32-bit builds. We instead compress each index into 4
4518  // bits, for a total of 1 KiB.
4519  alignas(16) constexpr uint32_t packed_array[256] = {
4520  0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
4521  0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
4522  0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
4523  0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
4524  0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
4525  0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
4526  0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
4527  0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
4528  0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
4529  0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
4530  0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
4531  0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
4532  0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
4533  0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
4534  0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
4535  0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
4536  0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
4537  0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
4538  0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
4539  0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
4540  0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
4541  0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
4542  0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
4543  0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
4544  0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
4545  0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
4546  0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
4547  0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
4548  0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
4549  0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
4550  0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
4551  0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
4552  0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
4553  0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
4554  0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
4555  0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
4556  0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
4557  0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
4558  0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
4559  0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
4560  0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
4561  0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
4562  0x10765432, 0x17654320, 0x07654321, 0x76543210};
4563 
4564  // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
4565  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4566  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
4567  // latency, it may be faster to use LoadDup128 and PSHUFB.
4568  const auto packed = Set(d32, packed_array[mask_bits]);
4569  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4570  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
4571 }
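// Example of the nibble encoding above (illustrative only): for
// mask_bits = 0b00000010 (only lane 1 true), packed_array[2] = 0x76543201
// decodes to indices {1, 0, 2, 3, 4, 5, 6, 7}: lane 1 moves to the front and
// the remaining lanes keep their relative order.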
4572 
4573 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4574 HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
4575  uint64_t mask_bits) {
4576  const Repartition<uint32_t, decltype(d)> d32;
4577 
4578  // For 64-bit, we still need 32-bit indices because there is no 64-bit
4579  // permutevar, but there are only 4 lanes, so we can afford to skip the
4580  // unpacking and load the entire index vector directly.
4581  alignas(32) constexpr uint32_t packed_array[128] = {
4582  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
4583  2, 3, 0, 1, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
4584  4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7, //
4585  2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
4586  6, 7, 0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, //
4587  2, 3, 6, 7, 0, 1, 4, 5, 0, 1, 2, 3, 6, 7, 4, 5, //
4588  4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3, //
4589  2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
4590  return Indices256<uint32_t>{Load(d32, packed_array + 8 * mask_bits).raw};
4591 }
4592 
4593 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4594 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4595  const Full256<T> d;
4596  const Repartition<uint32_t, decltype(d)> du32;
4597 
4598  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
4599  const auto indices = IndicesFromBits(d, mask_bits);
4600  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
4601 }
4602 
4603 // LUTs are infeasible for 2^16 possible masks, so splice together two
4604 // half-vector Compress.
4605 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4606 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4607  const Full256<T> d;
4608  const RebindToUnsigned<decltype(d)> du;
4609  const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
4610  const Half<decltype(du)> duh;
4611  const auto half0 = LowerHalf(duh, vu16);
4612  const auto half1 = UpperHalf(duh, vu16);
4613 
4614  const uint64_t mask_bits0 = mask_bits & 0xFF;
4615  const uint64_t mask_bits1 = mask_bits >> 8;
4616  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
4617  const auto compressed1 = detail::CompressBits(half1, mask_bits1);
4618 
4619  alignas(32) uint16_t all_true[16] = {};
4620  // Store mask=true lanes, left to right.
4621  const size_t num_true0 = PopCount(mask_bits0);
4622  Store(compressed0, duh, all_true);
4623  StoreU(compressed1, duh, all_true + num_true0);
4624 
4625  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
4626  // Store mask=false lanes, right to left. The second vector fills the upper
4627  // half with right-aligned false lanes. The first vector is shifted
4628  // rightwards to overwrite the true lanes of the second.
4629  alignas(32) uint16_t all_false[16] = {};
4630  const size_t num_true1 = PopCount(mask_bits1);
4631  Store(compressed1, duh, all_false + 8);
4632  StoreU(compressed0, duh, all_false + num_true1);
4633 
4634  const auto mask = FirstN(du, num_true0 + num_true1);
4635  return BitCast(d,
4636  IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
4637  } else {
4638  // Only care about the mask=true lanes.
4639  return BitCast(d, Load(du, all_true));
4640  }
4641 }
4642 
4643 } // namespace detail
4644 
4645 template <typename T>
4646 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
4647  const uint64_t mask_bits = detail::BitsFromMask(m);
4648  return detail::Compress(v, mask_bits);
4649 }
4650 
4651 template <typename T>
4652 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4653  constexpr size_t N = 32 / sizeof(T);
4654  constexpr size_t kNumBytes = (N + 7) / 8;
4655 
4656  uint64_t mask_bits = 0;
4657  CopyBytes<kNumBytes>(bits, &mask_bits);
4658 
4659  if (N < 8) {
4660  mask_bits &= (1ull << N) - 1;
4661  }
4662 
4663  return detail::Compress(v, mask_bits);
4664 }
4665 
4666 // ------------------------------ CompressStore, CompressBitsStore
4667 
4668 template <typename T>
4669 HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4670  T* HWY_RESTRICT unaligned) {
4671  const uint64_t mask_bits = detail::BitsFromMask(m);
4672  StoreU(detail::Compress(v, mask_bits), d, unaligned);
4673  return PopCount(mask_bits);
4674 }
4675 
4676 template <typename T>
4677 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4678  T* HWY_RESTRICT unaligned) {
4679  const uint64_t mask_bits = detail::BitsFromMask(m);
4680  const size_t count = PopCount(mask_bits);
4681  BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
4682  return count;
4683 }
4684 
4685 template <typename T>
4686 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
4687  Full256<T> d, T* HWY_RESTRICT unaligned) {
4688  constexpr size_t N = 32 / sizeof(T);
4689  constexpr size_t kNumBytes = (N + 7) / 8;
4690 
4691  uint64_t mask_bits = 0;
4692  CopyBytes<kNumBytes>(bits, &mask_bits);
4693 
4694  if (N < 8) {
4695  mask_bits &= (1ull << N) - 1;
4696  }
4697 
4698  StoreU(detail::Compress(v, mask_bits), d, unaligned);
4699  return PopCount(mask_bits);
4700 }
4701 
4702 #endif // HWY_TARGET <= HWY_AVX3
4703 
4704 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
4705 // TableLookupBytes, ConcatUpperLower)
4706 
4707 HWY_API void StoreInterleaved3(const Vec256<uint8_t> v0,
4708  const Vec256<uint8_t> v1,
4709  const Vec256<uint8_t> v2, Full256<uint8_t> d,
4710  uint8_t* HWY_RESTRICT unaligned) {
4711  const auto k5 = Set(d, 5);
4712  const auto k6 = Set(d, 6);
4713 
4714  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
4715  // 0x80 so lanes to be filled from other vectors are 0 for blending.
4716  alignas(16) static constexpr uint8_t tbl_r0[16] = {
4717  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
4718  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
4719  alignas(16) static constexpr uint8_t tbl_g0[16] = {
4720  0x80, 0, 0x80, 0x80, 1, 0x80, //
4721  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
4722  const auto shuf_r0 = LoadDup128(d, tbl_r0);
4723  const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
4724  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
4725  const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
4726  const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
4727  const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
4728  const auto interleaved_10_00 = r0 | g0 | b0;
4729 
4730  // Second vector: g10,r10, bgr[9:6], b5,g5
4731  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
4732  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
4733  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
4734  const auto r1 = TableLookupBytes(v0, shuf_r1);
4735  const auto g1 = TableLookupBytes(v1, shuf_g1);
4736  const auto b1 = TableLookupBytes(v2, shuf_b1);
4737  const auto interleaved_15_05 = r1 | g1 | b1;
4738 
4739  // We want to write the lower halves of the interleaved vectors, then the
4740  // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but
4741  // that would require two unaligned stores. For the lower halves, we can
4742  // merge two 128-bit stores for the same swizzling cost:
4743  const auto out0 = ConcatLowerLower(d, interleaved_15_05, interleaved_10_00);
4744  StoreU(out0, d, unaligned + 0 * 32);
4745 
4746  // Third vector: bgr[15:11], b10
4747  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
4748  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
4749  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
4750  const auto r2 = TableLookupBytes(v0, shuf_r2);
4751  const auto g2 = TableLookupBytes(v1, shuf_g2);
4752  const auto b2 = TableLookupBytes(v2, shuf_b2);
4753  const auto interleaved_1A_0A = r2 | g2 | b2;
4754 
4755  const auto out1 = ConcatUpperLower(d, interleaved_10_00, interleaved_1A_0A);
4756  StoreU(out1, d, unaligned + 1 * 32);
4757 
4758  const auto out2 = ConcatUpperUpper(d, interleaved_1A_0A, interleaved_15_05);
4759  StoreU(out2, d, unaligned + 2 * 32);
4760 }
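// Illustrative usage (not part of the original header; r, g, b are assumed
// to be Vec256<uint8_t> holding planar channels): writes 96 interleaved
// bytes R0 G0 B0 R1 G1 B1 ...
//   const Full256<uint8_t> d;
//   alignas(32) uint8_t rgb[3 * 32];
//   StoreInterleaved3(r, g, b, d, rgb);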
4761 
4762 // ------------------------------ StoreInterleaved4
4763 
4764 HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
4765  const Vec256<uint8_t> v1,
4766  const Vec256<uint8_t> v2,
4767  const Vec256<uint8_t> v3, Full256<uint8_t> d8,
4768  uint8_t* HWY_RESTRICT unaligned) {
4769  const RepartitionToWide<decltype(d8)> d16;
4770  const RepartitionToWide<decltype(d16)> d32;
4771  // let a,b,c,d denote v0..3.
4772  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
4773  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
4774  const auto ba8 = ZipUpper(d16, v0, v1);
4775  const auto dc8 = ZipUpper(d16, v2, v3);
4776  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a13 d..a10 | d..a03 d..a00
4777  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a17 d..a14 | d..a07 d..a04
4778  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..a1B d..a18 | d..a0B d..a08
4779  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C
4780  // Write lower halves, then upper. vperm2i128 is slow on Zen1 but we can
4781  // efficiently combine two lower halves into 256 bits:
4782  const auto out0 = BitCast(d8, ConcatLowerLower(d32, dcba_4, dcba_0));
4783  const auto out1 = BitCast(d8, ConcatLowerLower(d32, dcba_C, dcba_8));
4784  StoreU(out0, d8, unaligned + 0 * 32);
4785  StoreU(out1, d8, unaligned + 1 * 32);
4786  const auto out2 = BitCast(d8, ConcatUpperUpper(d32, dcba_4, dcba_0));
4787  const auto out3 = BitCast(d8, ConcatUpperUpper(d32, dcba_C, dcba_8));
4788  StoreU(out2, d8, unaligned + 2 * 32);
4789  StoreU(out3, d8, unaligned + 3 * 32);
4790 }
4791 
4792 // ------------------------------ Reductions
4793 
4794 namespace detail {
4795 
4796 // Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block.
4797 // Same logic as x86/128.h, but with Vec256 arguments.
4798 template <typename T>
4799 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
4800  const Vec256<T> v3210) {
4801  const auto v1032 = Shuffle1032(v3210);
4802  const auto v31_20_31_20 = v3210 + v1032;
4803  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
4804  return v20_31_20_31 + v31_20_31_20;
4805 }
4806 template <typename T>
4807 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
4808  const Vec256<T> v3210) {
4809  const auto v1032 = Shuffle1032(v3210);
4810  const auto v31_20_31_20 = Min(v3210, v1032);
4811  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
4812  return Min(v20_31_20_31, v31_20_31_20);
4813 }
4814 template <typename T>
4815 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
4816  const Vec256<T> v3210) {
4817  const auto v1032 = Shuffle1032(v3210);
4818  const auto v31_20_31_20 = Max(v3210, v1032);
4819  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
4820  return Max(v20_31_20_31, v31_20_31_20);
4821 }
4822 
4823 template <typename T>
4824 HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
4825  const Vec256<T> v10) {
4826  const auto v01 = Shuffle01(v10);
4827  return v10 + v01;
4828 }
4829 template <typename T>
4830 HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
4831  const Vec256<T> v10) {
4832  const auto v01 = Shuffle01(v10);
4833  return Min(v10, v01);
4834 }
4835 template <typename T>
4836 HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
4837  const Vec256<T> v10) {
4838  const auto v01 = Shuffle01(v10);
4839  return Max(v10, v01);
4840 }
4841 
4842 // u16/i16
4843 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4844 HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
4845  const Repartition<int32_t, Full256<T>> d32;
4846  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4847  const auto odd = ShiftRight<16>(BitCast(d32, v));
4848  const auto min = MinOfLanes(d32, Min(even, odd));
4849  // Also broadcast into odd lanes.
4850  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
4851 }
4852 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4853 HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
4854  const Repartition<int32_t, Full256<T>> d32;
4855  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4856  const auto odd = ShiftRight<16>(BitCast(d32, v));
4857  const auto max = MaxOfLanes(d32, Max(even, odd));
4858  // Also broadcast into odd lanes.
4859  return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
4860 }
4861 
4862 } // namespace detail
4863 
4864 // Supported for {uif}32x8, {uif}64x4; 16-bit lanes only for Min/Max.
4865 // Returns the result broadcast into each lane.
4865 template <typename T>
4866 HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
4867  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
4868  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
4869 }
4870 template <typename T>
4871 HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
4872  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
4873  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
4874 }
4875 template <typename T>
4876 HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
4877  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
4878  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
4879 }
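// Usage sketch (editor's addition, not part of the original source): the
// reductions return the result broadcast to all lanes; GetLane extracts it.
//   const Full256<float> d;
//   const auto v = Iota(d, 1.0f);                  // {1, 2, ..., 8}
//   const float sum = GetLane(SumOfLanes(d, v));   // 36
//   const float lo  = GetLane(MinOfLanes(d, v));   // 1
//   const float hi  = GetLane(MaxOfLanes(d, v));   // 8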
4880 
4881 // NOLINTNEXTLINE(google-readability-namespace-comments)
4882 } // namespace HWY_NAMESPACE
4883 } // namespace hwy
HWY_AFTER_NAMESPACE();