x86_256-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
17// compiling for that target.
18// External include guard in highway.h - see comment there.
19
20// WARNING: most operations do not cross 128-bit block boundaries. In
21// particular, "Broadcast", pack and zip behavior may be surprising.
22
23// Must come before HWY_COMPILER_CLANGCL
24#include <immintrin.h> // AVX2+
25
26#include "hwy/base.h"
27
28#if HWY_COMPILER_CLANGCL
29// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
30// including these headers when _MSC_VER is defined, like when using clang-cl.
31// Include these directly here.
32#include <avxintrin.h>
33// avxintrin defines __m256i and must come before avx2intrin.
34#include <avx2intrin.h>
35#include <bmi2intrin.h> // _pext_u64
36#include <f16cintrin.h>
37#include <fmaintrin.h>
38#include <smmintrin.h>
39#endif // HWY_COMPILER_CLANGCL
40
41#include <stddef.h>
42#include <stdint.h>
43
44#if HWY_IS_MSAN
45#include <sanitizer/msan_interface.h>
46#endif
47
48// For half-width vectors. Already includes base.h and shared-inl.h.
49#include "hwy/ops/x86_128-inl.h"
50
51HWY_BEFORE_NAMESPACE();
52namespace hwy {
53namespace HWY_NAMESPACE {
54namespace detail {
55
56template <typename T>
57struct Raw256 {
58 using type = __m256i;
59};
60template <>
61struct Raw256<float> {
62 using type = __m256;
63};
64template <>
65struct Raw256<double> {
66 using type = __m256d;
67};
68
69} // namespace detail
70
71template <typename T>
72class Vec256 {
73 using Raw = typename detail::Raw256<T>::type;
74
75 public:
76 // Compound assignment. Only usable if there is a corresponding non-member
77 // binary operator overload. For example, only f32 and f64 support division.
78 HWY_INLINE Vec256& operator*=(const Vec256 other) {
79 return *this = (*this * other);
80 }
81 HWY_INLINE Vec256& operator/=(const Vec256 other) {
82 return *this = (*this / other);
83 }
84 HWY_INLINE Vec256& operator+=(const Vec256 other) {
85 return *this = (*this + other);
86 }
87 HWY_INLINE Vec256& operator-=(const Vec256 other) {
88 return *this = (*this - other);
89 }
90 HWY_INLINE Vec256& operator&=(const Vec256 other) {
91 return *this = (*this & other);
92 }
93 HWY_INLINE Vec256& operator|=(const Vec256 other) {
94 return *this = (*this | other);
95 }
96 HWY_INLINE Vec256& operator^=(const Vec256 other) {
97 return *this = (*this ^ other);
98 }
99
100 Raw raw;
101};
102
103#if HWY_TARGET <= HWY_AVX3
104
105namespace detail {
106
107// Template arg: sizeof(lane type)
108template <size_t size>
109struct RawMask256 {};
110template <>
111struct RawMask256<1> {
112 using type = __mmask32;
113};
114template <>
115struct RawMask256<2> {
116 using type = __mmask16;
117};
118template <>
119struct RawMask256<4> {
120 using type = __mmask8;
121};
122template <>
123struct RawMask256<8> {
124 using type = __mmask8;
125};
126
127} // namespace detail
128
129template <typename T>
130struct Mask256 {
131 using Raw = typename detail::RawMask256<sizeof(T)>::type;
132
133 static Mask256<T> FromBits(uint64_t mask_bits) {
134 return Mask256<T>{static_cast<Raw>(mask_bits)};
135 }
136
137 Raw raw;
138};
139
140#else // AVX2
141
142// FF..FF or 0.
143template <typename T>
144struct Mask256 {
145 typename detail::Raw256<T>::type raw;
146};
147
148#endif // HWY_TARGET <= HWY_AVX3
149
150// ------------------------------ BitCast
151
152namespace detail {
153
154HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
155HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
156HWY_INLINE __m256i BitCastToInteger(__m256d v) {
157 return _mm256_castpd_si256(v);
158}
159
160template <typename T>
161HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
162 return Vec256<uint8_t>{BitCastToInteger(v.raw)};
163}
164
165// Cannot rely on function overloading because return types differ.
166template <typename T>
167struct BitCastFromInteger256 {
168 HWY_INLINE __m256i operator()(__m256i v) { return v; }
169};
170template <>
171struct BitCastFromInteger256<float> {
172 HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
173};
174template <>
175struct BitCastFromInteger256<double> {
176 HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
177};
178
179template <typename T>
180HWY_INLINE Vec256<T> BitCastFromByte(Full256<T> /* tag */, Vec256<uint8_t> v) {
181 return Vec256<T>{BitCastFromInteger256<T>()(v.raw)};
182}
183
184} // namespace detail
185
186template <typename T, typename FromT>
187HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
188 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
189}
190
191// ------------------------------ Set
192
193// Returns an all-zero vector.
194template <typename T>
195HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
196 return Vec256<T>{_mm256_setzero_si256()};
197}
198HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
199 return Vec256<float>{_mm256_setzero_ps()};
200}
201HWY_API Vec256<double> Zero(Full256<double> /* tag */) {
202 return Vec256<double>{_mm256_setzero_pd()};
203}
204
205// Returns a vector with all lanes set to "t".
206HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
207 return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
208}
209HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
210 return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
211}
212HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
213 return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
214}
215HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
216 return Vec256<uint64_t>{
217 _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
218}
219HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
220 return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
221}
222HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
223 return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
224}
225HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
226 return Vec256<int32_t>{_mm256_set1_epi32(t)};
227}
228HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
229 return Vec256<int64_t>{
230 _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
231}
232HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
233 return Vec256<float>{_mm256_set1_ps(t)};
234}
235HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
236 return Vec256<double>{_mm256_set1_pd(t)};
237}
238
239HWY_DIAGNOSTICS(push)
240HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
241
242// Returns a vector with uninitialized elements.
243template <typename T>
244HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
245 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
246 // generate an XOR instruction.
247 return Vec256<T>{_mm256_undefined_si256()};
248}
249HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
250 return Vec256<float>{_mm256_undefined_ps()};
251}
252HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
253 return Vec256<double>{_mm256_undefined_pd()};
254}
255
256HWY_DIAGNOSTICS(pop)
257
258// ================================================== LOGICAL
259
260// ------------------------------ And
261
262template <typename T>
263HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
264 return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
265}
266
267HWY_API Vec256<float> And(const Vec256<float> a, const Vec256<float> b) {
268 return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
269}
270HWY_API Vec256<double> And(const Vec256<double> a, const Vec256<double> b) {
271 return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
272}
273
274// ------------------------------ AndNot
275
276// Returns ~not_mask & mask.
277template <typename T>
278HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
279 return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
280}
281HWY_API Vec256<float> AndNot(const Vec256<float> not_mask,
282 const Vec256<float> mask) {
283 return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
284}
285HWY_API Vec256<double> AndNot(const Vec256<double> not_mask,
286 const Vec256<double> mask) {
287 return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
288}
289
290// ------------------------------ Or
291
292template <typename T>
293HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
294 return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
295}
296
297HWY_API Vec256<float> Or(const Vec256<float> a, const Vec256<float> b) {
298 return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
299}
300HWY_API Vec256<double> Or(const Vec256<double> a, const Vec256<double> b) {
301 return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
302}
303
304// ------------------------------ Xor
305
306template <typename T>
307HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
308 return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
309}
310
311HWY_API Vec256<float> Xor(const Vec256<float> a, const Vec256<float> b) {
312 return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
313}
314HWY_API Vec256<double> Xor(const Vec256<double> a, const Vec256<double> b) {
315 return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
316}
317
318// ------------------------------ Not
319
320template <typename T>
321HWY_API Vec256<T> Not(const Vec256<T> v) {
322 using TU = MakeUnsigned<T>;
323#if HWY_TARGET <= HWY_AVX3
324 const __m256i vu = BitCast(Full256<TU>(), v).raw;
325 return BitCast(Full256<T>(),
326 Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
327#else
328 return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
329#endif
330}
331
332// ------------------------------ Or3
333
334template <typename T>
335HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
336#if HWY_TARGET <= HWY_AVX3
337 const Full256<T> d;
338 const RebindToUnsigned<decltype(d)> du;
339 using VU = VFromD<decltype(du)>;
340 const __m256i ret = _mm256_ternarylogic_epi64(
341 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
342 return BitCast(d, VU{ret});
343#else
344 return Or(o1, Or(o2, o3));
345#endif
346}
347
348// ------------------------------ OrAnd
349
350template <typename T>
351HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
352#if HWY_TARGET <= HWY_AVX3
353 const Full256<T> d;
354 const RebindToUnsigned<decltype(d)> du;
355 using VU = VFromD<decltype(du)>;
356 const __m256i ret = _mm256_ternarylogic_epi64(
357 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
358 return BitCast(d, VU{ret});
359#else
360 return Or(o, And(a1, a2));
361#endif
362}
363
364// ------------------------------ IfVecThenElse
365
366template <typename T>
367HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
368#if HWY_TARGET <= HWY_AVX3
369 const Full256<T> d;
370 const RebindToUnsigned<decltype(d)> du;
371 using VU = VFromD<decltype(du)>;
372 return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
373 BitCast(du, yes).raw,
374 BitCast(du, no).raw, 0xCA)});
375#else
376 return IfThenElse(MaskFromVec(mask), yes, no);
377#endif
378}
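
// Editor's illustrative sketch, not part of the original header (the helper name
// is hypothetical): the classic bitwise-select idiom that IfVecThenElse/OrAnd/Or3
// replace. On AVX-512VL the functions above fold such patterns into one vpternlog.
template <typename T>
HWY_INLINE Vec256<T> ExampleBitSelect(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
  // Bits set in `mask` choose from `yes`, clear bits choose from `no`.
  return Or(And(mask, yes), AndNot(mask, no));
}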
379
380// ------------------------------ Operator overloads (internal-only if float)
381
382template <typename T>
383HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
384 return And(a, b);
385}
386
387template <typename T>
388HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
389 return Or(a, b);
390}
391
392template <typename T>
393HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
394 return Xor(a, b);
395}
396
397// ------------------------------ PopulationCount
398
399// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
400#if HWY_TARGET == HWY_AVX3_DL
401
402#ifdef HWY_NATIVE_POPCNT
403#undef HWY_NATIVE_POPCNT
404#else
405#define HWY_NATIVE_POPCNT
406#endif
407
408namespace detail {
409
410template <typename T>
412 return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
413}
414template <typename T>
416 return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
417}
418template <typename T>
420 return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
421}
422template <typename T>
424 return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
425}
426
427} // namespace detail
428
429template <typename T>
430HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
431 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
432}
433
434#endif // HWY_TARGET == HWY_AVX3_DL
435
436// ================================================== SIGN
437
438// ------------------------------ CopySign
439
440template <typename T>
441HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
442 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
443
444 const Full256<T> d;
445 const auto msb = SignBit(d);
446
447#if HWY_TARGET <= HWY_AVX3
448 const Rebind<MakeUnsigned<T>, decltype(d)> du;
449 // Truth table for msb, magn, sign | bitwise msb ? sign : magn
450 // 0 0 0 | 0
451 // 0 0 1 | 0
452 // 0 1 0 | 1
453 // 0 1 1 | 1
454 // 1 0 0 | 0
455 // 1 0 1 | 1
456 // 1 1 0 | 0
457 // 1 1 1 | 1
458 // The lane size does not matter because we are not using predication.
459 const __m256i out = _mm256_ternarylogic_epi32(
460 BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
461 return BitCast(d, decltype(Zero(du)){out});
462#else
463 return Or(AndNot(msb, magn), And(msb, sign));
464#endif
465}
466
467template <typename T>
468HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
469#if HWY_TARGET <= HWY_AVX3
470 // AVX3 can also handle abs < 0, so no extra action needed.
471 return CopySign(abs, sign);
472#else
473 return Or(abs, And(SignBit(Full256<T>()), sign));
474#endif
475}
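
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): extracts the sign of each lane as +1 or -1 by copying the sign
// bit of x onto a constant magnitude of 1; Set(d, 1) is non-negative, so
// CopySignToAbs applies.
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Vec256<T> ExampleSignOf(Full256<T> d, Vec256<T> x) {
  return CopySignToAbs(Set(d, T{1}), x);
}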
476
477// ================================================== MASK
478
479#if HWY_TARGET <= HWY_AVX3
480
481// ------------------------------ IfThenElse
482
483// Returns mask ? b : a.
484
485namespace detail {
486
487// Templates for signed/unsigned integer of a particular size.
488template <typename T>
490 Vec256<T> yes, Vec256<T> no) {
491 return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
492}
493template <typename T>
495 Vec256<T> yes, Vec256<T> no) {
496 return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
497}
498template <typename T>
500 Vec256<T> yes, Vec256<T> no) {
501 return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
502}
503template <typename T>
505 Vec256<T> yes, Vec256<T> no) {
506 return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
507}
508
509} // namespace detail
510
511template <typename T>
512HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
513 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
514}
515HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
516 Vec256<float> no) {
517 return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)};
518}
519HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
520 Vec256<double> no) {
521 return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)};
522}
523
524namespace detail {
525
526template <typename T>
528 Vec256<T> yes) {
529 return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
530}
531template <typename T>
533 Vec256<T> yes) {
534 return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
535}
536template <typename T>
538 Vec256<T> yes) {
539 return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
540}
541template <typename T>
543 Vec256<T> yes) {
544 return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
545}
546
547} // namespace detail
548
549template <typename T>
550HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
551 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
552}
553HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) {
554 return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
555}
556HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask,
557 Vec256<double> yes) {
558 return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
559}
560
561namespace detail {
562
563template <typename T>
565 Vec256<T> no) {
566 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
567 return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
568}
569template <typename T>
571 Vec256<T> no) {
572 return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
573}
574template <typename T>
576 Vec256<T> no) {
577 return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
578}
579template <typename T>
581 Vec256<T> no) {
582 return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
583}
584
585} // namespace detail
586
587template <typename T>
588HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
589 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
590}
591HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) {
592 return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
593}
594HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
595 return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
596}
597
598template <typename T, HWY_IF_FLOAT(T)>
599HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
600 // AVX3 MaskFromVec only looks at the MSB
601 return IfThenZeroElse(MaskFromVec(v), v);
602}
603
604// ------------------------------ Mask logical
605
606namespace detail {
607
608template <typename T>
610 const Mask256<T> b) {
611#if HWY_COMPILER_HAS_MASK_INTRINSICS
612 return Mask256<T>{_kand_mask32(a.raw, b.raw)};
613#else
614 return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)};
615#endif
616}
617template <typename T>
619 const Mask256<T> b) {
620#if HWY_COMPILER_HAS_MASK_INTRINSICS
621 return Mask256<T>{_kand_mask16(a.raw, b.raw)};
622#else
623 return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)};
624#endif
625}
626template <typename T>
628 const Mask256<T> b) {
629#if HWY_COMPILER_HAS_MASK_INTRINSICS
630 return Mask256<T>{_kand_mask8(a.raw, b.raw)};
631#else
632 return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
633#endif
634}
635template <typename T>
637 const Mask256<T> b) {
638#if HWY_COMPILER_HAS_MASK_INTRINSICS
639 return Mask256<T>{_kand_mask8(a.raw, b.raw)};
640#else
641 return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
642#endif
643}
644
645template <typename T>
647 const Mask256<T> b) {
648#if HWY_COMPILER_HAS_MASK_INTRINSICS
649 return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
650#else
651 return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)};
652#endif
653}
654template <typename T>
656 const Mask256<T> b) {
657#if HWY_COMPILER_HAS_MASK_INTRINSICS
658 return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
659#else
660 return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)};
661#endif
662}
663template <typename T>
665 const Mask256<T> b) {
666#if HWY_COMPILER_HAS_MASK_INTRINSICS
667 return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
668#else
669 return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
670#endif
671}
672template <typename T>
674 const Mask256<T> b) {
675#if HWY_COMPILER_HAS_MASK_INTRINSICS
676 return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
677#else
678 return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
679#endif
680}
681
682template <typename T>
684 const Mask256<T> b) {
685#if HWY_COMPILER_HAS_MASK_INTRINSICS
686 return Mask256<T>{_kor_mask32(a.raw, b.raw)};
687#else
688 return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)};
689#endif
690}
691template <typename T>
693 const Mask256<T> b) {
694#if HWY_COMPILER_HAS_MASK_INTRINSICS
695 return Mask256<T>{_kor_mask16(a.raw, b.raw)};
696#else
697 return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)};
698#endif
699}
700template <typename T>
702 const Mask256<T> b) {
703#if HWY_COMPILER_HAS_MASK_INTRINSICS
704 return Mask256<T>{_kor_mask8(a.raw, b.raw)};
705#else
706 return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
707#endif
708}
709template <typename T>
711 const Mask256<T> b) {
712#if HWY_COMPILER_HAS_MASK_INTRINSICS
713 return Mask256<T>{_kor_mask8(a.raw, b.raw)};
714#else
715 return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
716#endif
717}
718
719template <typename T>
721 const Mask256<T> b) {
722#if HWY_COMPILER_HAS_MASK_INTRINSICS
723 return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
724#else
725 return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)};
726#endif
727}
728template <typename T>
730 const Mask256<T> b) {
731#if HWY_COMPILER_HAS_MASK_INTRINSICS
732 return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
733#else
734 return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)};
735#endif
736}
737template <typename T>
739 const Mask256<T> b) {
740#if HWY_COMPILER_HAS_MASK_INTRINSICS
741 return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
742#else
743 return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
744#endif
745}
746template <typename T>
748 const Mask256<T> b) {
749#if HWY_COMPILER_HAS_MASK_INTRINSICS
750 return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
751#else
752 return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
753#endif
754}
755
756} // namespace detail
757
758template <typename T>
759HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
760 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
761}
762
763template <typename T>
764HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
765 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
766}
767
768template <typename T>
769HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
770 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
771}
772
773template <typename T>
774HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
775 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
776}
777
778template <typename T>
779HWY_API Mask256<T> Not(const Mask256<T> m) {
780 // Flip only the valid bits.
781 constexpr size_t N = 32 / sizeof(T);
782 return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
783}
784
785#else // AVX2
786
787// ------------------------------ Mask
788
789// Mask and Vec are the same (true = FF..FF).
790template <typename T>
791HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
792 return Mask256<T>{v.raw};
793}
794
795template <typename T>
796HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
797 return Vec256<T>{v.raw};
798}
799
800template <typename T>
801HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
802 return Vec256<T>{v.raw};
803}
804
805// ------------------------------ IfThenElse
806
807// mask ? yes : no
808template <typename T>
809HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
810 const Vec256<T> no) {
811 return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
812}
813HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
814 const Vec256<float> yes,
815 const Vec256<float> no) {
816 return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
817}
818HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
819 const Vec256<double> yes,
820 const Vec256<double> no) {
821 return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
822}
823
824// mask ? yes : 0
825template <typename T>
826HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
827 return yes & VecFromMask(Full256<T>(), mask);
828}
829
830// mask ? 0 : no
831template <typename T>
832HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
833 return AndNot(VecFromMask(Full256<T>(), mask), no);
834}
835
836template <typename T, HWY_IF_FLOAT(T)>
837HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
838 const auto zero = Zero(Full256<T>());
839 // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
840 return IfThenElse(MaskFromVec(v), zero, v);
841}
842
843// ------------------------------ Mask logical
844
845template <typename T>
846HWY_API Mask256<T> Not(const Mask256<T> m) {
847 return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
848}
849
850template <typename T>
851HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
852 const Full256<T> d;
853 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
854}
855
856template <typename T>
857HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
858 const Full256<T> d;
859 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
860}
861
862template <typename T>
863HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
864 const Full256<T> d;
865 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
866}
867
868template <typename T>
869HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
870 const Full256<T> d;
871 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
872}
873
874#endif // HWY_TARGET <= HWY_AVX3
875
876// ================================================== COMPARE
877
878#if HWY_TARGET <= HWY_AVX3
879
880// Comparisons set a mask bit to 1 if the condition is true, else 0.
881
882template <typename TFrom, typename TTo>
883HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
884 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
885 return Mask256<TTo>{m.raw};
886}
887
888namespace detail {
889
890template <typename T>
892 const Vec256<T> bit) {
893 return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
894}
895template <typename T>
897 const Vec256<T> bit) {
898 return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
899}
900template <typename T>
902 const Vec256<T> bit) {
903 return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
904}
905template <typename T>
907 const Vec256<T> bit) {
908 return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
909}
910
911} // namespace detail
912
913template <typename T>
914HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
915 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
916 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
917}
918
919// ------------------------------ Equality
920
921template <typename T, HWY_IF_LANE_SIZE(T, 1)>
923 return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
924}
925template <typename T, HWY_IF_LANE_SIZE(T, 2)>
926HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
927 return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
928}
929template <typename T, HWY_IF_LANE_SIZE(T, 4)>
930HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
931 return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
932}
933template <typename T, HWY_IF_LANE_SIZE(T, 8)>
934HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
935 return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
936}
937
938HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
939 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
940}
941
942HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
943 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
944}
945
946// ------------------------------ Inequality
947
948template <typename T, HWY_IF_LANE_SIZE(T, 1)>
949HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
950 return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
951}
952template <typename T, HWY_IF_LANE_SIZE(T, 2)>
953HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
954 return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
955}
956template <typename T, HWY_IF_LANE_SIZE(T, 4)>
957HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
958 return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
959}
960template <typename T, HWY_IF_LANE_SIZE(T, 8)>
961HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
962 return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
963}
964
965HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
966 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
967}
968
969HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
970 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
971}
972
973// ------------------------------ Strict inequality
974
975HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
976 return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
977}
978HWY_API Mask256<int16_t> operator>(Vec256<int16_t> a, Vec256<int16_t> b) {
979 return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
980}
981HWY_API Mask256<int32_t> operator>(Vec256<int32_t> a, Vec256<int32_t> b) {
982 return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
983}
984HWY_API Mask256<int64_t> operator>(Vec256<int64_t> a, Vec256<int64_t> b) {
985 return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
986}
987
988HWY_API Mask256<uint8_t> operator>(Vec256<uint8_t> a, Vec256<uint8_t> b) {
989 return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
990}
991HWY_API Mask256<uint16_t> operator>(const Vec256<uint16_t> a,
992 const Vec256<uint16_t> b) {
993 return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
994}
995HWY_API Mask256<uint32_t> operator>(const Vec256<uint32_t> a,
996 const Vec256<uint32_t> b) {
997 return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
998}
999HWY_API Mask256<uint64_t> operator>(const Vec256<uint64_t> a,
1000 const Vec256<uint64_t> b) {
1001 return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
1002}
1003
1004HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
1005 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1006}
1007HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1008 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1009}
1010
1011// ------------------------------ Weak inequality
1012
1013HWY_API Mask256<float> operator>=(Vec256<float> a, Vec256<float> b) {
1014 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1015}
1016HWY_API Mask256<double> operator>=(Vec256<double> a, Vec256<double> b) {
1017 return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1018}
1019
1020// ------------------------------ Mask
1021
1022namespace detail {
1023
1024template <typename T>
1026 return Mask256<T>{_mm256_movepi8_mask(v.raw)};
1027}
1028template <typename T>
1030 return Mask256<T>{_mm256_movepi16_mask(v.raw)};
1031}
1032template <typename T>
1034 return Mask256<T>{_mm256_movepi32_mask(v.raw)};
1035}
1036template <typename T>
1038 return Mask256<T>{_mm256_movepi64_mask(v.raw)};
1039}
1040
1041} // namespace detail
1042
1043template <typename T>
1044HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
1045 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1046}
1047// There do not seem to be native floating-point versions of these instructions.
1048HWY_API Mask256<float> MaskFromVec(const Vec256<float> v) {
1049 return Mask256<float>{MaskFromVec(BitCast(Full256<int32_t>(), v)).raw};
1050}
1051HWY_API Mask256<double> MaskFromVec(const Vec256<double> v) {
1052 return Mask256<double>{MaskFromVec(BitCast(Full256<int64_t>(), v)).raw};
1053}
1054
1055template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1056HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1057 return Vec256<T>{_mm256_movm_epi8(v.raw)};
1058}
1059
1060template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1061HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1062 return Vec256<T>{_mm256_movm_epi16(v.raw)};
1063}
1064
1065template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1066HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1067 return Vec256<T>{_mm256_movm_epi32(v.raw)};
1068}
1069
1070template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1071HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
1072 return Vec256<T>{_mm256_movm_epi64(v.raw)};
1073}
1074
1075HWY_API Vec256<float> VecFromMask(const Mask256<float> v) {
1076 return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
1077}
1078
1079HWY_API Vec256<double> VecFromMask(const Mask256<double> v) {
1080 return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
1081}
1082
1083template <typename T>
1084HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
1085 return VecFromMask(v);
1086}
1087
1088#else // AVX2
1089
1090// Comparisons fill a lane with 1-bits if the condition is true, else 0.
1091
1092template <typename TFrom, typename TTo>
1093HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
1094 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1095 return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
1096}
1097
1098template <typename T>
1099HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
1100 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1101 return (v & bit) == bit;
1102}
1103
1104// ------------------------------ Equality
1105
1106template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1107HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1108 return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1109}
1110
1111template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1112HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1113 return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1114}
1115
1116template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1117HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1118 return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1119}
1120
1121template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1122HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
1123 return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1124}
1125
1126HWY_API Mask256<float> operator==(const Vec256<float> a,
1127 const Vec256<float> b) {
1128 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1129}
1130
1131HWY_API Mask256<double> operator==(const Vec256<double> a,
1132 const Vec256<double> b) {
1133 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1134}
1135
1136// ------------------------------ Inequality
1137
1138template <typename T, HWY_IF_NOT_FLOAT(T)>
1139HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
1140 return Not(a == b);
1141}
1142
1143HWY_API Mask256<float> operator!=(const Vec256<float> a,
1144 const Vec256<float> b) {
1145 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1146}
1147HWY_API Mask256<double> operator!=(const Vec256<double> a,
1148 const Vec256<double> b) {
1149 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1150}
1151
1152// ------------------------------ Strict inequality
1153
1154// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
1155// to perform an unsigned comparison instead of the intended signed. Workaround
1156// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
1157#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1158#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1159#else
1160#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1161#endif
1162
1163HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
1164#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1165 using i8x32 = signed char __attribute__((__vector_size__(32)));
1166 return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
1167 reinterpret_cast<i8x32>(b.raw))};
1168#else
1169 return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1170#endif
1171}
1172HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
1173 const Vec256<int16_t> b) {
1174 return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1175}
1176HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
1177 const Vec256<int32_t> b) {
1178 return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1179}
1180HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
1181 const Vec256<int64_t> b) {
1182 return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1183}
1184
1185template <typename T, HWY_IF_UNSIGNED(T)>
1186HWY_API Mask256<T> operator>(const Vec256<T> a, const Vec256<T> b) {
1187 const Full256<T> du;
1188 const RebindToSigned<decltype(du)> di;
1189 const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1190 return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1191}
1192
1193HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
1194 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1195}
1196HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
1197 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1198}
1199
1200// ------------------------------ Weak inequality
1201
1202HWY_API Mask256<float> operator>=(const Vec256<float> a,
1203 const Vec256<float> b) {
1204 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1205}
1206HWY_API Mask256<double> operator>=(const Vec256<double> a,
1207 const Vec256<double> b) {
1208 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1209}
1210
1211#endif // HWY_TARGET <= HWY_AVX3
1212
1213// ------------------------------ Reversed comparisons
1214
1215template <typename T>
1216HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
1217 return b > a;
1218}
1219
1220template <typename T>
1221HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
1222 return b >= a;
1223}
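
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): builds a mask of lanes inside the half-open range [lo, hi).
// The mask And() defined above works for both the AVX2 and AVX-512 mask types.
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Mask256<T> ExampleInRange(Vec256<T> v, Vec256<T> lo, Vec256<T> hi) {
  return And(v >= lo, v < hi);
}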
1224
1225// ------------------------------ Min (Gt, IfThenElse)
1226
1227// Unsigned
1228HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1229 return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1230}
1231HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
1232 const Vec256<uint16_t> b) {
1233 return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1234}
1235HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
1236 const Vec256<uint32_t> b) {
1237 return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1238}
1239HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
1240 const Vec256<uint64_t> b) {
1241#if HWY_TARGET <= HWY_AVX3
1242 return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1243#else
1244 const Full256<uint64_t> du;
1245 const Full256<int64_t> di;
1246 const auto msb = Set(du, 1ull << 63);
1247 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1248 return IfThenElse(gt, b, a);
1249#endif
1250}
1251
1252// Signed
1253HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1254 return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1255}
1256HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1257 return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1258}
1259HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1260 return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1261}
1262HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1263#if HWY_TARGET <= HWY_AVX3
1264 return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1265#else
1266 return IfThenElse(a < b, a, b);
1267#endif
1268}
1269
1270// Float
1271HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
1272 return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1273}
1274HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
1275 return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
1276}
1277
1278// ------------------------------ Max (Gt, IfThenElse)
1279
1280// Unsigned
1281HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
1282 return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1283}
1284HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
1285 const Vec256<uint16_t> b) {
1286 return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1287}
1288HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
1289 const Vec256<uint32_t> b) {
1290 return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1291}
1292HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
1293 const Vec256<uint64_t> b) {
1294#if HWY_TARGET <= HWY_AVX3
1295 return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1296#else
1297 const Full256<uint64_t> du;
1298 const Full256<int64_t> di;
1299 const auto msb = Set(du, 1ull << 63);
1300 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
1301 return IfThenElse(gt, a, b);
1302#endif
1303}
1304
1305// Signed
1306HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
1307 return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1308}
1309HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
1310 return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1311}
1312HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
1313 return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1314}
1315HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
1316#if HWY_TARGET <= HWY_AVX3
1317 return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1318#else
1319 return IfThenElse(a < b, b, a);
1320#endif
1321}
1322
1323// Float
1324HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
1325 return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1326}
1327HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
1328 return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
1329}
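
// Editor's illustrative sketch, not part of the original header (Highway also
// offers a generic Clamp; this hypothetical helper only shows how the Min/Max
// above compose).
template <typename T>
HWY_INLINE Vec256<T> ExampleClamp(Vec256<T> v, Vec256<T> lo, Vec256<T> hi) {
  return Min(Max(v, lo), hi);  // every lane limited to [lo, hi]
}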
1330
1331// ------------------------------ FirstN (Iota, Lt)
1332
1333template <typename T>
1334HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
1335#if HWY_TARGET <= HWY_AVX3
1336 (void)d;
1337 constexpr size_t N = 32 / sizeof(T);
1338#if HWY_ARCH_X86_64
1339 const uint64_t all = (1ull << N) - 1;
1340 // BZHI only looks at the lower 8 bits of n!
1341 return Mask256<T>::FromBits((n > 255) ? all : _bzhi_u64(all, n));
1342#else
1343 const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
1344 // BZHI only looks at the lower 8 bits of n!
1345 return Mask256<T>::FromBits(
1346 (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
1347#endif // HWY_ARCH_X86_64
1348#else
1349 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1350 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
1351#endif
1352}
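
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): keeps the first `num_valid` lanes and zeros the rest, a typical
// use of FirstN together with IfThenElseZero for a partial final block.
template <typename T>
HWY_INLINE Vec256<T> ExampleZeroTail(Full256<T> d, Vec256<T> v, size_t num_valid) {
  return IfThenElseZero(FirstN(d, num_valid), v);
}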
1353
1354// ================================================== ARITHMETIC
1355
1356// ------------------------------ Addition
1357
1358// Unsigned
1359HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
1360 const Vec256<uint8_t> b) {
1361 return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1362}
1363HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
1364 const Vec256<uint16_t> b) {
1365 return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
1366}
1367HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
1368 const Vec256<uint32_t> b) {
1369 return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
1370}
1371HWY_API Vec256<uint64_t> operator+(const Vec256<uint64_t> a,
1372 const Vec256<uint64_t> b) {
1373 return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)};
1374}
1375
1376// Signed
1377HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
1378 const Vec256<int8_t> b) {
1379 return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1380}
1381HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
1382 const Vec256<int16_t> b) {
1383 return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1384}
1385HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
1386 const Vec256<int32_t> b) {
1387 return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1388}
1389HWY_API Vec256<int64_t> operator+(const Vec256<int64_t> a,
1390 const Vec256<int64_t> b) {
1391 return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
1392}
1393
1394// Float
1395HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
1396 return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1397}
1398HWY_API Vec256<double> operator+(const Vec256<double> a,
1399 const Vec256<double> b) {
1400 return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
1401}
1402
1403// ------------------------------ Subtraction
1404
1405// Unsigned
1406HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
1407 const Vec256<uint8_t> b) {
1408 return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1409}
1410HWY_API Vec256<uint16_t> operator-(const Vec256<uint16_t> a,
1411 const Vec256<uint16_t> b) {
1412 return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1413}
1414HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
1415 const Vec256<uint32_t> b) {
1416 return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1417}
1418HWY_API Vec256<uint64_t> operator-(const Vec256<uint64_t> a,
1419 const Vec256<uint64_t> b) {
1420 return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1421}
1422
1423// Signed
1424HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
1425 const Vec256<int8_t> b) {
1426 return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1427}
1428HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
1429 const Vec256<int16_t> b) {
1430 return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1431}
1432HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
1433 const Vec256<int32_t> b) {
1434 return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1435}
1436HWY_API Vec256<int64_t> operator-(const Vec256<int64_t> a,
1437 const Vec256<int64_t> b) {
1438 return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
1439}
1440
1441// Float
1442HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
1443 return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1444}
1445HWY_API Vec256<double> operator-(const Vec256<double> a,
1446 const Vec256<double> b) {
1447 return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
1448}
1449
1450// ------------------------------ SumsOf8
1451HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
1452 return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
1453}
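
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): counts bytes equal to `key` by turning the comparison result
// into 0/1 bytes and letting SumsOf8 accumulate each group of eight.
HWY_INLINE Vec256<uint64_t> ExampleCountEqualBytes(Vec256<uint8_t> v, uint8_t key) {
  const Full256<uint8_t> d;
  const Vec256<uint8_t> is_eq = VecFromMask(d, v == Set(d, key));  // FF or 00
  const Vec256<uint8_t> ones = And(is_eq, Set(d, uint8_t{1}));     // 1 or 0
  return SumsOf8(ones);  // four u64 lanes, each counting its eight bytes
}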
1454
1455// ------------------------------ SaturatedAdd
1456
1457// Returns a + b clamped to the destination range.
1458
1459// Unsigned
1460HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
1461 const Vec256<uint8_t> b) {
1462 return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
1463}
1464HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
1465 const Vec256<uint16_t> b) {
1466 return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
1467}
1468
1469// Signed
1470HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
1471 const Vec256<int8_t> b) {
1472 return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
1473}
1474HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
1475 const Vec256<int16_t> b) {
1476 return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1477}
1478
1479// ------------------------------ SaturatedSub
1480
1481// Returns a - b clamped to the destination range.
1482
1483// Unsigned
1484HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
1485 const Vec256<uint8_t> b) {
1486 return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
1487}
1488HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
1489 const Vec256<uint16_t> b) {
1490 return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
1491}
1492
1493// Signed
1494HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
1495 const Vec256<int8_t> b) {
1496 return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
1497}
1498HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
1499 const Vec256<int16_t> b) {
1500 return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
1501}
1502
1503// ------------------------------ Average
1504
1505// Returns (a + b + 1) / 2
1506
1507// Unsigned
1508HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
1509 const Vec256<uint8_t> b) {
1510 return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
1511}
1512HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
1513 const Vec256<uint16_t> b) {
1514 return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
1515}
1516
1517// ------------------------------ Abs (Sub)
1518
1519// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1520HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
1521#if HWY_COMPILER_MSVC
1522 // Workaround for incorrect codegen? (wrong result)
1523 const auto zero = Zero(Full256<int8_t>());
1524 return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
1525#else
1526 return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
1527#endif
1528}
1529HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
1530 return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
1531}
1532HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
1533 return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
1534}
1535// i64 is implemented after BroadcastSignBit.
1536
1537HWY_API Vec256<float> Abs(const Vec256<float> v) {
1538 const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
1539 return v & BitCast(Full256<float>(), mask);
1540}
1541HWY_API Vec256<double> Abs(const Vec256<double> v) {
1542 const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
1543 return v & BitCast(Full256<double>(), mask);
1544}
1545
1546// ------------------------------ Integer multiplication
1547
1548// Unsigned
1549HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1550 return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1551}
1552HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1553 return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1554}
1555
1556// Signed
1557HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
1558 return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1559}
1560HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
1561 return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1562}
1563
1564// Returns the upper 16 bits of a * b in each lane.
1565HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1566 return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
1567}
1568HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
1569 return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
1570}
1571
1572HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
1573 return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
1574}
1575
1576// Multiplies even lanes (0, 2 ..) and places the double-wide result into
1577// even and the upper half into its odd neighbor lane.
1578HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
1579 return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
1580}
1581HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1582 return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
1583}
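
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): widens the even u32 lanes (0, 2, 4, 6) to their full 64-bit
// squares; odd input lanes are ignored, as described above.
HWY_INLINE Vec256<uint64_t> ExampleSquareEvenLanes(Vec256<uint32_t> v) {
  return MulEven(v, v);
}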
1584
1585// ------------------------------ ShiftLeft
1586
1587template <int kBits>
1588HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
1589 return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
1590}
1591
1592template <int kBits>
1593HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
1594 return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
1595}
1596
1597template <int kBits>
1598HWY_API Vec256<uint64_t> ShiftLeft(const Vec256<uint64_t> v) {
1599 return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
1600}
1601
1602template <int kBits>
1603HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
1604 return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
1605}
1606
1607template <int kBits>
1608HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
1609 return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
1610}
1611
1612template <int kBits>
1613HWY_API Vec256<int64_t> ShiftLeft(const Vec256<int64_t> v) {
1614 return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
1615}
1616
1617template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
1618HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
1619 const Full256<T> d8;
1620 const RepartitionToWide<decltype(d8)> d16;
1621 const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
1622 return kBits == 1
1623 ? (v + v)
1624 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1625}
1626
1627// ------------------------------ ShiftRight
1628
1629template <int kBits>
1630HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
1631 return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
1632}
1633
1634template <int kBits>
1635HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
1636 return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
1637}
1638
1639template <int kBits>
1640HWY_API Vec256<uint64_t> ShiftRight(const Vec256<uint64_t> v) {
1641 return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
1642}
1643
1644template <int kBits>
1645HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
1646 const Full256<uint8_t> d8;
1647 // Use raw instead of BitCast to support N=1.
1648 const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
1649 return shifted & Set(d8, 0xFF >> kBits);
1650}
1651
1652template <int kBits>
1653HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
1654 return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
1655}
1656
1657template <int kBits>
1658HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
1659 return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
1660}
1661
1662template <int kBits>
1663HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
1664 const Full256<int8_t> di;
1665 const Full256<uint8_t> du;
1666 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1667 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1668 return (shifted ^ shifted_sign) - shifted_sign;
1669}
1670
1671// i64 is implemented after BroadcastSignBit.
1672
1673// ------------------------------ RotateRight
1674
1675template <int kBits>
1676HWY_API Vec256<uint32_t> RotateRight(const Vec256<uint32_t> v) {
1677 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1678#if HWY_TARGET <= HWY_AVX3
1679 return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)};
1680#else
1681 if (kBits == 0) return v;
1682 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
1683#endif
1684}
1685
1686template <int kBits>
1687HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
1688 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1689#if HWY_TARGET <= HWY_AVX3
1690 return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)};
1691#else
1692 if (kBits == 0) return v;
1693 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
1694#endif
1695}
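
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): one xor-then-rotate step over all u32 lanes, as used in ARX-style
// mixing; on AVX-512VL this is a single vprord, otherwise the shift/or fallback
// above.
HWY_INLINE Vec256<uint32_t> ExampleXorRotate(Vec256<uint32_t> x, Vec256<uint32_t> k) {
  return RotateRight<7>(Xor(x, k));
}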
1696
1697// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1698
1699HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
1700 return VecFromMask(v < Zero(Full256<int8_t>()));
1701}
1702
1703HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
1704 return ShiftRight<15>(v);
1705}
1706
1707HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
1708 return ShiftRight<31>(v);
1709}
1710
1711HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
1712#if HWY_TARGET == HWY_AVX2
1713 return VecFromMask(v < Zero(Full256<int64_t>()));
1714#else
1715 return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
1716#endif
1717}
1718
1719template <int kBits>
1720HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
1721#if HWY_TARGET <= HWY_AVX3
1722 return Vec256<int64_t>{_mm256_srai_epi64(v.raw, kBits)};
1723#else
1724 const Full256<int64_t> di;
1725 const Full256<uint64_t> du;
1726 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1727 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
1728 return right | sign;
1729#endif
1730}
1731
1732HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
1733#if HWY_TARGET <= HWY_AVX3
1734 return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
1735#else
1736 const auto zero = Zero(Full256<int64_t>());
1737 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
1738#endif
1739}
1740
1741// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1742HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
1743 Vec256<int8_t> no) {
1744 // int8: AVX2 IfThenElse only looks at the MSB.
1745 return IfThenElse(MaskFromVec(v), yes, no);
1746}
1747
1748template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1749HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1750 static_assert(IsSigned<T>(), "Only works for signed/float");
1751 const Full256<T> d;
1752 const RebindToSigned<decltype(d)> di;
1753
1754 // 16-bit: no native blendv, so copy sign to lower byte's MSB.
1755 v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
1756 return IfThenElse(MaskFromVec(v), yes, no);
1757}
1758
1759template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
1760HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1761 static_assert(IsSigned<T>(), "Only works for signed/float");
1762 const Full256<T> d;
1763 const RebindToFloat<decltype(d)> df;
1764
1765 // 32/64-bit: use float IfThenElse, which only looks at the MSB.
1766 const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
1767 return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
1768}
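
// Editor's illustrative sketch, not part of the original header (helper name is
// hypothetical): absolute value of i32 lanes expressed with IfNegativeThenElse.
// The header's own Abs() uses a dedicated intrinsic; this only illustrates the
// sign-based select.
HWY_INLINE Vec256<int32_t> ExampleAbsViaSelect(Vec256<int32_t> v) {
  const Full256<int32_t> d;
  return IfNegativeThenElse(v, Zero(d) - v, v);
}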
1769
1770// ------------------------------ ShiftLeftSame
1771
1772HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
1773 const int bits) {
1774 return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1775}
1776HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
1777 const int bits) {
1778 return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1779}
1780HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v,
1781 const int bits) {
1782 return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1783}
1784
1785HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
1786 return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1787}
1788
1789HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
1790 return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1791}
1792
1793HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) {
1794 return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1795}
1796
1797template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1798HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
1799 const Full256<T> d8;
1800 const RepartitionToWide<decltype(d8)> d16;
1801 const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
1802 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
1803}
1804
1805// ------------------------------ ShiftRightSame (BroadcastSignBit)
1806
1807HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
1808 const int bits) {
1809 return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1810}
1811HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
1812 const int bits) {
1813 return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1814}
1815HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
1816 const int bits) {
1817 return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1818}
1819
1820HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
1821 const Full256<uint8_t> d8;
1822 const RepartitionToWide<decltype(d8)> d16;
1823 const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
1824 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
1825}
1826
1827HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
1828 const int bits) {
1829 return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
1830}
1831
1832HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
1833 const int bits) {
1834 return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1835}
1836HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
1837 const int bits) {
1838#if HWY_TARGET <= HWY_AVX3
1839 return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1840#else
1841 const Full256<int64_t> di;
1842 const Full256<uint64_t> du;
1843 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1844 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
1845 return right | sign;
1846#endif
1847}
1848
1849HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
1850 const Full256<int8_t> di;
1851 const Full256<uint8_t> du;
1852 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1853 const auto shifted_sign =
1854 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1855 return (shifted ^ shifted_sign) - shifted_sign;
1856}
1857
1858// ------------------------------ Neg (Xor, Sub)
1859
1860template <typename T, HWY_IF_FLOAT(T)>
1861HWY_API Vec256<T> Neg(const Vec256<T> v) {
1862 return Xor(v, SignBit(Full256<T>()));
1863}
1864
1865template <typename T, HWY_IF_NOT_FLOAT(T)>
1866HWY_API Vec256<T> Neg(const Vec256<T> v) {
1867 return Zero(Full256<T>()) - v;
1868}
1869
1870// ------------------------------ Floating-point mul / div
1871
1872HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
1873 return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
1874}
1875HWY_API Vec256<double> operator*(const Vec256<double> a,
1876 const Vec256<double> b) {
1877 return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
1878}
1879
1880HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
1881 return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
1882}
1883HWY_API Vec256<double> operator/(const Vec256<double> a,
1884 const Vec256<double> b) {
1885 return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
1886}
1887
1888// Approximate reciprocal
1889HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
1890 return Vec256<float>{_mm256_rcp_ps(v.raw)};
1891}
1892
1893// Absolute value of difference.
1894HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
1895 return Abs(a - b);
1896}
1897
1898// ------------------------------ Floating-point multiply-add variants
1899
1900// Returns mul * x + add
1901HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
1902 const Vec256<float> add) {
1903#ifdef HWY_DISABLE_BMI2_FMA
1904 return mul * x + add;
1905#else
1906 return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
1907#endif
1908}
1909HWY_API Vec256<double> MulAdd(const Vec256<double> mul, const Vec256<double> x,
1910 const Vec256<double> add) {
1911#ifdef HWY_DISABLE_BMI2_FMA
1912 return mul * x + add;
1913#else
1914 return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
1915#endif
1916}
1917
1918// Returns add - mul * x
1919HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
1920 const Vec256<float> add) {
1921#ifdef HWY_DISABLE_BMI2_FMA
1922 return add - mul * x;
1923#else
1924 return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
1925#endif
1926}
1927HWY_API Vec256<double> NegMulAdd(const Vec256<double> mul,
1928 const Vec256<double> x,
1929 const Vec256<double> add) {
1930#ifdef HWY_DISABLE_BMI2_FMA
1931 return add - mul * x;
1932#else
1933 return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
1934#endif
1935}
1936
1937// Returns mul * x - sub
1938HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
1939 const Vec256<float> sub) {
1940#ifdef HWY_DISABLE_BMI2_FMA
1941 return mul * x - sub;
1942#else
1943 return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
1944#endif
1945}
1946HWY_API Vec256<double> MulSub(const Vec256<double> mul, const Vec256<double> x,
1947 const Vec256<double> sub) {
1948#ifdef HWY_DISABLE_BMI2_FMA
1949 return mul * x - sub;
1950#else
1951 return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
1952#endif
1953}
1954
1955// Returns -mul * x - sub
1956HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
1957 const Vec256<float> sub) {
1958#ifdef HWY_DISABLE_BMI2_FMA
1959 return Neg(mul * x) - sub;
1960#else
1961 return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1962#endif
1963}
1964HWY_API Vec256<double> NegMulSub(const Vec256<double> mul,
1965 const Vec256<double> x,
1966 const Vec256<double> sub) {
1967#ifdef HWY_DISABLE_BMI2_FMA
1968 return Neg(mul * x) - sub;
1969#else
1970 return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1971#endif
1972}
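// Usage sketch (not part of the original file): Horner evaluation of
// c2*x^2 + c1*x + c0 via MulAdd, which lowers to FMA instructions unless
// HWY_DISABLE_BMI2_FMA is defined (see above). ExamplePoly2 is a hypothetical
// helper for illustration only.
static inline Vec256<float> ExamplePoly2(Vec256<float> x, Vec256<float> c2,
                                         Vec256<float> c1, Vec256<float> c0) {
  return MulAdd(MulAdd(c2, x, c1), x, c0);  // (c2*x + c1)*x + c0
}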
1973
1974// ------------------------------ Floating-point square root
1975
1976// Full precision square root
1977HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
1978 return Vec256<float>{_mm256_sqrt_ps(v.raw)};
1979}
1980HWY_API Vec256<double> Sqrt(const Vec256<double> v) {
1981 return Vec256<double>{_mm256_sqrt_pd(v.raw)};
1982}
1983
1984// Approximate reciprocal square root
1985HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
1986 return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
1987}
1988
1989// ------------------------------ Floating-point rounding
1990
1991// Toward nearest integer, tie to even
1992HWY_API Vec256<float> Round(const Vec256<float> v) {
1993 return Vec256<float>{
1994 _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1995}
1996HWY_API Vec256<double> Round(const Vec256<double> v) {
1997 return Vec256<double>{
1998 _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1999}
2000
2001// Toward zero, aka truncate
2002HWY_API Vec256<float> Trunc(const Vec256<float> v) {
2003 return Vec256<float>{
2004 _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2005}
2006HWY_API Vec256<double> Trunc(const Vec256<double> v) {
2007 return Vec256<double>{
2008 _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
2009}
2010
2011// Toward +infinity, aka ceiling
2012HWY_API Vec256<float> Ceil(const Vec256<float> v) {
2013 return Vec256<float>{
2014 _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2015}
2016HWY_API Vec256<double> Ceil(const Vec256<double> v) {
2017 return Vec256<double>{
2018 _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2019}
2020
2021// Toward -infinity, aka floor
2022HWY_API Vec256<float> Floor(const Vec256<float> v) {
2023 return Vec256<float>{
2024 _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2025}
2026HWY_API Vec256<double> Floor(const Vec256<double> v) {
2027 return Vec256<double>{
2028 _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2029}
2030
2031// ------------------------------ Floating-point classification
2032
2033HWY_API Mask256<float> IsNaN(const Vec256<float> v) {
2034#if HWY_TARGET <= HWY_AVX3
2035 return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x81)};
2036#else
2037 return Mask256<float>{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)};
2038#endif
2039}
2040HWY_API Mask256<double> IsNaN(const Vec256<double> v) {
2041#if HWY_TARGET <= HWY_AVX3
2042 return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x81)};
2043#else
2044 return Mask256<double>{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)};
2045#endif
2046}
2047
2048#if HWY_TARGET <= HWY_AVX3
2049
2050HWY_API Mask256<float> IsInf(const Vec256<float> v) {
2051 return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x18)};
2052}
2053HWY_API Mask256<double> IsInf(const Vec256<double> v) {
2054 return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x18)};
2055}
2056
2058 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
2059 // and negate the mask.
2060 return Not(Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x99)});
2061}
2062HWY_API Mask256<double> IsFinite(const Vec256<double> v) {
2063 return Not(Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x99)});
2064}
2065
2066#else
2067
2068template <typename T, HWY_IF_FLOAT(T)>
2069HWY_API Mask256<T> IsInf(const Vec256<T> v) {
2070 const Full256<T> d;
2071 const RebindToSigned<decltype(d)> di;
2072 const VFromD<decltype(di)> vi = BitCast(di, v);
2073 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
2074 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
2075}
2076
2077// Returns whether normal/subnormal/zero.
2078template <typename T, HWY_IF_FLOAT(T)>
2079HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
2080 const Full256<T> d;
2081 const RebindToUnsigned<decltype(d)> du;
2082 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
2083 const VFromD<decltype(du)> vu = BitCast(du, v);
2084 // Shift left to clear the sign bit, then right so we can compare with the
2085 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
2086 // negative and non-negative floats would be greater). MSVC seems to generate
2087 // incorrect code if we instead add vu + vu.
2088 const VFromD<decltype(di)> exp =
2089 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
2090 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
2091}
2092
2093#endif // HWY_TARGET <= HWY_AVX3
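// Sketch (not part of the original file): the exponent test performed by the
// non-AVX3 IsFinite path above, on a scalar float. ExampleIsFiniteF32 is a
// hypothetical helper; it drops the sign bit and compares the biased exponent
// field with its maximum (255 for binary32, which encodes inf/NaN).
static inline bool ExampleIsFiniteF32(float f) {
  uint32_t bits;
  CopyBytes<4>(&f, &bits);                 // bit pattern of f
  const uint32_t exp = (bits << 1) >> 24;  // drop sign, keep 8 exponent bits
  return exp < 255;
}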
2094
2095// ================================================== MEMORY
2096
2097// ------------------------------ Load
2098
2099template <typename T>
2100HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
2101 return Vec256<T>{
2102 _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
2103}
2104HWY_API Vec256<float> Load(Full256<float> /* tag */,
2105 const float* HWY_RESTRICT aligned) {
2106 return Vec256<float>{_mm256_load_ps(aligned)};
2107}
2108HWY_API Vec256<double> Load(Full256<double> /* tag */,
2109 const double* HWY_RESTRICT aligned) {
2110 return Vec256<double>{_mm256_load_pd(aligned)};
2111}
2112
2113template <typename T>
2114HWY_API Vec256<T> LoadU(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2115 return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
2116}
2117HWY_API Vec256<float> LoadU(Full256<float> /* tag */,
2118 const float* HWY_RESTRICT p) {
2119 return Vec256<float>{_mm256_loadu_ps(p)};
2120}
2121HWY_API Vec256<double> LoadU(Full256<double> /* tag */,
2122 const double* HWY_RESTRICT p) {
2123 return Vec256<double>{_mm256_loadu_pd(p)};
2124}
2125
2126// ------------------------------ MaskedLoad
2127
2128#if HWY_TARGET <= HWY_AVX3
2129
2130template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2131HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2132 const T* HWY_RESTRICT p) {
2133 return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, p)};
2134}
2135
2136template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2137HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2138 const T* HWY_RESTRICT p) {
2139 return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
2140}
2141
2142template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2143HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2144 const T* HWY_RESTRICT p) {
2145 return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
2146}
2147
2148template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2149HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2150 const T* HWY_RESTRICT p) {
2151 return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
2152}
2153
2154HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> /* tag */,
2155 const float* HWY_RESTRICT p) {
2156 return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)};
2157}
2158
2159HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> /* tag */,
2160 const double* HWY_RESTRICT p) {
2161 return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)};
2162}
2163
2164#else // AVX2
2165
2166// There is no maskload_epi8/16, so blend instead.
2167template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2168HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
2169 const T* HWY_RESTRICT p) {
2170 return IfThenElseZero(m, LoadU(d, p));
2171}
2172
2173template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2174HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2175 const T* HWY_RESTRICT p) {
2176 auto pi = reinterpret_cast<const int*>(p); // NOLINT
2177 return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
2178}
2179
2180template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2181HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
2182 const T* HWY_RESTRICT p) {
2183 auto pi = reinterpret_cast<const long long*>(p); // NOLINT
2184 return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
2185}
2186
2187HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> d,
2188 const float* HWY_RESTRICT p) {
2189 const Vec256<int32_t> mi =
2190 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2191 return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
2192}
2193
2194HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> d,
2195 const double* HWY_RESTRICT p) {
2196 const Vec256<int64_t> mi =
2197 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2198 return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
2199}
2200
2201#endif
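// Usage sketch (not part of the original file): load a partial tail of `count`
// floats with MaskedLoad + FirstN; lanes at or beyond `count` read as zero.
// ExampleLoadTail is a hypothetical helper for illustration only.
static inline Vec256<float> ExampleLoadTail(const float* HWY_RESTRICT p,
                                            size_t count) {
  const Full256<float> d;
  return MaskedLoad(FirstN(d, count), d, p);  // count >= Lanes(d) loads all
}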
2202
2203// ------------------------------ LoadDup128
2204
2205// Loads 128 bit and duplicates into both 128-bit halves. This avoids the
2206// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
2207template <typename T>
2208HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
2209#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
2210 // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
2211 // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
2212 // upper half undefined) is fine because we're overwriting that anyway.
2213 // This workaround seems in turn to generate incorrect code in MSVC 2022
2214 // (19.31), so use broadcastsi128 there.
2215 const __m128i v128 = LoadU(Full128<T>(), p).raw;
2216 return Vec256<T>{
2217 _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
2218#else
2219 return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
2220#endif
2221}
2222HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
2223 const float* const HWY_RESTRICT p) {
2224#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
2225 const __m128 v128 = LoadU(Full128<float>(), p).raw;
2226 return Vec256<float>{
2227 _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
2228#else
2229 return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
2230#endif
2231}
2232HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
2233 const double* const HWY_RESTRICT p) {
2234#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
2235 const __m128d v128 = LoadU(Full128<double>(), p).raw;
2236 return Vec256<double>{
2237 _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2238#else
2239 return Vec256<double>{
2240 _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
2241#endif
2242}
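// Usage sketch (not part of the original file): broadcast a 16-byte lookup
// table into both 128-bit blocks, e.g. so that a later TableLookupBytes sees
// the same table in each block. ExampleLoadTable16 is a hypothetical helper
// for illustration only; table16 must point to 16 valid bytes.
static inline Vec256<uint8_t> ExampleLoadTable16(
    const uint8_t* HWY_RESTRICT table16) {
  return LoadDup128(Full256<uint8_t>(), table16);
}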
2243
2244// ------------------------------ Store
2245
2246template <typename T>
2247HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
2248 _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2249}
2250HWY_API void Store(Vec256<float> v, Full256<float> /* tag */,
2251 float* HWY_RESTRICT aligned) {
2252 _mm256_store_ps(aligned, v.raw);
2253}
2254HWY_API void Store(Vec256<double> v, Full256<double> /* tag */,
2255 double* HWY_RESTRICT aligned) {
2256 _mm256_store_pd(aligned, v.raw);
2257}
2258
2259template <typename T>
2260HWY_API void StoreU(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT p) {
2261 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
2262}
2263HWY_API void StoreU(Vec256<float> v, Full256<float> /* tag */,
2264 float* HWY_RESTRICT p) {
2265 _mm256_storeu_ps(p, v.raw);
2266}
2267HWY_API void StoreU(Vec256<double> v, Full256<double> /* tag */,
2268 double* HWY_RESTRICT p) {
2269 _mm256_storeu_pd(p, v.raw);
2270}
2271
2272// ------------------------------ BlendedStore
2273
2274#if HWY_TARGET <= HWY_AVX3
2275
2276template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2277HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2278 T* HWY_RESTRICT p) {
2279 _mm256_mask_storeu_epi8(p, m.raw, v.raw);
2280}
2281
2282template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2283HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2284 T* HWY_RESTRICT p) {
2285 _mm256_mask_storeu_epi16(p, m.raw, v.raw);
2286}
2287
2288template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2289HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2290 T* HWY_RESTRICT p) {
2291 _mm256_mask_storeu_epi32(p, m.raw, v.raw);
2292}
2293
2294template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2295HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2296 T* HWY_RESTRICT p) {
2297 _mm256_mask_storeu_epi64(p, m.raw, v.raw);
2298}
2299
2300HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m,
2301 Full256<float> /* tag */, float* HWY_RESTRICT p) {
2302 _mm256_mask_storeu_ps(p, m.raw, v.raw);
2303}
2304
2305HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2306 Full256<double> /* tag */, double* HWY_RESTRICT p) {
2307 _mm256_mask_storeu_pd(p, m.raw, v.raw);
2308}
2309
2310#else // AVX2
2311
2312// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
2313// allows AC# if "Alignment checking enabled and: 256-bit memory operand not
2314// 32-byte aligned". Fortunately AC# is not enabled by default and requires both
2315// OS support (CR0) and the application to set rflags.AC. We assume these remain
2316// disabled because x86/x64 code and compiler output often contain misaligned
2317// scalar accesses, which would also fault.
2318//
2319// Caveat: these are slow on AMD Jaguar/Bulldozer.
2320
2321template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2322HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2323 T* HWY_RESTRICT p) {
2324 // There is no maskload_epi8/16. Blending is also unsafe because loading a
2325 // full vector that crosses the array end causes asan faults. Resort to scalar
2326 // code; the caller should instead use memcpy, assuming m is FirstN(d, n).
2327 const RebindToUnsigned<decltype(d)> du;
2328 using TU = TFromD<decltype(du)>;
2329 alignas(32) TU buf[32 / sizeof(T)];
2330 alignas(32) TU mask[32 / sizeof(T)];
2331 Store(BitCast(du, v), du, buf);
2332 Store(BitCast(du, VecFromMask(d, m)), du, mask);
2333 for (size_t i = 0; i < 32 / sizeof(T); ++i) {
2334 if (mask[i]) {
2335 CopyBytes<sizeof(T)>(buf + i, p + i);
2336 }
2337 }
2338}
2339
2340template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2341HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2342 T* HWY_RESTRICT p) {
2343 auto pi = reinterpret_cast<int*>(p); // NOLINT
2344 _mm256_maskstore_epi32(pi, m.raw, v.raw);
2345}
2346
2347template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2348HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
2349 T* HWY_RESTRICT p) {
2350 auto pi = reinterpret_cast<long long*>(p); // NOLINT
2351 _mm256_maskstore_epi64(pi, m.raw, v.raw);
2352}
2353
2354HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, Full256<float> d,
2355 float* HWY_RESTRICT p) {
2356 const Vec256<int32_t> mi =
2357 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2358 _mm256_maskstore_ps(p, mi.raw, v.raw);
2359}
2360
2361HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
2362 Full256<double> d, double* HWY_RESTRICT p) {
2363 const Vec256<int64_t> mi =
2364 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2365 _mm256_maskstore_pd(p, mi.raw, v.raw);
2366}
2367
2368#endif
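// Usage sketch (not part of the original file): store only the first `count`
// int32 lanes via BlendedStore + FirstN; memory past them is left untouched.
// ExampleStoreTail is a hypothetical helper for illustration only.
static inline void ExampleStoreTail(Vec256<int32_t> v, size_t count,
                                    int32_t* HWY_RESTRICT p) {
  const Full256<int32_t> d;
  BlendedStore(v, FirstN(d, count), d, p);
}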
2369
2370// ------------------------------ Non-temporal stores
2371
2372template <typename T>
2373HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
2374 T* HWY_RESTRICT aligned) {
2375 _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2376}
2377HWY_API void Stream(Vec256<float> v, Full256<float> /* tag */,
2378 float* HWY_RESTRICT aligned) {
2379 _mm256_stream_ps(aligned, v.raw);
2380}
2381HWY_API void Stream(Vec256<double> v, Full256<double> /* tag */,
2382 double* HWY_RESTRICT aligned) {
2383 _mm256_stream_pd(aligned, v.raw);
2384}
2385
2386// ------------------------------ Scatter
2387
2388// Work around warnings in the intrinsic definitions (passing -1 as a mask).
2389HWY_DIAGNOSTICS(push)
2390HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2391
2392#if HWY_TARGET <= HWY_AVX3
2393namespace detail {
2394
2395template <typename T>
2396HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
2397 Full256<T> /* tag */, T* HWY_RESTRICT base,
2398 const Vec256<int32_t> offset) {
2399 _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
2400}
2401template <typename T>
2402HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
2403 Full256<T> /* tag */, T* HWY_RESTRICT base,
2404 const Vec256<int32_t> index) {
2405 _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
2406}
2407
2408template <typename T>
2409HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
2410 Full256<T> /* tag */, T* HWY_RESTRICT base,
2411 const Vec256<int64_t> offset) {
2412 _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
2413}
2414template <typename T>
2415HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
2416 Full256<T> /* tag */, T* HWY_RESTRICT base,
2417 const Vec256<int64_t> index) {
2418 _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
2419}
2420
2421} // namespace detail
2422
2423template <typename T, typename Offset>
2424HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2425 const Vec256<Offset> offset) {
2426 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2427 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2428}
2429template <typename T, typename Index>
2430HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2431 const Vec256<Index> index) {
2432 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2433 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2434}
2435
2436HWY_API void ScatterOffset(Vec256<float> v, Full256<float> /* tag */,
2437 float* HWY_RESTRICT base,
2438 const Vec256<int32_t> offset) {
2439 _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
2440}
2441HWY_API void ScatterIndex(Vec256<float> v, Full256<float> /* tag */,
2442 float* HWY_RESTRICT base,
2443 const Vec256<int32_t> index) {
2444 _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
2445}
2446
2447HWY_API void ScatterOffset(Vec256<double> v, Full256<double> /* tag */,
2448 double* HWY_RESTRICT base,
2449 const Vec256<int64_t> offset) {
2450 _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
2451}
2452HWY_API void ScatterIndex(Vec256<double> v, Full256<double> /* tag */,
2453 double* HWY_RESTRICT base,
2454 const Vec256<int64_t> index) {
2455 _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
2456}
2457
2458#else
2459
2460template <typename T, typename Offset>
2461HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2462 const Vec256<Offset> offset) {
2463 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2464
2465 constexpr size_t N = 32 / sizeof(T);
2466 alignas(32) T lanes[N];
2467 Store(v, d, lanes);
2468
2469 alignas(32) Offset offset_lanes[N];
2470 Store(offset, Full256<Offset>(), offset_lanes);
2471
2472 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2473 for (size_t i = 0; i < N; ++i) {
2474 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2475 }
2476}
2477
2478template <typename T, typename Index>
2479HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
2480 const Vec256<Index> index) {
2481 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2482
2483 constexpr size_t N = 32 / sizeof(T);
2484 alignas(32) T lanes[N];
2485 Store(v, d, lanes);
2486
2487 alignas(32) Index index_lanes[N];
2488 Store(index, Full256<Index>(), index_lanes);
2489
2490 for (size_t i = 0; i < N; ++i) {
2491 base[index_lanes[i]] = lanes[i];
2492 }
2493}
2494
2495#endif
2496
2497// ------------------------------ Gather
2498
2499namespace detail {
2500
2501template <typename T>
2502HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<4> /* tag */,
2503 Full256<T> /* tag */,
2504 const T* HWY_RESTRICT base,
2505 const Vec256<int32_t> offset) {
2506 return Vec256<T>{_mm256_i32gather_epi32(
2507 reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
2508}
2509template <typename T>
2510HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<4> /* tag */,
2511 Full256<T> /* tag */,
2512 const T* HWY_RESTRICT base,
2513 const Vec256<int32_t> index) {
2514 return Vec256<T>{_mm256_i32gather_epi32(
2515 reinterpret_cast<const int32_t*>(base), index.raw, 4)};
2516}
2517
2518template <typename T>
2519HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<8> /* tag */,
2520 Full256<T> /* tag */,
2521 const T* HWY_RESTRICT base,
2522 const Vec256<int64_t> offset) {
2523 return Vec256<T>{_mm256_i64gather_epi64(
2524 reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
2525}
2526template <typename T>
2527HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<8> /* tag */,
2528 Full256<T> /* tag */,
2529 const T* HWY_RESTRICT base,
2530 const Vec256<int64_t> index) {
2531 return Vec256<T>{_mm256_i64gather_epi64(
2532 reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
2533}
2534
2535} // namespace detail
2536
2537template <typename T, typename Offset>
2538HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
2539 const Vec256<Offset> offset) {
2540 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2541 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2542}
2543template <typename T, typename Index>
2544HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
2545 const Vec256<Index> index) {
2546 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2547 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2548}
2549
2550HWY_API Vec256<float> GatherOffset(Full256<float> /* tag */,
2551 const float* HWY_RESTRICT base,
2552 const Vec256<int32_t> offset) {
2553 return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
2554}
2555HWY_API Vec256<float> GatherIndex(Full256<float> /* tag */,
2556 const float* HWY_RESTRICT base,
2557 const Vec256<int32_t> index) {
2558 return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
2559}
2560
2561HWY_API Vec256<double> GatherOffset(Full256<double> /* tag */,
2562 const double* HWY_RESTRICT base,
2563 const Vec256<int64_t> offset) {
2564 return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
2565}
2566HWY_API Vec256<double> GatherIndex(Full256<double> /* tag */,
2567 const double* HWY_RESTRICT base,
2568 const Vec256<int64_t> index) {
2569 return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
2570}
2571
2572HWY_DIAGNOSTICS(pop)
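// Usage sketch (not part of the original file): gather eight floats from
// base[index[i]]; indices are int32 because they must match the lane size.
// ExampleGather is a hypothetical helper for illustration only; all indices
// must be within bounds of the base array.
static inline Vec256<float> ExampleGather(const float* HWY_RESTRICT base,
                                          Vec256<int32_t> index) {
  return GatherIndex(Full256<float>(), base, index);
}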
2573
2574// ================================================== SWIZZLE
2575
2576// ------------------------------ LowerHalf
2577
2578template <typename T>
2579HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
2580 return Vec128<T>{_mm256_castsi256_si128(v.raw)};
2581}
2582HWY_API Vec128<float> LowerHalf(Full128<float> /* tag */, Vec256<float> v) {
2583 return Vec128<float>{_mm256_castps256_ps128(v.raw)};
2584}
2585HWY_API Vec128<double> LowerHalf(Full128<double> /* tag */, Vec256<double> v) {
2586 return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
2587}
2588
2589template <typename T>
2590HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
2591 return LowerHalf(Full128<T>(), v);
2592}
2593
2594// ------------------------------ UpperHalf
2595
2596template <typename T>
2597HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
2598 return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
2599}
2600HWY_API Vec128<float> UpperHalf(Full128<float> /* tag */, Vec256<float> v) {
2601 return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)};
2602}
2603HWY_API Vec128<double> UpperHalf(Full128<double> /* tag */, Vec256<double> v) {
2604 return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)};
2605}
2606
2607// ------------------------------ ExtractLane (Store)
2608template <typename T>
2609HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
2610 const Full256<T> d;
2611 HWY_DASSERT(i < Lanes(d));
2612 alignas(32) T lanes[32 / sizeof(T)];
2613 Store(v, d, lanes);
2614 return lanes[i];
2615}
2616
2617// ------------------------------ InsertLane (Store)
2618template <typename T>
2619HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
2620 const Full256<T> d;
2621 HWY_DASSERT(i < Lanes(d));
2622 alignas(64) T lanes[64 / sizeof(T)];
2623 Store(v, d, lanes);
2624 lanes[i] = t;
2625 return Load(d, lanes);
2626}
2627
2628// ------------------------------ GetLane (LowerHalf)
2629template <typename T>
2630HWY_API T GetLane(const Vec256<T> v) {
2631 return GetLane(LowerHalf(v));
2632}
2633
2634// ------------------------------ ZeroExtendVector
2635
2636// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
2637// bits undefined. Although it makes sense for them to be zero (VEX encoded
2638// 128-bit instructions zero the upper lanes to avoid large penalties), a
2639// compiler could decide to optimize out code that relies on this.
2640//
2641// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
2642// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For
2643// older GCC, we can still obtain the desired code thanks to pattern
2644// recognition; note that the expensive insert instruction is not actually
2645// generated, see https://gcc.godbolt.org/z/1MKGaP.
2646
2647#if !defined(HWY_HAVE_ZEXT)
2648#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
2649 (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
2650 (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC >= 1000)
2651#define HWY_HAVE_ZEXT 1
2652#else
2653#define HWY_HAVE_ZEXT 0
2654#endif
2655#endif // defined(HWY_HAVE_ZEXT)
2656
2657template <typename T>
2658HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
2659#if HWY_HAVE_ZEXT
2660 return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2661#else
2662 return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2663#endif
2664}
2665HWY_API Vec256<float> ZeroExtendVector(Full256<float> /* tag */,
2666 Vec128<float> lo) {
2667#if HWY_HAVE_ZEXT
2668 return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
2669#else
2670 return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
2671#endif
2672}
2673HWY_API Vec256<double> ZeroExtendVector(Full256<double> /* tag */,
2674 Vec128<double> lo) {
2675#if HWY_HAVE_ZEXT
2676 return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
2677#else
2678 return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
2679#endif
2680}
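// Usage sketch (not part of the original file): promote a 128-bit value to
// 256 bits with a guaranteed-zero upper block, e.g. before a full-width Or.
// ExampleZeroExtend is a hypothetical helper for illustration only.
static inline Vec256<uint32_t> ExampleZeroExtend(Vec128<uint32_t> lo) {
  return ZeroExtendVector(Full256<uint32_t>(), lo);  // upper 128 bits are zero
}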
2681
2682// ------------------------------ Combine
2683
2684template <typename T>
2685HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
2686 const auto lo256 = ZeroExtendVector(d, lo);
2687 return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2688}
2689HWY_API Vec256<float> Combine(Full256<float> d, Vec128<float> hi,
2690 Vec128<float> lo) {
2691 const auto lo256 = ZeroExtendVector(d, lo);
2692 return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
2693}
2694HWY_API Vec256<double> Combine(Full256<double> d, Vec128<double> hi,
2695 Vec128<double> lo) {
2696 const auto lo256 = ZeroExtendVector(d, lo);
2697 return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
2698}
2699
2700// ------------------------------ ShiftLeftBytes
2701
2702template <int kBytes, typename T>
2703HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
2704 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2705 // This is the same operation as _mm256_bslli_epi128.
2706 return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
2707}
2708
2709template <int kBytes, typename T>
2710HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
2711 return ShiftLeftBytes<kBytes>(Full256<T>(), v);
2712}
2713
2714// ------------------------------ ShiftLeftLanes
2715
2716template <int kLanes, typename T>
2717HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
2718 const Repartition<uint8_t, decltype(d)> d8;
2719 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2720}
2721
2722template <int kLanes, typename T>
2723HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
2724 return ShiftLeftLanes<kLanes>(Full256<T>(), v);
2725}
2726
2727// ------------------------------ ShiftRightBytes
2728
2729template <int kBytes, typename T>
2730HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
2731 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2732 // This is the same operation as _mm256_bsrli_epi128.
2733 return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
2734}
2735
2736// ------------------------------ ShiftRightLanes
2737template <int kLanes, typename T>
2738HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
2739 const Repartition<uint8_t, decltype(d)> d8;
2740 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2741}
2742
2743// ------------------------------ CombineShiftRightBytes
2744
2745// Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
2746template <int kBytes, typename T, class V = Vec256<T>>
2747HWY_API V CombineShiftRightBytes(Full256<T> d, V hi, V lo) {
2748 const Repartition<uint8_t, decltype(d)> d8;
2749 return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
2750 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2751}
2752
2753// ------------------------------ Broadcast/splat any lane
2754
2755// Unsigned
2756template <int kLane>
2757HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
2758 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2759 if (kLane < 4) {
2760 const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2761 return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
2762 } else {
2763 const __m256i hi =
2764 _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2765 return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
2766 }
2767}
2768template <int kLane>
2769HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
2770 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2771 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2772}
2773template <int kLane>
2775 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2776 return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2777}
2778
2779// Signed
2780template <int kLane>
2781HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
2782 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2783 if (kLane < 4) {
2784 const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2785 return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
2786 } else {
2787 const __m256i hi =
2788 _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2789 return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
2790 }
2791}
2792template <int kLane>
2793HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
2794 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2795 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
2796}
2797template <int kLane>
2799 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2800 return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
2801}
2802
2803// Float
2804template <int kLane>
2805HWY_API Vec256<float> Broadcast(Vec256<float> v) {
2806 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2807 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
2808}
2809template <int kLane>
2811 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2812 return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
2813}
2814
2815// ------------------------------ Hard-coded shuffles
2816
2817// Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2818// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2819// right (the previous least-significant lane is now most-significant =>
2820// 47650321). These could also be implemented via CombineShiftRightBytes but
2821// the shuffle_abcd notation is more convenient.
2822
2823// Swap 32-bit halves in 64-bit halves.
2824template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2825HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
2826 return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};
2827}
2828HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
2829 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
2830}
2831
2832namespace detail {
2833
2834template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2835HWY_API Vec256<T> Shuffle2301(const Vec256<T> a, const Vec256<T> b) {
2836 const Full256<T> d;
2837 const RebindToFloat<decltype(d)> df;
2838 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
2839 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2840 BitCast(df, b).raw, m)});
2841}
2842template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2843HWY_API Vec256<T> Shuffle1230(const Vec256<T> a, const Vec256<T> b) {
2844 const Full256<T> d;
2845 const RebindToFloat<decltype(d)> df;
2846 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
2847 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2848 BitCast(df, b).raw, m)});
2849}
2850template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2851HWY_API Vec256<T> Shuffle3012(const Vec256<T> a, const Vec256<T> b) {
2852 const Full256<T> d;
2853 const RebindToFloat<decltype(d)> df;
2854 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
2855 return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
2856 BitCast(df, b).raw, m)});
2857}
2858
2859} // namespace detail
2860
2861// Swap 64-bit halves
2862HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
2863 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2864}
2865HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
2866 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2867}
2868HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
2869 // Shorter encoding than _mm256_permute_ps.
2870 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
2871}
2872HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
2873 return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2874}
2875HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
2876 return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
2877}
2878HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
2879 // Shorter encoding than _mm256_permute_pd.
2880 return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
2881}
2882
2883// Rotate right 32 bits
2884HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
2885 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2886}
2887HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
2888 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
2889}
2890HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
2891 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
2892}
2893// Rotate left 32 bits
2894HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
2895 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2896}
2897HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
2898 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
2899}
2900HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
2901 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
2902}
2903
2904// Reverse
2905HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
2906 return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2907}
2908HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
2909 return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
2910}
2911HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
2912 return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
2913}
2914
2915// ------------------------------ TableLookupLanes
2916
2917// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2918template <typename T>
2919struct Indices256 {
2920 __m256i raw;
2921};
2922
2923// Native 8x32 instruction: indices remain unchanged
2924template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
2925HWY_API Indices256<T> IndicesFromVec(Full256<T> /* tag */, Vec256<TI> vec) {
2926 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2927#if HWY_IS_DEBUG_BUILD
2928 const Full256<TI> di;
2929 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2930 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));
2931#endif
2932 return Indices256<T>{vec.raw};
2933}
2934
2935// 64-bit lanes: convert indices to 8x32 unless AVX3 is available
2936template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
2937HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> idx64) {
2938 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2939 const Rebind<TI, decltype(d)> di;
2940 (void)di; // potentially unused
2941#if HWY_IS_DEBUG_BUILD
2942 HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) &&
2943 AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
2944#endif
2945
2946#if HWY_TARGET <= HWY_AVX3
2947 (void)d;
2948 return Indices256<T>{idx64.raw};
2949#else
2950 const Repartition<float, decltype(d)> df; // 32-bit!
2951 // Replicate 64-bit index into upper 32 bits
2952 const Vec256<TI> dup =
2953 BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
2954 // For each idx64 i, idx32 are 2*i and 2*i+1.
2955 const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
2956 return Indices256<T>{idx32.raw};
2957#endif
2958}
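// Note (not part of the original file): in scalar terms, the conversion above
// turns each 64-bit lane index i into the 32-bit index pair (2*i, 2*i + 1),
// stored in the two halves of the original 64-bit slot. `dup` holds (i, i),
// `dup + dup` holds (2*i, 2*i), and adding (1 << 32) sets the +1 only in the
// upper half, so the 8x32 permute moves both halves of source lane i.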
2959
2960template <typename T, typename TI>
2961HWY_API Indices256<T> SetTableIndices(const Full256<T> d, const TI* idx) {
2962 const Rebind<TI, decltype(d)> di;
2963 return IndicesFromVec(d, LoadU(di, idx));
2964}
2965
2966template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2967HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2968 return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2969}
2970
2971template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2972HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2973#if HWY_TARGET <= HWY_AVX3
2974 return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
2975#else
2976 return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
2977#endif
2978}
2979
2980HWY_API Vec256<float> TableLookupLanes(const Vec256<float> v,
2981 const Indices256<float> idx) {
2982 return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
2983}
2984
2985HWY_API Vec256<double> TableLookupLanes(const Vec256<double> v,
2986 const Indices256<double> idx) {
2987#if HWY_TARGET <= HWY_AVX3
2988 return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)};
2989#else
2990 const Full256<double> df;
2991 const Full256<uint64_t> du;
2992 return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32(
2993 BitCast(du, v).raw, idx.raw)});
2994#endif
2995}
2996
2997// ------------------------------ SwapAdjacentBlocks
2998
2999template <typename T>
3000HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
3001 return Vec256<T>{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)};
3002}
3003
3004HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
3005 return Vec256<float>{_mm256_permute2f128_ps(v.raw, v.raw, 0x01)};
3006}
3007
3008HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
3009 return Vec256<double>{_mm256_permute2f128_pd(v.raw, v.raw, 0x01)};
3010}
3011
3012// ------------------------------ Reverse (RotateRight)
3013
3014template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3015HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3016 alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3017 return TableLookupLanes(v, SetTableIndices(d, kReverse));
3018}
3019
3020template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3021HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3022 alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
3023 return TableLookupLanes(v, SetTableIndices(d, kReverse));
3024}
3025
3026template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3027HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
3028#if HWY_TARGET <= HWY_AVX3
3029 const RebindToSigned<decltype(d)> di;
3030 alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
3031 7, 6, 5, 4, 3, 2, 1, 0};
3032 const Vec256<int16_t> idx = Load(di, kReverse);
3033 return BitCast(d, Vec256<int16_t>{
3034 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3035#else
3036 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3037 const Vec256<uint32_t> rev32 = Reverse(du32, BitCast(du32, v));
3038 return BitCast(d, RotateRight<16>(rev32));
3039#endif
3040}
3041
3042// ------------------------------ Reverse2
3043
3044template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3045HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
3046 const Full256<uint32_t> du32;
3047 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
3048}
3049
3050template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3051HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
3052 return Shuffle2301(v);
3053}
3054
3055template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3056HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
3057 return Shuffle01(v);
3058}
3059
3060// ------------------------------ Reverse4 (SwapAdjacentBlocks)
3061
3062template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3063HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
3064#if HWY_TARGET <= HWY_AVX3
3065 const RebindToSigned<decltype(d)> di;
3066 alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
3067 11, 10, 9, 8, 15, 14, 13, 12};
3068 const Vec256<int16_t> idx = Load(di, kReverse4);
3069 return BitCast(d, Vec256<int16_t>{
3070 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3071#else
3072 const RepartitionToWide<decltype(d)> dw;
3073 return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
3074#endif
3075}
3076
3077template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3078HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
3079 return Shuffle0123(v);
3080}
3081
3082template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3083HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
3084 // Could also use _mm256_permute4x64_epi64.
3085 return SwapAdjacentBlocks(Shuffle01(v));
3086}
3087
3088// ------------------------------ Reverse8
3089
3090template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3091HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
3092#if HWY_TARGET <= HWY_AVX3
3093 const RebindToSigned<decltype(d)> di;
3094 alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
3095 15, 14, 13, 12, 11, 10, 9, 8};
3096 const Vec256<int16_t> idx = Load(di, kReverse8);
3097 return BitCast(d, Vec256<int16_t>{
3098 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3099#else
3100 const RepartitionToWide<decltype(d)> dw;
3101 return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
3102#endif
3103}
3104
3105template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3106HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
3107 return Reverse(d, v);
3108}
3109
3110template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3111HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
3112 HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes
3113}
3114
3115// ------------------------------ InterleaveLower
3116
3117// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3118// the least-significant lane) and "b". To concatenate two half-width integers
3119// into one, use ZipLower/Upper instead (also works with scalar).
3120
3121HWY_API Vec256<uint8_t> InterleaveLower(const Vec256<uint8_t> a,
3122 const Vec256<uint8_t> b) {
3123 return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
3124}
3125HWY_API Vec256<uint16_t> InterleaveLower(const Vec256<uint16_t> a,
3126 const Vec256<uint16_t> b) {
3127 return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3128}
3129HWY_API Vec256<uint32_t> InterleaveLower(const Vec256<uint32_t> a,
3130 const Vec256<uint32_t> b) {
3131 return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3132}
3133HWY_API Vec256<uint64_t> InterleaveLower(const Vec256<uint64_t> a,
3134 const Vec256<uint64_t> b) {
3135 return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3136}
3137
3138HWY_API Vec256<int8_t> InterleaveLower(const Vec256<int8_t> a,
3139 const Vec256<int8_t> b) {
3140 return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
3141}
3142HWY_API Vec256<int16_t> InterleaveLower(const Vec256<int16_t> a,
3143 const Vec256<int16_t> b) {
3144 return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3145}
3146HWY_API Vec256<int32_t> InterleaveLower(const Vec256<int32_t> a,
3147 const Vec256<int32_t> b) {
3148 return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3149}
3150HWY_API Vec256<int64_t> InterleaveLower(const Vec256<int64_t> a,
3151 const Vec256<int64_t> b) {
3152 return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3153}
3154
3155HWY_API Vec256<float> InterleaveLower(const Vec256<float> a,
3156 const Vec256<float> b) {
3157 return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
3158}
3159HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
3160 const Vec256<double> b) {
3161 return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
3162}
3163
3164// ------------------------------ InterleaveUpper
3165
3166// All functions inside detail lack the required D parameter.
3167namespace detail {
3168
3169HWY_API Vec256<uint8_t> InterleaveUpper(const Vec256<uint8_t> a,
3170 const Vec256<uint8_t> b) {
3171 return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3172}
3173HWY_API Vec256<uint16_t> InterleaveUpper(const Vec256<uint16_t> a,
3174 const Vec256<uint16_t> b) {
3175 return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3176}
3177HWY_API Vec256<uint32_t> InterleaveUpper(const Vec256<uint32_t> a,
3178 const Vec256<uint32_t> b) {
3179 return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3180}
3181HWY_API Vec256<uint64_t> InterleaveUpper(const Vec256<uint64_t> a,
3182 const Vec256<uint64_t> b) {
3183 return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3184}
3185
3186HWY_API Vec256<int8_t> InterleaveUpper(const Vec256<int8_t> a,
3187 const Vec256<int8_t> b) {
3188 return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3189}
3190HWY_API Vec256<int16_t> InterleaveUpper(const Vec256<int16_t> a,
3191 const Vec256<int16_t> b) {
3192 return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3193}
3194HWY_API Vec256<int32_t> InterleaveUpper(const Vec256<int32_t> a,
3195 const Vec256<int32_t> b) {
3196 return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3197}
3198HWY_API Vec256<int64_t> InterleaveUpper(const Vec256<int64_t> a,
3199 const Vec256<int64_t> b) {
3200 return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3201}
3202
3203HWY_API Vec256<float> InterleaveUpper(const Vec256<float> a,
3204 const Vec256<float> b) {
3205 return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
3206}
3207HWY_API Vec256<double> InterleaveUpper(const Vec256<double> a,
3208 const Vec256<double> b) {
3209 return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)};
3210}
3211
3212} // namespace detail
3213
3214template <typename T, class V = Vec256<T>>
3215HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
3216 return detail::InterleaveUpper(a, b);
3217}
3218
3219// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3220
3221// Same as Interleave*, except that the return lanes are double-width integers;
3222// this is necessary because the single-lane scalar cannot return two values.
3223template <typename T, typename TW = MakeWide<T>>
3224HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
3225 return BitCast(Full256<TW>(), InterleaveLower(a, b));
3226}
3227template <typename T, typename TW = MakeWide<T>>
3228HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3229 return BitCast(dw, InterleaveLower(a, b));
3230}
3231
3232template <typename T, typename TW = MakeWide<T>>
3233HWY_API Vec256<TW> ZipUpper(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
3234 return BitCast(dw, InterleaveUpper(Full256<T>(), a, b));
3235}
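// Usage sketch (not part of the original file): ZipLower interleaves the lower
// halves of each 128-bit block and reinterprets the result as double-width
// lanes, so zipping with Zero(d8) zero-extends the lower u8 lanes of each
// block to u16. ExampleZeroExtendLowerU8 is a hypothetical helper.
static inline Vec256<uint16_t> ExampleZeroExtendLowerU8(Vec256<uint8_t> a) {
  const Full256<uint8_t> d8;
  return ZipLower(a, Zero(d8));
}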
3236
3237// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
3238
3239// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
3240// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
3241// extra cost) for LowerLower and UpperLower.
3242
3243// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3244template <typename T>
3245HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
3246 const Vec256<T> lo) {
3247 const Half<decltype(d)> d2;
3248 return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
3249}
3250HWY_API Vec256<float> ConcatLowerLower(Full256<float> d, const Vec256<float> hi,
3251 const Vec256<float> lo) {
3252 const Half<decltype(d)> d2;
3253 return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
3254}
3255HWY_API Vec256<double> ConcatLowerLower(Full256<double> d,
3256 const Vec256<double> hi,
3257 const Vec256<double> lo) {
3258 const Half<decltype(d)> d2;
3259 return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
3260}
3261
3262// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
3263template <typename T>
3264HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
3265 const Vec256<T> lo) {
3266 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
3267}
3268HWY_API Vec256<float> ConcatLowerUpper(Full256<float> /* tag */,
3269 const Vec256<float> hi,
3270 const Vec256<float> lo) {
3271 return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
3272}
3273HWY_API Vec256<double> ConcatLowerUpper(Full256<double> /* tag */,
3274 const Vec256<double> hi,
3275 const Vec256<double> lo) {
3276 return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
3277}
3278
3279// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3280template <typename T>
3281HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
3282 const Vec256<T> lo) {
3283 return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
3284}
3285HWY_API Vec256<float> ConcatUpperLower(Full256<float> /* tag */,
3286 const Vec256<float> hi,
3287 const Vec256<float> lo) {
3288 return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
3289}
3290HWY_API Vec256<double> ConcatUpperLower(Full256<double> /* tag */,
3291 const Vec256<double> hi,
3292 const Vec256<double> lo) {
3293 return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
3294}
3295
3296// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3297template <typename T>
3298HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
3299 const Vec256<T> lo) {
3300 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
3301}
3302HWY_API Vec256<float> ConcatUpperUpper(Full256<float> /* tag */,
3303 const Vec256<float> hi,
3304 const Vec256<float> lo) {
3305 return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
3306}
3307HWY_API Vec256<double> ConcatUpperUpper(Full256<double> /* tag */,
3308 const Vec256<double> hi,
3309 const Vec256<double> lo) {
3310 return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
3311}
3312
3313// ------------------------------ ConcatOdd
3314
3315template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3316HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3317 const RebindToUnsigned<decltype(d)> du;
3318#if HWY_TARGET == HWY_AVX3_DL
3319 alignas(32) constexpr uint8_t kIdx[32] = {
3320 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
3321 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
3322 return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi8(
3323 BitCast(du, lo).raw, Load(du, kIdx).raw,
3324 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
3325#else
3326 const RepartitionToWide<decltype(du)> dw;
3327 // Unsigned 8-bit shift so we can pack.
3328 const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
3329 const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
3330 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
3331 return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3332#endif
3333}
3334
3335template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3336HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3337 const RebindToUnsigned<decltype(d)> du;
3338#if HWY_TARGET <= HWY_AVX3
3339 alignas(32) constexpr uint16_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
3340 17, 19, 21, 23, 25, 27, 29, 31};
3341 return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
3342 BitCast(du, lo).raw, Load(du, kIdx).raw,
3343 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3344#else
3345 const RepartitionToWide<decltype(du)> dw;
3346 // Unsigned 16-bit shift so we can pack.
3347 const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
3348 const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
3349 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3350 return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3351#endif
3352}
3353
3354template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3355HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3356 const RebindToUnsigned<decltype(d)> du;
3357#if HWY_TARGET <= HWY_AVX3
3358 alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3359 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3360 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3361 BitCast(du, hi).raw)});
3362#else
3363 const RebindToFloat<decltype(d)> df;
3364 const Vec256<float> v3131{_mm256_shuffle_ps(
3365 BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
3366 return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
3367 _MM_SHUFFLE(3, 1, 2, 0))};
3368#endif
3369}
3370
3371HWY_API Vec256<float> ConcatOdd(Full256<float> d, Vec256<float> hi,
3372 Vec256<float> lo) {
3373 const RebindToUnsigned<decltype(d)> du;
3374#if HWY_TARGET <= HWY_AVX3
3375 alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3376 return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3377 __mmask8{0xFF}, hi.raw)};
3378#else
3379 const Vec256<float> v3131{
3380 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3381 return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3382 BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3383#endif
3384}
3385
3386template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3387HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3388 const RebindToUnsigned<decltype(d)> du;
3389#if HWY_TARGET <= HWY_AVX3
3390 alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3391 return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3392 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3393 BitCast(du, hi).raw)});
3394#else
3395 const RebindToFloat<decltype(d)> df;
3396 const Vec256<double> v31{
3397 _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
3398 return Vec256<T>{
3399 _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3400#endif
3401}
3402
3403HWY_API Vec256<double> ConcatOdd(Full256<double> d, Vec256<double> hi,
3404 Vec256<double> lo) {
3405#if HWY_TARGET <= HWY_AVX3
3406 const RebindToUnsigned<decltype(d)> du;
3407 alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3408 return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3409 __mmask8{0xFF}, hi.raw)};
3410#else
3411 (void)d;
3412 const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
3413 return Vec256<double>{
3414 _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3415#endif
3416}
3417
3418// ------------------------------ ConcatEven
3419
3420template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3421HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3422 const RebindToUnsigned<decltype(d)> du;
3423#if HWY_TARGET == HWY_AVX3_DL
3424 alignas(64) constexpr uint8_t kIdx[32] = {
3425 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3426 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
3427 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi8(
3428 BitCast(du, lo).raw, Load(du, kIdx).raw,
3429 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
3430#else
3431 const RepartitionToWide<decltype(du)> dw;
3432 // Isolate lower 8 bits per u16 so we can pack.
3433 const Vec256<uint16_t> mask = Set(dw, 0x00FF);
3434 const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask);
3435 const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask);
3436 const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
3437 return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
3438#endif
3439}
3440
3441template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3442HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3443 const RebindToUnsigned<decltype(d)> du;
3444#if HWY_TARGET <= HWY_AVX3
3445 alignas(64) constexpr uint16_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
3446 16, 18, 20, 22, 24, 26, 28, 30};
3447 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
3448 BitCast(du, lo).raw, Load(du, kIdx).raw,
3449 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
3450#else
3451 const RepartitionToWide<decltype(du)> dw;
3452 // Isolate lower 16 bits per u32 so we can pack.
3453 const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
3454 const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
3455 const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
3456 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
3457 return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
3458#endif
3459}
3460
3461template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3462HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3463 const RebindToUnsigned<decltype(d)> du;
3464#if HWY_TARGET <= HWY_AVX3
3465 alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3466 return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3467 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3468 BitCast(du, hi).raw)});
3469#else
3470 const RebindToFloat<decltype(d)> df;
3471 const Vec256<float> v2020{_mm256_shuffle_ps(
3472 BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
3473 return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
3474 _MM_SHUFFLE(3, 1, 2, 0))};
3475
3476#endif
3477}
3478
3479HWY_API Vec256<float> ConcatEven(Full256<float> d, Vec256<float> hi,
3480 Vec256<float> lo) {
3481 const RebindToUnsigned<decltype(d)> du;
3482#if HWY_TARGET <= HWY_AVX3
3483 alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3484 return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
3485 __mmask8{0xFF}, hi.raw)};
3486#else
3487 const Vec256<float> v2020{
3488 _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3489 return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
3490 BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3491
3492#endif
3493}
3494
3495template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3496HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
3497 const RebindToUnsigned<decltype(d)> du;
3498#if HWY_TARGET <= HWY_AVX3
3499 alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3500 return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3501 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
3502 BitCast(du, hi).raw)});
3503#else
3504 const RebindToFloat<decltype(d)> df;
3505 const Vec256<double> v20{
3506 _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
3507 return Vec256<T>{
3508 _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3509
3510#endif
3511}
3512
3513HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
3514 Vec256<double> lo) {
3515#if HWY_TARGET <= HWY_AVX3
3516 const RebindToUnsigned<decltype(d)> du;
3517 alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3518 return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
3519 __mmask8{0xFF}, hi.raw)};
3520#else
3521 (void)d;
3522 const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
3523 return Vec256<double>{
3524 _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3525#endif
3526}
3527
3528// ------------------------------ DupEven (InterleaveLower)
3529
3530template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3531HWY_API Vec256<T> DupEven(Vec256<T> v) {
3532 return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3533}
3534HWY_API Vec256<float> DupEven(Vec256<float> v) {
3535 return Vec256<float>{
3536 _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3537}
3538
3539template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3540HWY_API Vec256<T> DupEven(const Vec256<T> v) {
3541 return InterleaveLower(Full256<T>(), v, v);
3542}
3543
3544// ------------------------------ DupOdd (InterleaveUpper)
3545
3546template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3547HWY_API Vec256<T> DupOdd(Vec256<T> v) {
3548 return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3549}
3550HWY_API Vec256<float> DupOdd(Vec256<float> v) {
3551 return Vec256<float>{
3552 _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3553}
3554
3555template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3556HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
3557 return InterleaveUpper(Full256<T>(), v, v);
3558}
3559
3560// ------------------------------ OddEven
3561
3562namespace detail {
3563
3564template <typename T>
3565HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
3566 const Vec256<T> b) {
3567 const Full256<T> d;
3568 const Full256<uint8_t> d8;
3569 alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3570 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3571 return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
3572}
3573template <typename T>
3574HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
3575 const Vec256<T> b) {
3576 return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
3577}
3578template <typename T>
3579HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
3580 const Vec256<T> b) {
3581 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
3582}
3583template <typename T>
3584HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
3585 const Vec256<T> b) {
3586 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
3587}
3588
3589} // namespace detail
3590
3591template <typename T>
3592HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
3593 return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3594}
3595HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
3596 return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
3597}
3598
3599HWY_API Vec256<double> OddEven(const Vec256<double> a, const Vec256<double> b) {
3600 return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
3601}
3602
3603// ------------------------------ OddEvenBlocks
3604
3605template <typename T>
3606HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
3607 return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
3608}
3609
3610HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
3611 return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
3612}
3613
3614HWY_API Vec256<double> OddEvenBlocks(Vec256<double> odd, Vec256<double> even) {
3615 return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
3616}
3617
3618// ------------------------------ ReverseBlocks (ConcatLowerUpper)
3619
3620template <typename T>
3621HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
3622 return ConcatLowerUpper(d, v, v);
3623}
3624
3625// ------------------------------ TableLookupBytes (ZeroExtendVector)
3626
3627// Both full
3628template <typename T, typename TI>
3629HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
3630 const Vec256<TI> from) {
3631 return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
3632}
3633
3634// Partial index vector
3635template <typename T, typename TI, size_t NI>
3636HWY_API Vec128<TI, NI> TableLookupBytes(const Vec256<T> bytes,
3637  const Vec128<TI, NI> from) {
3638 // First expand to full 128, then 256.
3639 const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
3640 const auto tbl_full = TableLookupBytes(bytes, from_256);
3641 // Shrink to 128, then partial.
3642 return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
3643}
3644
3645// Partial table vector
3646template <typename T, size_t N, typename TI>
3647HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
3648  const Vec256<TI> from) {
3649 // First expand to full 128, then 256.
3650 const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
3651 return TableLookupBytes(bytes_256, from);
3652}
3653
3654// Partial both are handled by x86_128.
3655
3656// ------------------------------ Shl (Mul, ZipLower)
3657
3658#if HWY_TARGET > HWY_AVX3 // AVX2 or older
3659namespace detail {
3660
3661// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
3662template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3663HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
3664 const Full256<T> d;
3665 const RepartitionToWide<decltype(d)> dw;
3666 const Rebind<float, decltype(dw)> df;
3667 const auto zero = Zero(d);
3668 // Move into exponent (this u16 will become the upper half of an f32)
3669 const auto exp = ShiftLeft<23 - 16>(v);
3670 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
3671 // Insert 0 into lower halves for reinterpreting as binary32.
3672 const auto f0 = ZipLower(dw, zero, upper);
3673 const auto f1 = ZipUpper(dw, zero, upper);
3674 // Do not use ConvertTo because it checks for overflow, which is redundant
3675 // because we only care about v in [0, 16).
3676 const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
3677 const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
3678 return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3679}
3680
3681} // namespace detail
3682#endif // HWY_TARGET > HWY_AVX3
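Pow2 above builds 2^v by writing v straight into the exponent field of a binary32 (0x3F80 is the upper half of 1.0f), then truncating back to an integer. A minimal scalar sketch of the same trick, assuming v < 16 (our own illustration, not part of this header):

#include <cstdint>
#include <cstring>
// Returns 2^v for v in [0, 16) by constructing the float 1.0f * 2^v directly.
static inline uint16_t Pow2ViaFloatExponent(uint16_t v) {
  const uint32_t bits = (uint32_t{v} << 23) + 0x3F800000u;  // add v to the exponent of 1.0f
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return static_cast<uint16_t>(f);  // 2^15 = 32768 still fits in uint16_t
}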
3683
3684HWY_API Vec256<uint16_t> operator<<(const Vec256<uint16_t> v,
3685  const Vec256<uint16_t> bits) {
3686#if HWY_TARGET <= HWY_AVX3
3687 return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
3688#else
3689 return v * detail::Pow2(bits);
3690#endif
3691}
3692
3693HWY_API Vec256<uint32_t> operator<<(const Vec256<uint32_t> v,
3694  const Vec256<uint32_t> bits) {
3695 return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
3696}
3697
3698HWY_API Vec256<uint64_t> operator<<(const Vec256<uint64_t> v,
3699  const Vec256<uint64_t> bits) {
3700 return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
3701}
3702
3703// Signed left shift is the same as unsigned.
3704template <typename T, HWY_IF_SIGNED(T)>
3705HWY_API Vec256<T> operator<<(const Vec256<T> v, const Vec256<T> bits) {
3706 const Full256<T> di;
3707 const Full256<MakeUnsigned<T>> du;
3708 return BitCast(di, BitCast(du, v) << BitCast(du, bits));
3709}
3710
3711// ------------------------------ Shr (MulHigh, IfThenElse, Not)
3712
3713HWY_API Vec256<uint16_t> operator>>(const Vec256<uint16_t> v,
3714  const Vec256<uint16_t> bits) {
3715#if HWY_TARGET <= HWY_AVX3
3716 return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
3717#else
3718 const Full256<uint16_t> d;
3719 // For bits=0, we cannot mul by 2^16, so fix the result later.
3720 const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
3721 // Replace output with input where bits == 0.
3722 return IfThenElse(bits == Zero(d), v, out);
3723#endif
3724}
3725
3726HWY_API Vec256<uint32_t> operator>>(const Vec256<uint32_t> v,
3727  const Vec256<uint32_t> bits) {
3728 return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
3729}
3730
3731HWY_API Vec256<uint64_t> operator>>(const Vec256<uint64_t> v,
3732  const Vec256<uint64_t> bits) {
3733 return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
3734}
3735
3736HWY_API Vec256<int16_t> operator>>(const Vec256<int16_t> v,
3737  const Vec256<int16_t> bits) {
3738#if HWY_TARGET <= HWY_AVX3
3739 return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
3740#else
3741 return detail::SignedShr(Full256<int16_t>(), v, bits);
3742#endif
3743}
3744
3745HWY_API Vec256<int32_t> operator>>(const Vec256<int32_t> v,
3746  const Vec256<int32_t> bits) {
3747 return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
3748}
3749
3750HWY_API Vec256<int64_t> operator>>(const Vec256<int64_t> v,
3751  const Vec256<int64_t> bits) {
3752#if HWY_TARGET <= HWY_AVX3
3753 return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
3754#else
3755 return detail::SignedShr(Full256<int64_t>(), v, bits);
3756#endif
3757}
3758
3759HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
3760 const Vec256<uint64_t> b) {
3761 const DFromV<decltype(a)> du64;
3762 const RepartitionToNarrow<decltype(du64)> du32;
3763 const auto maskL = Set(du64, 0xFFFFFFFFULL);
3764 const auto a32 = BitCast(du32, a);
3765 const auto b32 = BitCast(du32, b);
3766 // Inputs for MulEven: we only need the lower 32 bits
3767 const auto aH = Shuffle2301(a32);
3768 const auto bH = Shuffle2301(b32);
3769
3770 // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
3771 // the even (lower 64 bits of every 128-bit block) results. See
3772 // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt.
3773 const auto aLbL = MulEven(a32, b32);
3774 const auto w3 = aLbL & maskL;
3775
3776 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3777 const auto w2 = t2 & maskL;
3778 const auto w1 = ShiftRight<32>(t2);
3779
3780 const auto t = MulEven(a32, bH) + w2;
3781 const auto k = ShiftRight<32>(t);
3782
3783 const auto mulH = MulEven(aH, bH) + w1 + k;
3784 const auto mulL = ShiftLeft<32>(t) + w3;
3785 return InterleaveLower(mulL, mulH);
3786}
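For reference, the same Knuth double-word decomposition in scalar form: the full 128-bit product of two uint64_t built from four 32x32 -> 64 partial products (our sketch, not part of this header):

#include <cstdint>
static inline void MulWide64(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t t2 = aH * bL + (aLbL >> 32);  // fits in 64 bits
  const uint64_t w2 = t2 & 0xFFFFFFFFu, w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  *hi = aH * bH + w1 + (t >> 32);
  *lo = (t << 32) + (aLbL & 0xFFFFFFFFu);
}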
3787
3788HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
3789 const Vec256<uint64_t> b) {
3790 const DFromV<decltype(a)> du64;
3791 const RepartitionToNarrow<decltype(du64)> du32;
3792 const auto maskL = Set(du64, 0xFFFFFFFFULL);
3793 const auto a32 = BitCast(du32, a);
3794 const auto b32 = BitCast(du32, b);
3795 // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3796 const auto aH = Shuffle2301(a32);
3797 const auto bH = Shuffle2301(b32);
3798
3799 // Same as above, but we're using the odd results (upper 64 bits per block).
3800 const auto aLbL = MulEven(a32, b32);
3801 const auto w3 = aLbL & maskL;
3802
3803 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3804 const auto w2 = t2 & maskL;
3805 const auto w1 = ShiftRight<32>(t2);
3806
3807 const auto t = MulEven(a32, bH) + w2;
3808 const auto k = ShiftRight<32>(t);
3809
3810 const auto mulH = MulEven(aH, bH) + w1 + k;
3811 const auto mulL = ShiftLeft<32>(t) + w3;
3812 return InterleaveUpper(du64, mulL, mulH);
3813}
3814
3815// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3816
3817HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
3818 Vec256<bfloat16_t> a,
3819 Vec256<bfloat16_t> b,
3820 const Vec256<float> sum0,
3821 Vec256<float>& sum1) {
3822 // TODO(janwas): _mm256_dpbf16_ps when available
3823 const Repartition<uint16_t, decltype(df32)> du16;
3824 const RebindToUnsigned<decltype(df32)> du32;
3825 const Vec256<uint16_t> zero = Zero(du16);
3826 // Lane order within sum0/1 is undefined, hence we can avoid the
3827 // longer-latency lane-crossing PromoteTo.
3828 const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
3829 const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3830 const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
3831 const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3832 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3833 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3834}
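The Zip calls above rely on bfloat16 being exactly the upper 16 bits of a binary32, so pairing each bf16 lane with a zero lane below it reconstructs the f32 value without a widening conversion. Scalar equivalent (ours):

#include <cstdint>
#include <cstring>
static inline float BF16BitsToF32(uint16_t bf16_bits) {
  const uint32_t f32_bits = uint32_t{bf16_bits} << 16;  // low mantissa bits are zero
  float f;
  std::memcpy(&f, &f32_bits, sizeof(f));
  return f;
}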
3835
3836// ================================================== CONVERT
3837
3838// ------------------------------ Promotions (part w/ narrow lanes -> full)
3839
3840HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3841  const Vec128<float, 4> v) {
3842 return Vec256<double>{_mm256_cvtps_pd(v.raw)};
3843}
3844
3845HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
3846  const Vec128<int32_t, 4> v) {
3847 return Vec256<double>{_mm256_cvtepi32_pd(v.raw)};
3848}
3849
3850// Unsigned: zero-extend.
3851// Note: these have 3 cycle latency; if inputs are already split across the
3852// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
3853HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
3854 Vec128<uint8_t> v) {
3855 return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
3856}
3857HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3858  Vec128<uint8_t, 8> v) {
3859 return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)};
3860}
3861HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3862 Vec128<uint8_t> v) {
3863 return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
3864}
3865HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3866  Vec128<uint8_t, 8> v) {
3867 return Vec256<int32_t>{_mm256_cvtepu8_epi32(v.raw)};
3868}
3869HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
3870 Vec128<uint16_t> v) {
3871 return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
3872}
3873HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3874 Vec128<uint16_t> v) {
3875 return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
3876}
3877HWY_API Vec256<uint64_t> PromoteTo(Full256<uint64_t> /* tag */,
3878  Vec128<uint32_t> v) {
3879 return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)};
3880}
3881
3882// Signed: replicate sign bit.
3883// Note: these have 3 cycle latency; if inputs are already split across the
3884// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
3885// signed shift would be faster.
3886HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
3887 Vec128<int8_t> v) {
3888 return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
3889}
3890HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3891  Vec128<int8_t, 8> v) {
3892 return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)};
3893}
3894HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
3895 Vec128<int16_t> v) {
3896 return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};
3897}
3898HWY_API Vec256<int64_t> PromoteTo(Full256<int64_t> /* tag */,
3899  Vec128<int32_t> v) {
3900 return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)};
3901}
3902
3903// ------------------------------ Demotions (full -> part w/ narrow lanes)
3904
3905HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
3906 const Vec256<int32_t> v) {
3907 const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
3908 // Concatenating lower halves of both 128-bit blocks afterward is more
3909 // efficient than an extra input with low block = high block of v.
3910 return Vec128<uint16_t>{
3911 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
3912}
3913
3914HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
3915 const Vec256<int32_t> v) {
3916 const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
3917 return Vec128<int16_t>{
3918 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
3919}
3920
3921HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
3922  const Vec256<int32_t> v) {
3923 const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
3924 // Concatenate lower 64 bits of each 128-bit block
3925 const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
3926 const __m128i u16 = _mm256_castsi256_si128(u16_concat);
3927 // packus treats the input as signed; we want unsigned. Clear the MSB to get
3928 // unsigned saturation to u8.
3929 const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
3930 return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
3931}
3932
3933HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
3934 const Vec256<int16_t> v) {
3935 const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
3936 return Vec128<uint8_t>{
3937 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
3938}
3939
3940HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
3941  const Vec256<int32_t> v) {
3942 const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
3943 // Concatenate lower 64 bits of each 128-bit block
3944 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
3945 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
3946 return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
3947}
3948
3949HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
3950 const Vec256<int16_t> v) {
3951 const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
3952 return Vec128<int8_t>{
3953 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
3954}
3955
3956 // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
3957 // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
3958HWY_DIAGNOSTICS(push)
3959HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
3960
3961HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> df16,
3962 const Vec256<float> v) {
3963#ifdef HWY_DISABLE_F16C
3964 const RebindToUnsigned<decltype(df16)> du16;
3965 const Rebind<uint32_t, decltype(df16)> du;
3966 const RebindToSigned<decltype(du)> di;
3967 const auto bits32 = BitCast(du, v);
3968 const auto sign = ShiftRight<31>(bits32);
3969 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
3970 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
3971
3972 const auto k15 = Set(di, 15);
3973 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
3974 const auto is_tiny = exp < Set(di, -24);
3975
3976 const auto is_subnormal = exp < Set(di, -14);
3977 const auto biased_exp16 =
3978 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
3979 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
3980 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
3981 (mantissa32 >> (Set(du, 13) + sub_exp));
3982 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
3983 ShiftRight<13>(mantissa32)); // <1024
3984
3985 const auto sign16 = ShiftLeft<15>(sign);
3986 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3987 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
3988 return BitCast(df16, DemoteTo(du16, bits16));
3989#else
3990 (void)df16;
3991 return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3992#endif
3993}
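The HWY_DISABLE_F16C fallback above repacks sign, rebiased exponent and a truncated mantissa, with extra handling for subnormal and flushed-to-zero outputs. A scalar sketch of the normal path only (ours; no subnormal, overflow, Inf or NaN handling):

#include <cstdint>
#include <cstring>
// Assumes |f| is a normal binary16 value, i.e. in [2^-14, 65504].
static inline uint16_t F32ToF16BitsNormalOnly(float f) {
  uint32_t bits32;
  std::memcpy(&bits32, &f, sizeof(bits32));
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp16 = ((bits32 >> 23) & 0xFF) - 127 + 15;  // rebias 8 -> 5 bits
  const uint32_t mantissa16 = (bits32 & 0x7FFFFF) >> 13;             // keep the top 10 bits
  return static_cast<uint16_t>((sign << 15) | (biased_exp16 << 10) | mantissa16);
}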
3994
3995HWY_DIAGNOSTICS(pop)
3996
3997HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
3998 const Vec256<float> v) {
3999 // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
4000 const Rebind<int32_t, decltype(dbf16)> di32;
4001 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4002 const Rebind<uint16_t, decltype(dbf16)> du16;
4003 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4004 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4005}
4006
4007HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
4008  Vec256<float> a, Vec256<float> b) {
4009 // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
4010 const RebindToUnsigned<decltype(dbf16)> du16;
4011 const Repartition<uint32_t, decltype(dbf16)> du32;
4012 const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
4013 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4014}
4015
4016HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
4017  const Vec256<double> v) {
4018 return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
4019}
4020
4021HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
4022 const Vec256<double> v) {
4023 const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
4024 return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
4025}
4026
4027// For already range-limited input [0, 255].
4028HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
4029 const Full256<uint32_t> d32;
4030 alignas(32) static constexpr uint32_t k8From32[8] = {
4031 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
4032 // Place first four bytes in lo[0], remaining 4 in hi[1].
4033 const auto quad = TableLookupBytes(v, Load(d32, k8From32));
4034 // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
4035 const auto lo = LowerHalf(quad);
4036 const auto hi = UpperHalf(Full128<uint32_t>(), quad);
4037 const auto pair = LowerHalf(lo | hi);
4038 return BitCast(Full64<uint8_t>(), pair);
4039}
4040
4041// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4042
4043HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
4044 const Vec256<int32_t> v) {
4045 return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
4046}
4047
4048HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
4049#if HWY_TARGET <= HWY_AVX3
4050 (void)dd;
4051 return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
4052#else
4053 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4054 const Repartition<uint32_t, decltype(dd)> d32;
4055 const Repartition<uint64_t, decltype(dd)> d64;
4056
4057 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4058 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4059 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4060
4061 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4062 const auto k52 = Set(d32, 0x43300000);
4063 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4064
4065 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4066 return (v_upper - k84_63_52) + v_lower; // order matters!
4067#endif
4068}
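A scalar rendering of the magic-number path above (our sketch): the low and high 32-bit halves of the int64 are planted into the mantissas of two large doubles, and one subtraction plus one addition recombines them; as in the vector code, the order of operations matters.

#include <cstdint>
#include <cstring>
static inline double I64ToF64Magic(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  const uint64_t lo_bits = 0x4330000000000000ull | (u & 0xFFFFFFFFull);  // 2^52 + low32
  const uint64_t hi_bits = 0x4530000080000000ull ^ (u >> 32);  // exponent 2^84, mantissa = high32 ^ 2^31
  const uint64_t k_bits = 0x4530000080100000ull;                // 2^84 + 2^63 + 2^52
  double lo, hi, k84_63_52;
  std::memcpy(&lo, &lo_bits, sizeof(lo));
  std::memcpy(&hi, &hi_bits, sizeof(hi));
  std::memcpy(&k84_63_52, &k_bits, sizeof(k84_63_52));
  return (hi - k84_63_52) + lo;
}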
4069
4070// Truncates (rounds toward zero).
4071HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
4072 return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
4073}
4074
4075HWY_API Vec256<int64_t> ConvertTo(Full256<int64_t> di, const Vec256<double> v) {
4076#if HWY_TARGET <= HWY_AVX3
4077 return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
4078#else
4079 using VI = decltype(Zero(di));
4080 const VI k0 = Zero(di);
4081 const VI k1 = Set(di, 1);
4082 const VI k51 = Set(di, 51);
4083
4084 // Exponent indicates whether the number can be represented as int64_t.
4085 const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4086 const VI exp = biased_exp - Set(di, 0x3FF);
4087 const auto in_range = exp < Set(di, 63);
4088
4089 // If we were to cap the exponent at 51 and add 2^52, the number would be in
4090 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4091 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4092 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4093 // manually shift the mantissa into place (we already have many of the
4094 // inputs anyway).
4095 const VI shift_mnt = Max(k51 - exp, k0);
4096 const VI shift_int = Max(exp - k51, k0);
4097 const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4098 // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
4099 const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4100 // For inputs larger than 2^52, insert zeros at the bottom.
4101 const VI shifted = int52 << shift_int;
4102 // Restore the one bit lost when shifting in the implicit 1-bit.
4103 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4104
4105 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4106 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4107 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4108 const VI magnitude = IfThenElse(in_range, restored, limit);
4109
4110 // If the input was negative, negate the integer (two's complement).
4111 return (magnitude ^ sign_mask) - sign_mask;
4112#endif
4113}
4114
4115HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
4116 const Full256<int32_t> di;
4117 return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
4118}
4119
4120
4121HWY_API Vec256<float> PromoteTo(Full256<float> df32,
4122 const Vec128<float16_t> v) {
4123#ifdef HWY_DISABLE_F16C
4124 const RebindToSigned<decltype(df32)> di32;
4125 const RebindToUnsigned<decltype(df32)> du32;
4126 // Expand to u32 so we can shift.
4127 const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
4128 const auto sign = ShiftRight<15>(bits16);
4129 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4130 const auto mantissa = bits16 & Set(du32, 0x3FF);
4131 const auto subnormal =
4132 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
4133 Set(df32, 1.0f / 16384 / 1024));
4134
4135 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4136 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4137 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4138 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
4139 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4140#else
4141 (void)df32;
4142 return Vec256<float>{_mm256_cvtph_ps(v.raw)};
4143#endif
4144}
4145
4146HWY_API Vec256<float> PromoteTo(Full256<float> df32,
4147 const Vec128<bfloat16_t> v) {
4148 const Rebind<uint16_t, decltype(df32)> du16;
4149 const RebindToSigned<decltype(df32)> di32;
4150 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4151}
4152
4153// ================================================== CRYPTO
4154
4155#if !defined(HWY_DISABLE_PCLMUL_AES)
4156
4157// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4158#ifdef HWY_NATIVE_AES
4159#undef HWY_NATIVE_AES
4160#else
4161#define HWY_NATIVE_AES
4162#endif
4163
4164HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
4165  Vec256<uint8_t> round_key) {
4166#if HWY_TARGET == HWY_AVX3_DL
4167 return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
4168#else
4169 const Full256<uint8_t> d;
4170 const Half<decltype(d)> d2;
4171 return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
4172 AESRound(LowerHalf(state), LowerHalf(round_key)));
4173#endif
4174}
4175
4176HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
4177  Vec256<uint8_t> round_key) {
4178#if HWY_TARGET == HWY_AVX3_DL
4179 return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
4180#else
4181 const Full256<uint8_t> d;
4182 const Half<decltype(d)> d2;
4183 return Combine(d,
4184 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
4185 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
4186#endif
4187}
4188
4189HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
4190#if HWY_TARGET == HWY_AVX3_DL
4191 return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
4192#else
4193 const Full256<uint64_t> d;
4194 const Half<decltype(d)> d2;
4195 return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
4196  CLMulLower(LowerHalf(a), LowerHalf(b)));
4197#endif
4198}
4199
4200HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
4201#if HWY_TARGET == HWY_AVX3_DL
4202 return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
4203#else
4204 const Full256<uint64_t> d;
4205 const Half<decltype(d)> d2;
4206 return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
4207  CLMulUpper(LowerHalf(a), LowerHalf(b)));
4208#endif
4209}
4210
4211#endif // HWY_DISABLE_PCLMUL_AES
4212
4213// ================================================== MISC
4214
4215// Returns a vector with lane i=[0, N) set to "first" + i.
4216template <typename T, typename T2>
4217HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
4218 HWY_ALIGN T lanes[32 / sizeof(T)];
4219 for (size_t i = 0; i < 32 / sizeof(T); ++i) {
4220 lanes[i] = static_cast<T>(first + static_cast<T2>(i));
4221 }
4222 return Load(d, lanes);
4223}
4224
4225#if HWY_TARGET <= HWY_AVX3
4226
4227// ------------------------------ LoadMaskBits
4228
4229// `p` points to at least 8 readable bytes, not all of which need be valid.
4230template <typename T>
4231HWY_API Mask256<T> LoadMaskBits(const Full256<T> /* tag */,
4232 const uint8_t* HWY_RESTRICT bits) {
4233 constexpr size_t N = 32 / sizeof(T);
4234 constexpr size_t kNumBytes = (N + 7) / 8;
4235
4236 uint64_t mask_bits = 0;
4237 CopyBytes<kNumBytes>(bits, &mask_bits);
4238
4239 if (N < 8) {
4240 mask_bits &= (1ull << N) - 1;
4241 }
4242
4243 return Mask256<T>::FromBits(mask_bits);
4244}
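Usage note (ours, not from the header): the bits are packed LSB-first, one bit per lane, so for 32-bit lanes (8 lanes) a single byte selects all lanes, e.g. bits[0] == 0b10100101 marks lanes 0, 2, 5 and 7 as true; lanes beyond N are cleared by the mask applied above.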
4245
4246// ------------------------------ StoreMaskBits
4247
4248// `p` points to at least 8 writable bytes.
4249template <typename T>
4250HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4251 uint8_t* bits) {
4252 constexpr size_t N = 32 / sizeof(T);
4253 constexpr size_t kNumBytes = (N + 7) / 8;
4254
4255 CopyBytes<kNumBytes>(&mask.raw, bits);
4256
4257 // Non-full byte, need to clear the undefined upper bits.
4258 if (N < 8) {
4259 const int mask = static_cast<int>((1ull << N) - 1);
4260 bits[0] = static_cast<uint8_t>(bits[0] & mask);
4261 }
4262 return kNumBytes;
4263}
4264
4265// ------------------------------ Mask testing
4266
4267template <typename T>
4268HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4269 return PopCount(static_cast<uint64_t>(mask.raw));
4270}
4271
4272template <typename T>
4273HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4274 const Mask256<T> mask) {
4275 return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
4276}
4277
4278// Beware: the suffix indicates the number of mask bits, not lane size!
4279
4280namespace detail {
4281
4282template <typename T>
4283HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
4284#if HWY_COMPILER_HAS_MASK_INTRINSICS
4285 return _kortestz_mask32_u8(mask.raw, mask.raw);
4286#else
4287 return mask.raw == 0;
4288#endif
4289}
4290template <typename T>
4291HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
4292#if HWY_COMPILER_HAS_MASK_INTRINSICS
4293 return _kortestz_mask16_u8(mask.raw, mask.raw);
4294#else
4295 return mask.raw == 0;
4296#endif
4297}
4298template <typename T>
4299HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
4300#if HWY_COMPILER_HAS_MASK_INTRINSICS
4301 return _kortestz_mask8_u8(mask.raw, mask.raw);
4302#else
4303 return mask.raw == 0;
4304#endif
4305}
4306template <typename T>
4307HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
4308 return (uint64_t{mask.raw} & 0xF) == 0;
4309}
4310
4311} // namespace detail
4312
4313template <typename T>
4314HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4315 return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
4316}
4317
4318namespace detail {
4319
4320template <typename T>
4321HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
4322#if HWY_COMPILER_HAS_MASK_INTRINSICS
4323 return _kortestc_mask32_u8(mask.raw, mask.raw);
4324#else
4325 return mask.raw == 0xFFFFFFFFu;
4326#endif
4327}
4328template <typename T>
4329HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
4330#if HWY_COMPILER_HAS_MASK_INTRINSICS
4331 return _kortestc_mask16_u8(mask.raw, mask.raw);
4332#else
4333 return mask.raw == 0xFFFFu;
4334#endif
4335}
4336template <typename T>
4337HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
4338#if HWY_COMPILER_HAS_MASK_INTRINSICS
4339 return _kortestc_mask8_u8(mask.raw, mask.raw);
4340#else
4341 return mask.raw == 0xFFu;
4342#endif
4343}
4344template <typename T>
4345HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
4346 // Cannot use _kortestc because we have less than 8 mask bits.
4347 return mask.raw == 0xFu;
4348}
4349
4350} // namespace detail
4351
4352template <typename T>
4353HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4354 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
4355}
4356
4357// ------------------------------ Compress
4358
4359// 16-bit is defined in x86_512 so we can use 512-bit vectors.
4360
4361template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4362HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4363 return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
4364}
4365
4366HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) {
4367 return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
4368}
4369
4370template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4371HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
4372 // See CompressIsPartition.
4373 alignas(16) constexpr uint64_t packed_array[16] = {
4374 // PrintCompress64x4NibbleTables
4375 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
4376 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
4377 0x00001032, 0x00001320, 0x00000321, 0x00003210};
4378
4379 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4380 // _mm256_permutexvar_epi64 will ignore the upper bits.
4381 const Full256<T> d;
4382 const RebindToUnsigned<decltype(d)> du64;
4383 const auto packed = Set(du64, packed_array[mask.raw]);
4384 alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4385 const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4386 return TableLookupLanes(v, indices);
4387}
4388
4389// ------------------------------ CompressNot (Compress)
4390
4391template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
4392HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
4393 return Compress(v, Not(mask));
4394}
4395
4396template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4397HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
4398 // See CompressIsPartition.
4399 alignas(16) constexpr uint64_t packed_array[16] = {
4400 // PrintCompressNot64x4NibbleTables
4401 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
4402 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
4403 0x00003210, 0x00003201, 0x00003210, 0x00003210};
4404
4405 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
4406 // _mm256_permutexvar_epi64 will ignore the upper bits.
4407 const Full256<T> d;
4408 const RebindToUnsigned<decltype(d)> du64;
4409 const auto packed = Set(du64, packed_array[mask.raw]);
4410 alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4411 const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
4412 return TableLookupLanes(v, indices);
4413}
4414
4415// ------------------------------ CompressBlocksNot
4416HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
4417 Mask256<uint64_t> mask) {
4418 return CompressNot(v, mask);
4419}
4420
4421// ------------------------------ CompressBits (LoadMaskBits)
4422template <typename T>
4423HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4424 return Compress(v, LoadMaskBits(Full256<T>(), bits));
4425}
4426
4427// ------------------------------ CompressStore
4428
4429template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4430HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
4431  T* HWY_RESTRICT unaligned) {
4432 const Rebind<uint16_t, decltype(d)> du;
4433 const auto vu = BitCast(du, v); // (required for float16_t inputs)
4434
4435 const uint64_t mask_bits{mask.raw};
4436
4437#if HWY_TARGET == HWY_AVX3_DL // VBMI2
4438 _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
4439#else
4440 // Split into halves to keep the table size manageable.
4441 const Half<decltype(du)> duh;
4442 const auto vL = LowerHalf(duh, vu);
4443 const auto vH = UpperHalf(duh, vu);
4444
4445 const uint64_t mask_bitsL = mask_bits & 0xFF;
4446 const uint64_t mask_bitsH = mask_bits >> 8;
4447
4448 const auto idxL = detail::IndicesForCompress16(mask_bitsL);
4449 const auto idxH = detail::IndicesForCompress16(mask_bitsH);
4450
4451 // Compress each 128-bit half.
4452 const Vec128<uint16_t> cL{_mm_permutexvar_epi16(idxL.raw, vL.raw)};
4453 const Vec128<uint16_t> cH{_mm_permutexvar_epi16(idxH.raw, vH.raw)};
4454 const Half<decltype(d)> dh;
4455 StoreU(BitCast(dh, cL), dh, unaligned);
4456 StoreU(BitCast(dh, cH), dh, unaligned + PopCount(mask_bitsL));
4457#endif // HWY_TARGET == HWY_AVX3_DL
4458
4459 return PopCount(mask_bits);
4460}
4461
4462template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4463HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4464 T* HWY_RESTRICT unaligned) {
4465 _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4466 const size_t count = PopCount(uint64_t{mask.raw});
4467 // Workaround for MSAN not marking output as initialized (b/233326619)
4468#if HWY_IS_MSAN
4469 __msan_unpoison(unaligned, count * sizeof(T));
4470#endif
4471 return count;
4472}
4473
4474template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4475HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
4476 T* HWY_RESTRICT unaligned) {
4477 _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4478 const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4479 // Workaround for MSAN not marking output as initialized (b/233326619)
4480#if HWY_IS_MSAN
4481 __msan_unpoison(unaligned, count * sizeof(T));
4482#endif
4483 return count;
4484}
4485
4486HWY_API size_t CompressStore(Vec256<float> v, Mask256<float> mask,
4487  Full256<float> /* tag */,
4488 float* HWY_RESTRICT unaligned) {
4489 _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4490 const size_t count = PopCount(uint64_t{mask.raw});
4491 // Workaround for MSAN not marking output as initialized (b/233326619)
4492#if HWY_IS_MSAN
4493 __msan_unpoison(unaligned, count * sizeof(float));
4494#endif
4495 return count;
4496}
4497
4498HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
4499  Full256<double> /* tag */,
4500 double* HWY_RESTRICT unaligned) {
4501 _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4502 const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
4503 // Workaround for MSAN not marking output as initialized (b/233326619)
4504#if HWY_IS_MSAN
4505 __msan_unpoison(unaligned, count * sizeof(double));
4506#endif
4507 return count;
4508}
4509
4510// ------------------------------ CompressBlendedStore (CompressStore)
4511
4512template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4513HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4514  T* HWY_RESTRICT unaligned) {
4515 // Native (32 or 64-bit) AVX-512 instruction already does the blending at no
4516 // extra cost (latency 11, rthroughput 2 - same as compress plus store).
4517 return CompressStore(v, m, d, unaligned);
4518}
4519
4520template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4521HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4522 T* HWY_RESTRICT unaligned) {
4523#if HWY_TARGET <= HWY_AVX3_DL
4524 return CompressStore(v, m, d, unaligned); // also native
4525#else
4526 const size_t count = CountTrue(d, m);
4527 BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
4528 // Workaround for MSAN not marking output as initialized (b/233326619)
4529#if HWY_IS_MSAN
4530 __msan_unpoison(unaligned, count * sizeof(T));
4531#endif
4532 return count;
4533#endif
4534}
4535
4536// ------------------------------ CompressBitsStore (LoadMaskBits)
4537
4538template <typename T>
4539HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
4540 Full256<T> d, T* HWY_RESTRICT unaligned) {
4541 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4542}
4543
4544#else // AVX2
4545
4546// ------------------------------ LoadMaskBits (TestBit)
4547
4548namespace detail {
4549
4550// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there.
4551template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4552HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4553 const RebindToUnsigned<decltype(d)> du;
4554 const Repartition<uint32_t, decltype(d)> du32;
4555 const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
4556
4557 // Replicate bytes 8x such that each byte contains the bit that governs it.
4558 const Repartition<uint64_t, decltype(d)> du64;
4559 alignas(32) constexpr uint64_t kRep8[4] = {
4560 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
4561 0x0303030303030303ull};
4562 const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
4563
4564 alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4565 1, 2, 4, 8, 16, 32, 64, 128};
4566 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
4567}
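Put differently, lane i of the resulting mask is bit (i % 8) of byte (i / 8) of mask_bits; the byte replication plus TestBit above computes exactly that. Scalar sketch (ours):

#include <cstddef>
#include <cstdint>
static inline bool MaskBitForLane(uint64_t mask_bits, size_t lane) {  // lane < 32
  const uint8_t byte = static_cast<uint8_t>(mask_bits >> (8 * (lane / 8)));
  return ((byte >> (lane % 8)) & 1u) != 0;
}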
4568
4569template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4570HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4571 const RebindToUnsigned<decltype(d)> du;
4572 alignas(32) constexpr uint16_t kBit[16] = {
4573 1, 2, 4, 8, 16, 32, 64, 128,
4574 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
4575 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
4576 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4577}
4578
4579template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4580HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4581 const RebindToUnsigned<decltype(d)> du;
4582 alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4583 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
4584 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
4585}
4586
4587template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4588HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4589 const RebindToUnsigned<decltype(d)> du;
4590 alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
4591 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
4592}
4593
4594} // namespace detail
4595
4596// `p` points to at least 8 readable bytes, not all of which need be valid.
4597template <typename T>
4598HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
4599 const uint8_t* HWY_RESTRICT bits) {
4600 constexpr size_t N = 32 / sizeof(T);
4601 constexpr size_t kNumBytes = (N + 7) / 8;
4602
4603 uint64_t mask_bits = 0;
4604 CopyBytes<kNumBytes>(bits, &mask_bits);
4605
4606 if (N < 8) {
4607 mask_bits &= (1ull << N) - 1;
4608 }
4609
4610 return detail::LoadMaskBits256(d, mask_bits);
4611}
4612
4613// ------------------------------ StoreMaskBits
4614
4615namespace detail {
4616
4617template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4618HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4619 const Full256<T> d;
4620 const Full256<uint8_t> d8;
4621 const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
4622 // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
4623 return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
4624}
4625
4626template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4627HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4628#if HWY_ARCH_X86_64
4629 const Full256<T> d;
4630 const Full256<uint8_t> d8;
4631 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4632 const uint64_t sign_bits8 = BitsFromMask(mask8);
4633 // Skip the bits from the lower byte of each u16 (better not to use the
4634 // same packs_epi16 as SSE4, because that requires an extra swizzle here).
4635 return _pext_u64(sign_bits8, 0xAAAAAAAAull);
4636#else
4637 // Slow workaround for 32-bit builds, which lack _pext_u64.
4638 // Remove useless lower half of each u16 while preserving the sign bit.
4639 // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
4640 const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
4641 // Move odd qwords (value zero) to top so they don't affect the mask value.
4642 const auto compressed =
4643 _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
4644 return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
4645#endif // HWY_ARCH_X86_64
4646}
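On the 64-bit path above, the byte-level movemask yields two bits per u16 lane, and _pext_u64 with the 0xAAAAAAAA pattern keeps only the upper bit of each pair. Scalar equivalent of that extraction (ours):

#include <cstdint>
// Emulates _pext_u64(sign_bits8, 0xAAAAAAAAull) for a 32-bit movemask result.
static inline uint64_t OddBitsOf32(uint64_t sign_bits8) {
  uint64_t out = 0;
  for (int i = 0; i < 16; ++i) {
    out |= ((sign_bits8 >> (2 * i + 1)) & 1u) << i;  // bit 2i+1 -> bit i
  }
  return out;
}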
4647
4648template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4649HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4650 const Full256<T> d;
4651 const Full256<float> df;
4652 const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4653 return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
4654}
4655
4656template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4657HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
4658 const Full256<T> d;
4659 const Full256<double> df;
4660 const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
4661 return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
4662}
4663
4664} // namespace detail
4665
4666// `p` points to at least 8 writable bytes.
4667template <typename T>
4668HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
4669 uint8_t* bits) {
4670 constexpr size_t N = 32 / sizeof(T);
4671 constexpr size_t kNumBytes = (N + 7) / 8;
4672
4673 const uint64_t mask_bits = detail::BitsFromMask(mask);
4674 CopyBytes<kNumBytes>(&mask_bits, bits);
4675 return kNumBytes;
4676}
4677
4678// ------------------------------ Mask testing
4679
4680// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
4681// lane is 0 or ~0.
4682template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4683HWY_API bool AllFalse(const Full256<T> d, const Mask256<T> mask) {
4684 const Repartition<uint8_t, decltype(d)> d8;
4685 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4686 return detail::BitsFromMask(mask8) == 0;
4687}
4688
4689template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4690HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
4691 // Cheaper than PTEST, which is 2 uop / 3L.
4692 return detail::BitsFromMask(mask) == 0;
4693}
4694
4695template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4696HWY_API bool AllTrue(const Full256<T> d, const Mask256<T> mask) {
4697 const Repartition<uint8_t, decltype(d)> d8;
4698 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4699 return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
4700}
4701template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4702HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4703 constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
4704 return detail::BitsFromMask(mask) == kAllBits;
4705}
4706
4707template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4708HWY_API size_t CountTrue(const Full256<T> d, const Mask256<T> mask) {
4709 const Repartition<uint8_t, decltype(d)> d8;
4710 const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
4711 return PopCount(detail::BitsFromMask(mask8)) >> 1;
4712}
4713template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4714HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
4715 return PopCount(detail::BitsFromMask(mask));
4716}
4717
4718template <typename T>
4719HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
4720 const Mask256<T> mask) {
4721 const uint64_t mask_bits = detail::BitsFromMask(mask);
4722 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
4723}
4724
4725// ------------------------------ Compress, CompressBits
4726
4727namespace detail {
4728
4729template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4730HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
4731 uint64_t mask_bits) {
4732 const RebindToUnsigned<decltype(d)> d32;
4733 // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4734 // of SetTableIndices would require 8 KiB, a large part of L1D. The other
4735 // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
4736 // and unavailable in 32-bit builds. We instead compress each index into 4
4737 // bits, for a total of 1 KiB.
4738 alignas(16) constexpr uint32_t packed_array[256] = {
4739 // PrintCompress32x8Tables
4740 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
4741 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
4742 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
4743 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
4744 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
4745 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
4746 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
4747 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
4748 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
4749 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
4750 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
4751 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
4752 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
4753 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
4754 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
4755 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
4756 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
4757 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
4758 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
4759 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
4760 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
4761 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
4762 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
4763 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
4764 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
4765 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
4766 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
4767 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
4768 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
4769 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
4770 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
4771 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
4772 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
4773 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
4774 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
4775 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
4776 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
4777 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
4778 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
4779 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
4780 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
4781 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
4782 0x10765432, 0x17654320, 0x07654321, 0x76543210};
4783
4784 // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
4785 // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4786 // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
4787 // latency, it may be faster to use LoadDup128 and PSHUFB.
4788 const auto packed = Set(d32, packed_array[mask_bits]);
4789 alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4790 return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
4791}
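Each packed_array entry above holds eight 4-bit lane indices in one uint32_t; shifting each broadcast copy right by 0, 4, ..., 28 brings one nibble to the bottom, and _mm256_permutevar8x32_epi32 then reads only the low three bits. Scalar sketch of the decoding (ours):

#include <cstdint>
static inline void UnpackNibbleIndices(uint32_t packed, uint32_t indices[8]) {
  for (int i = 0; i < 8; ++i) {
    indices[i] = (packed >> (4 * i)) & 0xF;  // the vector code may omit the mask
  }
}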
4792
4793template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4794HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
4795 uint64_t mask_bits) {
4796 const Repartition<uint32_t, decltype(d)> d32;
4797
4798 // For 64-bit, we still need 32-bit indices because there is no 64-bit
4799 // permutevar, but there are only 4 lanes, so we can afford to skip the
4800 // unpacking and load the entire index vector directly.
4801 alignas(32) constexpr uint32_t u32_indices[128] = {
4802 // PrintCompress64x4PairTables
4803 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
4804 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
4805 2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
4806 0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
4807 0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
4808 2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
4809 return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
4810}
4811
4812template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4813HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
4814 uint64_t mask_bits) {
4815 const RebindToUnsigned<decltype(d)> d32;
4816 // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4817 // of SetTableIndices would require 8 KiB, a large part of L1D. The other
4818 // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
4819 // and unavailable in 32-bit builds. We instead compress each index into 4
4820 // bits, for a total of 1 KiB.
4821 alignas(16) constexpr uint32_t packed_array[256] = {
4822 // PrintCompressNot32x8Tables
4823 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
4824 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
4825 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
4826 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
4827 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
4828 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
4829 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
4830 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
4831 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
4832 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
4833 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
4834 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
4835 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
4836 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
4837 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
4838 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
4839 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
4840 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
4841 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
4842 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
4843 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
4844 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
4845 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
4846 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
4847 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
4848 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
4849 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
4850 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
4851 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
4852 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
4853 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
4854 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
4855 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
4856 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
4857 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
4858 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
4859 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
4860 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
4861 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
4862 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
4863 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
4864 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
4865 0x76543210, 0x76543201, 0x76543210, 0x76543210};
4866
4867 // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
4868 // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4869 // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
4870 // latency, it may be faster to use LoadDup128 and PSHUFB.
4871 const auto packed = Set(d32, packed_array[mask_bits]);
4872 alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4873 return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
4874}
4875
4876template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4877HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
4878 uint64_t mask_bits) {
4879 const Repartition<uint32_t, decltype(d)> d32;
4880
4881 // For 64-bit, we still need 32-bit indices because there is no 64-bit
4882 // permutevar, but there are only 4 lanes, so we can afford to skip the
4883 // unpacking and load the entire index vector directly.
4884 alignas(32) constexpr uint32_t u32_indices[128] = {
4885 // PrintCompressNot64x4PairTables
4886 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
4887 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
4888 0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
4889 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
4890 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
4891 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
4892 return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
4893}
4894template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4895HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4896 const Full256<T> d;
4897 const Repartition<uint32_t, decltype(d)> du32;
4898
4899 HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
4900 const auto indices = IndicesFromBits(d, mask_bits);
4901 return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
4902}
4903
4904// LUTs are infeasible for 2^16 possible masks, so splice together two
4905// half-vector Compress.
4906template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4907HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4908 const Full256<T> d;
4909 const RebindToUnsigned<decltype(d)> du;
4910 const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
4911 const Half<decltype(du)> duh;
4912 const auto half0 = LowerHalf(duh, vu16);
4913 const auto half1 = UpperHalf(duh, vu16);
4914
4915 const uint64_t mask_bits0 = mask_bits & 0xFF;
4916 const uint64_t mask_bits1 = mask_bits >> 8;
4917 const auto compressed0 = detail::CompressBits(half0, mask_bits0);
4918 const auto compressed1 = detail::CompressBits(half1, mask_bits1);
4919
4920 alignas(32) uint16_t all_true[16] = {};
4921 // Store mask=true lanes, left to right.
4922 const size_t num_true0 = PopCount(mask_bits0);
4923 Store(compressed0, duh, all_true);
4924 StoreU(compressed1, duh, all_true + num_true0);
4925
4926 if (CompressIsPartition<T>::value) {
4927 // Store mask=false lanes, right to left. The second vector fills the upper
4928 // half with right-aligned false lanes. The first vector is shifted
4929 // rightwards to overwrite the true lanes of the second.
4930 alignas(32) uint16_t all_false[16] = {};
4931 const size_t num_true1 = PopCount(mask_bits1);
4932 Store(compressed1, duh, all_false + 8);
4933 StoreU(compressed0, duh, all_false + num_true1);
4934
4935 const auto mask = FirstN(du, num_true0 + num_true1);
4936 return BitCast(d,
4937 IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
4938 } else {
4939 // Only care about the mask=true lanes.
4940 return BitCast(d, Load(du, all_true));
4941 }
4942}
4943
4944template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4945HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
4946 const Full256<T> d;
4947 const Repartition<uint32_t, decltype(d)> du32;
4948
4949 HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
4950 const auto indices = IndicesFromNotBits(d, mask_bits);
4951 return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
4952}
4953
4954// LUTs are infeasible for 2^16 possible masks, so splice together two
4955// half-vector Compress.
4956template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4957HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
4958 // Compress ensures only the lower 16 bits are set, so flip those.
4959 return Compress(v, mask_bits ^ 0xFFFF);
4960}
4961
4962} // namespace detail
4963
4964template <typename T>
4965HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
4966 return detail::Compress(v, detail::BitsFromMask(m));
4967}
4968
4969template <typename T>
4970HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
4971 return detail::CompressNot(v, detail::BitsFromMask(m));
4972}
4973
4974HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
4975 Mask256<uint64_t> mask) {
4976 return CompressNot(v, mask);
4977}
4978
4979template <typename T>
4980HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4981 constexpr size_t N = 32 / sizeof(T);
4982 constexpr size_t kNumBytes = (N + 7) / 8;
4983
4984 uint64_t mask_bits = 0;
4985 CopyBytes<kNumBytes>(bits, &mask_bits);
4986
4987 if (N < 8) {
4988 mask_bits &= (1ull << N) - 1;
4989 }
4990
4991 return detail::Compress(v, mask_bits);
4992}
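// Usage sketch (illustrative only; added for this documentation, not part of
// the upstream header, and the helper name is made up): serialize a mask via
// StoreMaskBits (one bit per lane, lane 0 in the least-significant bit), then
// compact the selected lanes with CompressBits.
HWY_INLINE Vec256<int32_t> ExampleCompressPositive(Vec256<int32_t> v) {
  const Full256<int32_t> d;
  uint8_t bits[(8 + 7) / 8];            // 8 lanes of i32 -> 1 byte of mask bits
  StoreMaskBits(d, v > Zero(d), bits);
  return CompressBits(v, bits);         // positive lanes moved to the front
}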
4993
4994// ------------------------------ CompressStore, CompressBitsStore
4995
4996template <typename T>
4997HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
4998 T* HWY_RESTRICT unaligned) {
4999 const uint64_t mask_bits = detail::BitsFromMask(m);
5000 const size_t count = PopCount(mask_bits);
5001 StoreU(detail::Compress(v, mask_bits), d, unaligned);
5002 // Workaround for MSAN not marking output as initialized (b/233326619)
5003#if HWY_IS_MSAN
5004 __msan_unpoison(unaligned, count * sizeof(T));
5005#endif
5006 return count;
5007}
5008
5009template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
5010HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5011 T* HWY_RESTRICT unaligned) {
5012 const uint64_t mask_bits = detail::BitsFromMask(m);
5013 const size_t count = PopCount(mask_bits);
5014 BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
5015 // Workaround for MSAN not marking output as initialized (b/233326619)
5016#if HWY_IS_MSAN
5017 __msan_unpoison(unaligned, count * sizeof(T));
5018#endif
5019 return count;
5020}
5021
5022template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5023HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
5024 T* HWY_RESTRICT unaligned) {
5025 const uint64_t mask_bits = detail::BitsFromMask(m);
5026 const size_t count = PopCount(mask_bits);
5027 const Vec256<T> compressed = detail::Compress(v, mask_bits);
5028
5029#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN
5030 // BlendedStore tests mask for each lane, but we know that the mask is
5031 // FirstN, so we can just copy.
5032 alignas(32) T buf[16];
5033 Store(compressed, d, buf);
5034 memcpy(unaligned, buf, count * sizeof(T));
5035#else
5036 BlendedStore(compressed, FirstN(d, count), d, unaligned);
5037#endif
5038 return count;
5039}
5040
5041template <typename T>
5042HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
5043 Full256<T> d, T* HWY_RESTRICT unaligned) {
5044 constexpr size_t N = 32 / sizeof(T);
5045 constexpr size_t kNumBytes = (N + 7) / 8;
5046
5047 uint64_t mask_bits = 0;
5048 CopyBytes<kNumBytes>(bits, &mask_bits);
5049
5050 if (N < 8) {
5051 mask_bits &= (1ull << N) - 1;
5052 }
5053 const size_t count = PopCount(mask_bits);
5054
5055 StoreU(detail::Compress(v, mask_bits), d, unaligned);
5056 // Workaround for MSAN not marking output as initialized (b/233326619)
5057#if HWY_IS_MSAN
5058 __msan_unpoison(unaligned, count * sizeof(T));
5059#endif
5060 return count;
5061}
5062
5063#endif // HWY_TARGET <= HWY_AVX3
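// Usage sketch (illustrative only; added for this documentation, not part of
// the upstream header, and the helper name is made up): stream compaction
// that appends only the positive lanes to `out` and returns the new length.
// CompressStore writes a full vector, so `out` needs Lanes(d) elements of
// slack beyond the compacted data.
HWY_INLINE size_t ExampleFilterPositive(const int32_t* HWY_RESTRICT in,
                                        size_t count,
                                        int32_t* HWY_RESTRICT out) {
  const Full256<int32_t> d;
  const size_t N = Lanes(d);  // 8 lanes of int32_t
  size_t written = 0;
  size_t i = 0;
  for (; i + N <= count; i += N) {
    const Vec256<int32_t> v = LoadU(d, in + i);
    written += CompressStore(v, v > Zero(d), d, out + written);
  }
  for (; i < count; ++i) {  // scalar remainder, kept simple for the sketch
    if (in[i] > 0) out[written++] = in[i];
  }
  return written;
}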
5064
5065// ------------------------------ LoadInterleaved3/4
5066
5067// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
5068
5069namespace detail {
5070
5071// Input:
5072// 1 0 (<- first block of unaligned)
5073// 3 2
5074// 5 4
5075// Output:
5076// 3 0
5077// 4 1
5078// 5 2
5079template <typename T>
5080HWY_API void LoadTransposedBlocks3(Full256<T> d,
5081 const T* HWY_RESTRICT unaligned,
5082 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
5083 constexpr size_t N = 32 / sizeof(T);
5084 const Vec256<T> v10 = LoadU(d, unaligned + 0 * N); // 1 0
5085 const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
5086 const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
5087
5088 A = ConcatUpperLower(d, v32, v10);
5089 B = ConcatLowerUpper(d, v54, v10);
5090 C = ConcatUpperLower(d, v54, v32);
5091}
5092
5093// Input (128-bit blocks):
5094// 1 0 (first block of unaligned)
5095// 3 2
5096// 5 4
5097// 7 6
5098// Output:
5099// 4 0 (LSB of A)
5100// 5 1
5101// 6 2
5102// 7 3
5103template <typename T>
5104HWY_API void LoadTransposedBlocks4(Full256<T> d,
5105 const T* HWY_RESTRICT unaligned,
5106 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
5107 Vec256<T>& D) {
5108 constexpr size_t N = 32 / sizeof(T);
5109 const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);
5110 const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
5111 const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
5112 const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);
5113
5114 A = ConcatLowerLower(d, v54, v10);
5115 B = ConcatUpperUpper(d, v54, v10);
5116 C = ConcatLowerLower(d, v76, v32);
5117 D = ConcatUpperUpper(d, v76, v32);
5118}
5119
5120} // namespace detail
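// Usage sketch (illustrative only; added for this documentation, not part of
// the upstream header, and the helper name is made up). Shown as application
// code: the public LoadInterleaved3 from generic_ops-inl.h calls the
// LoadTransposedBlocks3 overload above to deinterleave packed RGB bytes into
// three planar vectors.
HWY_INLINE void ExampleLoadRGB(const uint8_t* HWY_RESTRICT rgb,
                               Vec256<uint8_t>& r, Vec256<uint8_t>& g,
                               Vec256<uint8_t>& b) {
  const Full256<uint8_t> d;
  LoadInterleaved3(d, rgb, r, g, b);  // reads 3 * 32 = 96 bytes
}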
5121
5122// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
5123
5124// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
5125
5126namespace detail {
5127
5128// Input (128-bit blocks):
5129// 2 0 (LSB of i)
5130// 3 1
5131// Output:
5132// 1 0
5133// 3 2
5134template <typename T>
5135HWY_API void StoreTransposedBlocks2(const Vec256<T> i, const Vec256<T> j,
5136 const Full256<T> d,
5137 T* HWY_RESTRICT unaligned) {
5138 constexpr size_t N = 32 / sizeof(T);
5139 const auto out0 = ConcatLowerLower(d, j, i);
5140 const auto out1 = ConcatUpperUpper(d, j, i);
5141 StoreU(out0, d, unaligned + 0 * N);
5142 StoreU(out1, d, unaligned + 1 * N);
5143}
5144
5145// Input (128-bit blocks):
5146// 3 0 (LSB of i)
5147// 4 1
5148// 5 2
5149// Output:
5150// 1 0
5151// 3 2
5152// 5 4
5153template <typename T>
5154HWY_API void StoreTransposedBlocks3(const Vec256<T> i, const Vec256<T> j,
5155 const Vec256<T> k, Full256<T> d,
5156 T* HWY_RESTRICT unaligned) {
5157 constexpr size_t N = 32 / sizeof(T);
5158 const auto out0 = ConcatLowerLower(d, j, i);
5159 const auto out1 = ConcatUpperLower(d, i, k);
5160 const auto out2 = ConcatUpperUpper(d, k, j);
5161 StoreU(out0, d, unaligned + 0 * N);
5162 StoreU(out1, d, unaligned + 1 * N);
5163 StoreU(out2, d, unaligned + 2 * N);
5164}
5165
5166// Input (128-bit blocks):
5167// 4 0 (LSB of i)
5168// 5 1
5169// 6 2
5170// 7 3
5171// Output:
5172// 1 0
5173// 3 2
5174// 5 4
5175// 7 6
5176template <typename T>
5177HWY_API void StoreTransposedBlocks4(const Vec256<T> i, const Vec256<T> j,
5178 const Vec256<T> k, const Vec256<T> l,
5179 Full256<T> d, T* HWY_RESTRICT unaligned) {
5180 constexpr size_t N = 32 / sizeof(T);
5181 // Write lower halves, then upper.
5182 const auto out0 = ConcatLowerLower(d, j, i);
5183 const auto out1 = ConcatLowerLower(d, l, k);
5184 StoreU(out0, d, unaligned + 0 * N);
5185 StoreU(out1, d, unaligned + 1 * N);
5186 const auto out2 = ConcatUpperUpper(d, j, i);
5187 const auto out3 = ConcatUpperUpper(d, l, k);
5188 StoreU(out2, d, unaligned + 2 * N);
5189 StoreU(out3, d, unaligned + 3 * N);
5190}
5191
5192} // namespace detail
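// Usage sketch (illustrative only; added for this documentation, not part of
// the upstream header, and the helper name is made up). Shown as application
// code: the public StoreInterleaved3 from generic_ops-inl.h calls the
// StoreTransposedBlocks3 overload above to re-interleave planar R/G/B.
HWY_INLINE void ExampleStoreRGB(Vec256<uint8_t> r, Vec256<uint8_t> g,
                                Vec256<uint8_t> b,
                                uint8_t* HWY_RESTRICT rgb) {
  const Full256<uint8_t> d;
  StoreInterleaved3(r, g, b, d, rgb);  // writes 3 * 32 = 96 bytes
}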
5193
5194// ------------------------------ Reductions
5195
5196namespace detail {
5197
5198// Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block.
5199// Same logic as x86/128.h, but with Vec256 arguments.
5200template <typename T>
5201HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
5202 const Vec256<T> v3210) {
5203 const auto v1032 = Shuffle1032(v3210);
5204 const auto v31_20_31_20 = v3210 + v1032;
5205 const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5206 return v20_31_20_31 + v31_20_31_20;
5207}
5208template <typename T>
5209HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
5210 const Vec256<T> v3210) {
5211 const auto v1032 = Shuffle1032(v3210);
5212 const auto v31_20_31_20 = Min(v3210, v1032);
5213 const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5214 return Min(v20_31_20_31, v31_20_31_20);
5215}
5216template <typename T>
5217HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5218 const Vec256<T> v3210) {
5219 const auto v1032 = Shuffle1032(v3210);
5220 const auto v31_20_31_20 = Max(v3210, v1032);
5221 const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
5222 return Max(v20_31_20_31, v31_20_31_20);
5223}
5224
5225template <typename T>
5226HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
5227 const Vec256<T> v10) {
5228 const auto v01 = Shuffle01(v10);
5229 return v10 + v01;
5230}
5231template <typename T>
5232HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
5233 const Vec256<T> v10) {
5234 const auto v01 = Shuffle01(v10);
5235 return Min(v10, v01);
5236}
5237template <typename T>
5238HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
5239 const Vec256<T> v10) {
5240 const auto v01 = Shuffle01(v10);
5241 return Max(v10, v01);
5242}
5243
5244// u16/i16
5245template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5246HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
5247 const Repartition<int32_t, Full256<T>> d32;
5248 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5249 const auto odd = ShiftRight<16>(BitCast(d32, v));
5250 const auto min = MinOfLanes(d32, Min(even, odd));
5251 // Also broadcast into odd lanes.
5252 return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
5253}
5254template <typename T, HWY_IF_LANE_SIZE(T, 2)>
5255HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
5256 const Repartition<int32_t, Full256<T>> d32;
5257 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5258 const auto odd = ShiftRight<16>(BitCast(d32, v));
5259 const auto max = MaxOfLanes(d32, Max(even, odd));
5260 // Also broadcast into odd lanes.
5261 return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
5262}
5263
5264} // namespace detail
5265
5266// Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
5267template <typename T>
5268HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
5269 const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5270 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
5271}
5272template <typename T>
5273HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
5274 const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5275 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
5276}
5277template <typename T>
5278HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
5279 const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
5280 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
5281}
5282
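// Usage sketch (illustrative only; added for this documentation, not part of
// the upstream header, and the helper name is made up): the reductions above
// broadcast their result to every lane, so GetLane extracts the scalar.
HWY_INLINE float ExampleHorizontalStats(Vec256<float> v, float& min,
                                        float& max) {
  const Full256<float> d;
  min = GetLane(MinOfLanes(d, v));
  max = GetLane(MaxOfLanes(d, v));
  return GetLane(SumOfLanes(d, v));  // sum of all 8 lanes
}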
5283// NOLINTNEXTLINE(google-readability-namespace-comments)
5284} // namespace HWY_NAMESPACE
5285} // namespace hwy
5286HWY_AFTER_NAMESPACE();