x86_512-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 512-bit AVX512 vectors and operations.
17// External include guard in highway.h - see comment there.
18
19// WARNING: most operations do not cross 128-bit block boundaries. In
20// particular, "Broadcast", pack and zip behavior may be surprising.
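// Editor's illustration (not part of the original header): "blockwise" means
// that, e.g., Broadcast<0> repeats lane 0 of each 128-bit block rather than
// lane 0 of the whole vector. Sketch, assuming highway.h is included and we
// are inside HWY_NAMESPACE:
//   const Full512<uint32_t> d;  // 16 lanes
//   alignas(64) uint32_t in[16] = {0, 1, 2, 3, 4, 5, 6, 7,
//                                  8, 9, 10, 11, 12, 13, 14, 15};
//   const auto b = Broadcast<0>(Load(d, in));
//   // b holds {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}, not all zeros.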
21
22// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
23#include "hwy/base.h"
24
25// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
26// https://github.com/google/highway/issues/710)
28#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
29HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
30HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
31#endif
32
33#include <immintrin.h> // AVX2+
34
35#if HWY_COMPILER_CLANGCL
36// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
37// including these headers when _MSC_VER is defined, like when using clang-cl.
38// Include these directly here.
39// clang-format off
40#include <smmintrin.h>
41
42#include <avxintrin.h>
43#include <avx2intrin.h>
44#include <f16cintrin.h>
45#include <fmaintrin.h>
46
47#include <avx512fintrin.h>
48#include <avx512vlintrin.h>
49#include <avx512bwintrin.h>
50#include <avx512dqintrin.h>
51#include <avx512vlbwintrin.h>
52#include <avx512vldqintrin.h>
53#include <avx512bitalgintrin.h>
54#include <avx512vlbitalgintrin.h>
55#include <avx512vpopcntdqintrin.h>
56#include <avx512vpopcntdqvlintrin.h>
57// clang-format on
58#endif // HWY_COMPILER_CLANGCL
59
60#include <stddef.h>
61#include <stdint.h>
62
63#if HWY_IS_MSAN
64#include <sanitizer/msan_interface.h>
65#endif
66
67// For half-width vectors. Already includes base.h and shared-inl.h.
68#include "hwy/ops/x86_256-inl.h"
69
71namespace hwy {
72namespace HWY_NAMESPACE {
73
74namespace detail {
75
76template <typename T>
77struct Raw512 {
78 using type = __m512i;
79};
80template <>
81struct Raw512<float> {
82 using type = __m512;
83};
84template <>
85struct Raw512<double> {
86 using type = __m512d;
87};
88
89// Template arg: sizeof(lane type)
90template <size_t size>
91struct RawMask512 {};
92template <>
93struct RawMask512<1> {
94 using type = __mmask64;
95};
96template <>
97struct RawMask512<2> {
98 using type = __mmask32;
99};
100template <>
101struct RawMask512<4> {
102 using type = __mmask16;
103};
104template <>
105struct RawMask512<8> {
106 using type = __mmask8;
107};
108
109} // namespace detail
110
111template <typename T>
112class Vec512 {
113 using Raw = typename detail::Raw512<T>::type;
114
115 public:
116 // Compound assignment. Only usable if there is a corresponding non-member
117 // binary operator overload. For example, only f32 and f64 support division.
118 HWY_INLINE Vec512& operator*=(const Vec512 other) {
119 return *this = (*this * other);
120 }
121 HWY_INLINE Vec512& operator/=(const Vec512 other) {
122 return *this = (*this / other);
123 }
124 HWY_INLINE Vec512& operator+=(const Vec512 other) {
125 return *this = (*this + other);
126 }
127 HWY_INLINE Vec512& operator-=(const Vec512 other) {
128 return *this = (*this - other);
129 }
130 HWY_INLINE Vec512& operator&=(const Vec512 other) {
131 return *this = (*this & other);
132 }
133 HWY_INLINE Vec512& operator|=(const Vec512 other) {
134 return *this = (*this | other);
135 }
136 HWY_INLINE Vec512& operator^=(const Vec512 other) {
137 return *this = (*this ^ other);
138 }
139
140 Raw raw;
141};
142
143// Mask register: one bit per lane.
144template <typename T>
145struct Mask512 {
146 typename detail::RawMask512<sizeof(T)>::type raw;
147};
148
149// ------------------------------ BitCast
150
151namespace detail {
152
153HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
154HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
155HWY_INLINE __m512i BitCastToInteger(__m512d v) {
156 return _mm512_castpd_si512(v);
157}
158
159template <typename T>
160HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
161 return Vec512<uint8_t>{BitCastToInteger(v.raw)};
162}
163
164// Cannot rely on function overloading because return types differ.
165template <typename T>
166struct BitCastFromInteger512 {
167 HWY_INLINE __m512i operator()(__m512i v) { return v; }
168};
169template <>
170struct BitCastFromInteger512<float> {
171 HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
172};
173template <>
174struct BitCastFromInteger512<double> {
175 HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
176};
177
178template <typename T>
179HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
180 return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
181}
182
183} // namespace detail
184
185template <typename T, typename FromT>
186HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
187 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
188}
189
190// ------------------------------ Set
191
192// Returns an all-zero vector.
192template <typename T>
193HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
195 return Vec512<T>{_mm512_setzero_si512()};
196}
197HWY_API Vec512<float> Zero(Full512<float> /* tag */) {
198 return Vec512<float>{_mm512_setzero_ps()};
199}
200HWY_API Vec512<double> Zero(Full512<double> /* tag */) {
201 return Vec512<double>{_mm512_setzero_pd()};
202}
203
204// Returns a vector with all lanes set to "t".
205HWY_API Vec512<uint8_t> Set(Full512<uint8_t> /* tag */, const uint8_t t) {
206 return Vec512<uint8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
207}
208HWY_API Vec512<uint16_t> Set(Full512<uint16_t> /* tag */, const uint16_t t) {
209 return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
210}
211HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
212 return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
213}
214HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
215 return Vec512<uint64_t>{
216 _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
217}
218HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
219 return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
220}
221HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
222 return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
223}
224HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
225 return Vec512<int32_t>{_mm512_set1_epi32(t)};
226}
227HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
228 return Vec512<int64_t>{
229 _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
230}
231HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
232 return Vec512<float>{_mm512_set1_ps(t)};
233}
234HWY_API Vec512<double> Set(Full512<double> /* tag */, const double t) {
235 return Vec512<double>{_mm512_set1_pd(t)};
236}
237
238HWY_DIAGNOSTICS(push)
239HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
240
241// Returns a vector with uninitialized elements.
242template <typename T>
243HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
244 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
245 // generate an XOR instruction.
246 return Vec512<T>{_mm512_undefined_epi32()};
247}
248HWY_API Vec512<float> Undefined(Full512<float> /* tag */) {
249 return Vec512<float>{_mm512_undefined_ps()};
250}
251HWY_API Vec512<double> Undefined(Full512<double> /* tag */) {
252 return Vec512<double>{_mm512_undefined_pd()};
253}
254
255HWY_DIAGNOSTICS(pop)
256
257// ================================================== LOGICAL
258
259// ------------------------------ Not
260
261template <typename T>
262HWY_API Vec512<T> Not(const Vec512<T> v) {
263 using TU = MakeUnsigned<T>;
264 const __m512i vu = BitCast(Full512<TU>(), v).raw;
265 return BitCast(Full512<T>(),
266 Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
267}
268
269// ------------------------------ And
270
271template <typename T>
272HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
273 return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
274}
275
276HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
277 return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
278}
279HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) {
280 return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
281}
282
283// ------------------------------ AndNot
284
285// Returns ~not_mask & mask.
286template <typename T>
287HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
288 return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
289}
290HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
291 const Vec512<float> mask) {
292 return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
293}
294HWY_API Vec512<double> AndNot(const Vec512<double> not_mask,
295 const Vec512<double> mask) {
296 return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
297}
298
299// ------------------------------ Or
300
301template <typename T>
302HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
303 return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
304}
305
306HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
307 return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
308}
309HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) {
310 return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
311}
312
313// ------------------------------ Xor
314
315template <typename T>
316HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
317 return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
318}
319
320HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
321 return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
322}
323HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
324 return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
325}
326
327// ------------------------------ Or3
328
329template <typename T>
330HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
331 const Full512<T> d;
332 const RebindToUnsigned<decltype(d)> du;
333 using VU = VFromD<decltype(du)>;
334 const __m512i ret = _mm512_ternarylogic_epi64(
335 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
336 return BitCast(d, VU{ret});
337}
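// Editor's note (not from the original source): 0xFE is the ternary-logic
// truth table for o1 | o2 | o3 read as an 8-bit immediate; only the row where
// all three inputs are 0 yields 0, hence 0b11111110. Likewise, 0xF8 below
// encodes o | (a1 & a2).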
338
339// ------------------------------ OrAnd
340
341template <typename T>
342HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
343 const Full512<T> d;
344 const RebindToUnsigned<decltype(d)> du;
345 using VU = VFromD<decltype(du)>;
346 const __m512i ret = _mm512_ternarylogic_epi64(
347 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
348 return BitCast(d, VU{ret});
349}
350
351// ------------------------------ IfVecThenElse
352
353template <typename T>
354HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
355 const Full512<T> d;
356 const RebindToUnsigned<decltype(d)> du;
357 using VU = VFromD<decltype(du)>;
358 return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
359 BitCast(du, yes).raw,
360 BitCast(du, no).raw, 0xCA)});
361}
362
363// ------------------------------ Operator overloads (internal-only if float)
364
365template <typename T>
366HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
367 return And(a, b);
368}
369
370template <typename T>
371HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
372 return Or(a, b);
373}
374
375template <typename T>
376HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
377 return Xor(a, b);
378}
379
380// ------------------------------ PopulationCount
381
382// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
383#if HWY_TARGET == HWY_AVX3_DL
384
385#ifdef HWY_NATIVE_POPCNT
386#undef HWY_NATIVE_POPCNT
387#else
388#define HWY_NATIVE_POPCNT
389#endif
390
391namespace detail {
392
393template <typename T>
394HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
395 return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
396}
397template <typename T>
398HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
399 return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
400}
401template <typename T>
402HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
403 return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
404}
405template <typename T>
406HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
407 return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
408}
409
410} // namespace detail
411
412template <typename T>
413HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
414 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
415}
416
417#endif // HWY_TARGET == HWY_AVX3_DL
418
419// ================================================== SIGN
420
421// ------------------------------ CopySign
422
423template <typename T>
424HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
425 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
426
427 const Full512<T> d;
428 const auto msb = SignBit(d);
429
430 const Rebind<MakeUnsigned<T>, decltype(d)> du;
431 // Truth table for msb, magn, sign | bitwise msb ? sign : mag
432 // 0 0 0 | 0
433 // 0 0 1 | 0
434 // 0 1 0 | 1
435 // 0 1 1 | 1
436 // 1 0 0 | 0
437 // 1 0 1 | 1
438 // 1 1 0 | 0
439 // 1 1 1 | 1
440 // The lane size does not matter because we are not using predication.
441 const __m512i out = _mm512_ternarylogic_epi32(
442 BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
443 return BitCast(d, decltype(Zero(du)){out});
444}
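// Editor's note (not from the original source): the 0xAC immediate is the
// output column of the truth table above, read with row index
// i = msb*4 + magn*2 + sign selecting bit i of the immediate. Rows 7..0 give
// the bits 1,0,1,0,1,1,0,0, i.e. 0b10101100 = 0xAC.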
445
446template <typename T>
447HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) {
448 // AVX3 can also handle abs < 0, so no extra action needed.
449 return CopySign(abs, sign);
450}
451
452// ================================================== MASK
453
454// ------------------------------ FirstN
455
456// Possibilities for constructing a bitmask of N ones:
457// - kshift* only consider the lowest byte of the shift count, so they would
458// not correctly handle large n.
459// - Scalar shifts >= 64 are UB.
460// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
461// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
462
463#if HWY_ARCH_X86_32
464namespace detail {
465
466// 32 bit mask is sufficient for lane size >= 2.
467template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
468HWY_INLINE Mask512<T> FirstN(size_t n) {
469 Mask512<T> m;
470 const uint32_t all = ~uint32_t(0);
471 // BZHI only looks at the lower 8 bits of n!
472 m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
473 return m;
474}
475
476template <typename T, HWY_IF_LANE_SIZE(T, 1)>
477HWY_INLINE Mask512<T> FirstN(size_t n) {
478 const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
479 return Mask512<T>{static_cast<__mmask64>(bits)};
480}
481
482} // namespace detail
483#endif // HWY_ARCH_X86_32
484
485template <typename T>
486HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
487#if HWY_ARCH_X86_64
488 Mask512<T> m;
489 const uint64_t all = ~uint64_t(0);
490 // BZHI only looks at the lower 8 bits of n!
491 m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
492 return m;
493#else
494 return detail::FirstN<T>(n);
495#endif // HWY_ARCH_X86_64
496}
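// Usage sketch (editor's addition, not from the original source): FirstN is
// typically used for loop remainders. Assuming highway.h is included and we
// are inside HWY_NAMESPACE, with count - i < Lanes(d) on the last iteration:
//   const Full512<float> d;
//   const Mask512<float> m = FirstN(d, count - i);
//   const Vec512<float> v = MaskedLoad(m, d, ptr + i);  // inactive lanes = 0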
497
498// ------------------------------ IfThenElse
499
500// Returns mask ? yes : no.
501
502namespace detail {
503
504// Templates for signed/unsigned integer of a particular size.
505template <typename T>
507 const Mask512<T> mask, const Vec512<T> yes,
508 const Vec512<T> no) {
509 return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
510}
511template <typename T>
513 const Mask512<T> mask, const Vec512<T> yes,
514 const Vec512<T> no) {
515 return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
516}
517template <typename T>
519 const Mask512<T> mask, const Vec512<T> yes,
520 const Vec512<T> no) {
521 return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
522}
523template <typename T>
525 const Mask512<T> mask, const Vec512<T> yes,
526 const Vec512<T> no) {
527 return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
528}
529
530} // namespace detail
531
532template <typename T>
534 const Vec512<T> no) {
535 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
536}
538 const Vec512<float> yes,
539 const Vec512<float> no) {
540 return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)};
541}
543 const Vec512<double> yes,
544 const Vec512<double> no) {
545 return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)};
546}
547
548namespace detail {
549
550template <typename T>
552 const Mask512<T> mask,
553 const Vec512<T> yes) {
554 return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
555}
556template <typename T>
558 const Mask512<T> mask,
559 const Vec512<T> yes) {
560 return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
561}
562template <typename T>
564 const Mask512<T> mask,
565 const Vec512<T> yes) {
566 return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
567}
568template <typename T>
570 const Mask512<T> mask,
571 const Vec512<T> yes) {
572 return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
573}
574
575} // namespace detail
576
577template <typename T>
579 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
580}
582 const Vec512<float> yes) {
583 return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
584}
586 const Vec512<double> yes) {
587 return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
588}
589
590namespace detail {
591
592template <typename T>
594 const Mask512<T> mask, const Vec512<T> no) {
595 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
596 return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
597}
598template <typename T>
600 const Mask512<T> mask, const Vec512<T> no) {
601 return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
602}
603template <typename T>
605 const Mask512<T> mask, const Vec512<T> no) {
606 return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
607}
608template <typename T>
610 const Mask512<T> mask, const Vec512<T> no) {
611 return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
612}
613
614} // namespace detail
615
616template <typename T>
618 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
619}
621 const Vec512<float> no) {
622 return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
623}
625 const Vec512<double> no) {
626 return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
627}
628
629template <typename T>
631 static_assert(IsSigned<T>(), "Only works for signed/float");
632 // AVX3 MaskFromVec only looks at the MSB
633 return IfThenElse(MaskFromVec(v), yes, no);
634}
635
636template <typename T, HWY_IF_FLOAT(T)>
638 // AVX3 MaskFromVec only looks at the MSB
639 return IfThenZeroElse(MaskFromVec(v), v);
640}
641
642// ================================================== ARITHMETIC
643
644// ------------------------------ Addition
645
646// Unsigned
648 const Vec512<uint8_t> b) {
649 return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
650}
652 const Vec512<uint16_t> b) {
653 return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
654}
656 const Vec512<uint32_t> b) {
657 return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
658}
660 const Vec512<uint64_t> b) {
661 return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
662}
663
664// Signed
666 const Vec512<int8_t> b) {
667 return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
668}
670 const Vec512<int16_t> b) {
671 return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
672}
674 const Vec512<int32_t> b) {
675 return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
676}
678 const Vec512<int64_t> b) {
679 return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
680}
681
682// Float
684 return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
685}
687 const Vec512<double> b) {
688 return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
689}
690
691// ------------------------------ Subtraction
692
693// Unsigned
695 const Vec512<uint8_t> b) {
696 return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
697}
699 const Vec512<uint16_t> b) {
700 return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
701}
703 const Vec512<uint32_t> b) {
704 return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
705}
707 const Vec512<uint64_t> b) {
708 return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
709}
710
711// Signed
713 const Vec512<int8_t> b) {
714 return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
715}
717 const Vec512<int16_t> b) {
718 return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
719}
721 const Vec512<int32_t> b) {
722 return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
723}
725 const Vec512<int64_t> b) {
726 return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
727}
728
729// Float
731 return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
732}
734 const Vec512<double> b) {
735 return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
736}
737
738// ------------------------------ SumsOf8
739HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
740 return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
741}
742
743// ------------------------------ SaturatedAdd
744
745// Returns a + b clamped to the destination range.
746
747// Unsigned
749 const Vec512<uint8_t> b) {
750 return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
751}
753 const Vec512<uint16_t> b) {
754 return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
755}
756
757// Signed
759 const Vec512<int8_t> b) {
760 return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
761}
763 const Vec512<int16_t> b) {
764 return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
765}
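// Editor's example (not from the original source): for uint8_t lanes with
// Full512<uint8_t> d, SaturatedAdd(Set(d, uint8_t{200}), Set(d, uint8_t{100}))
// yields 255 in every lane instead of wrapping to 44; for int8_t lanes,
// 100 + 100 saturates to 127.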
766
767// ------------------------------ SaturatedSub
768
769// Returns a - b clamped to the destination range.
770
771// Unsigned
773 const Vec512<uint8_t> b) {
774 return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
775}
777 const Vec512<uint16_t> b) {
778 return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
779}
780
781// Signed
783 const Vec512<int8_t> b) {
784 return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
785}
787 const Vec512<int16_t> b) {
788 return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
789}
790
791// ------------------------------ Average
792
793// Returns (a + b + 1) / 2
794
795// Unsigned
797 const Vec512<uint8_t> b) {
798 return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
799}
801 const Vec512<uint16_t> b) {
802 return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
803}
804
805// ------------------------------ Abs (Sub)
806
807// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
808HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
809#if HWY_COMPILER_MSVC
810 // Workaround for incorrect codegen? (untested due to internal compiler error)
811 const auto zero = Zero(Full512<int8_t>());
812 return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
813#else
814 return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
815#endif
816}
818 return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
819}
821 return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
822}
824 return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
825}
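// Editor's note (not from the original source): "LimitsMin() maps to
// LimitsMax() + 1" means Abs of an int8_t lane holding -128 still yields -128,
// because +128 is not representable; the same caveat applies to the wider
// signed types.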
826
827// These are not native instructions; they also involve an AND with a constant.
829 return Vec512<float>{_mm512_abs_ps(v.raw)};
830}
832 return Vec512<double>{_mm512_abs_pd(v.raw)};
833}
834// ------------------------------ ShiftLeft
835
836template <int kBits>
838 return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
839}
840
841template <int kBits>
843 return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
844}
845
846template <int kBits>
848 return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
849}
850
851template <int kBits>
853 return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
854}
855
856template <int kBits>
858 return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
859}
860
861template <int kBits>
863 return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
864}
865
866template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
868 const Full512<T> d8;
869 const RepartitionToWide<decltype(d8)> d16;
870 const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
871 return kBits == 1
872 ? (v + v)
873 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
874}
875
876// ------------------------------ ShiftRight
877
878template <int kBits>
880 return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
881}
882
883template <int kBits>
885 return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
886}
887
888template <int kBits>
890 return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
891}
892
893template <int kBits>
895 const Full512<uint8_t> d8;
896 // Use raw instead of BitCast to support N=1.
897 const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
898 return shifted & Set(d8, 0xFF >> kBits);
899}
900
901template <int kBits>
903 return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
904}
905
906template <int kBits>
908 return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
909}
910
911template <int kBits>
913 return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
914}
915
916template <int kBits>
918 const Full512<int8_t> di;
919 const Full512<uint8_t> du;
920 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
921 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
922 return (shifted ^ shifted_sign) - shifted_sign;
923}
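// Editor's worked example (not from the original source): for kBits = 1 and an
// int8_t lane holding -6 (0xFA), the unsigned shift yields 0x7D (125) and
// shifted_sign is 0x40 (64); then (125 ^ 64) - 64 = 61 - 64 = -3, matching the
// arithmetic shift -6 >> 1.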
924
925// ------------------------------ RotateRight
926
927template <int kBits>
929 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
930 return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)};
931}
932
933template <int kBits>
935 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
936 return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
937}
938
939// ------------------------------ ShiftLeftSame
940
942 const int bits) {
943 return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
944}
946 const int bits) {
947 return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
948}
950 const int bits) {
951 return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
952}
953
955 return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
956}
957
959 return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
960}
961
963 return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
964}
965
966template <typename T, HWY_IF_LANE_SIZE(T, 1)>
967HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
968 const Full512<T> d8;
969 const RepartitionToWide<decltype(d8)> d16;
970 const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
971 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
972}
973
974// ------------------------------ ShiftRightSame
975
977 const int bits) {
978 return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
979}
981 const int bits) {
982 return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
983}
985 const int bits) {
986 return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
987}
988
990 const Full512<uint8_t> d8;
991 const RepartitionToWide<decltype(d8)> d16;
992 const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
993 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
994}
995
997 const int bits) {
998 return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
999}
1000
1002 const int bits) {
1003 return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
1004}
1006 const int bits) {
1007 return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
1008}
1009
1011 const Full512<int8_t> di;
1012 const Full512<uint8_t> du;
1013 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
1014 const auto shifted_sign =
1015 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
1016 return (shifted ^ shifted_sign) - shifted_sign;
1017}
1018
1019// ------------------------------ Shl
1020
1022 const Vec512<uint16_t> bits) {
1023 return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
1024}
1025
1027 const Vec512<uint32_t> bits) {
1028 return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
1029}
1030
1032 const Vec512<uint64_t> bits) {
1033 return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
1034}
1035
1036// Signed left shift is the same as unsigned.
1037template <typename T, HWY_IF_SIGNED(T)>
1039 const Full512<T> di;
1040 const Full512<MakeUnsigned<T>> du;
1041 return BitCast(di, BitCast(du, v) << BitCast(du, bits));
1042}
1043
1044// ------------------------------ Shr
1045
1047 const Vec512<uint16_t> bits) {
1048 return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
1049}
1050
1052 const Vec512<uint32_t> bits) {
1053 return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
1054}
1055
1057 const Vec512<uint64_t> bits) {
1058 return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
1059}
1060
1062 const Vec512<int16_t> bits) {
1063 return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
1064}
1065
1067 const Vec512<int32_t> bits) {
1068 return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
1069}
1070
1072 const Vec512<int64_t> bits) {
1073 return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
1074}
1075
1076// ------------------------------ Minimum
1077
1078// Unsigned
1080 return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
1081}
1083 const Vec512<uint16_t> b) {
1084 return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
1085}
1087 const Vec512<uint32_t> b) {
1088 return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
1089}
1091 const Vec512<uint64_t> b) {
1092 return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
1093}
1094
1095// Signed
1097 return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
1098}
1100 return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
1101}
1103 return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
1104}
1106 return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
1107}
1108
1109// Float
1111 return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
1112}
1114 return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
1115}
1116
1117// ------------------------------ Maximum
1118
1119// Unsigned
1121 return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
1122}
1124 const Vec512<uint16_t> b) {
1125 return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
1126}
1128 const Vec512<uint32_t> b) {
1129 return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
1130}
1132 const Vec512<uint64_t> b) {
1133 return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
1134}
1135
1136// Signed
1138 return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
1139}
1141 return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
1142}
1144 return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
1145}
1147 return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
1148}
1149
1150// Float
1152 return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
1153}
1155 return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1156}
1157
1158// ------------------------------ Integer multiplication
1159
1160// Unsigned
1162 return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1163}
1165 return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1166}
1167
1168// Signed
1170 return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1171}
1173 return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1174}
1175
1176// Returns the upper 16 bits of a * b in each lane.
1178 return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
1179}
1181 return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
1182}
1183
1185 return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)};
1186}
1187
1188// Multiplies even lanes (0, 2 ..) and places the double-wide result into
1189// even and the upper half into its odd neighbor lane.
1191 return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
1192}
1194 return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
1195}
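// Editor's illustration (not part of the original header): with uint32_t
// inputs a = {2, _, 3, _, ...} and b = {10, _, 7, _, ...}, MulEven(a, b)
// viewed as uint64_t lanes is {20, 21, ...}; each double-wide product occupies
// the even source lane and its odd neighbor.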
1196
1197// ------------------------------ Neg (Sub)
1198
1199template <typename T, HWY_IF_FLOAT(T)>
1201 return Xor(v, SignBit(Full512<T>()));
1202}
1203
1204template <typename T, HWY_IF_NOT_FLOAT(T)>
1205HWY_API Vec512<T> Neg(const Vec512<T> v) {
1206 return Zero(Full512<T>()) - v;
1207}
1208
1209// ------------------------------ Floating-point mul / div
1210
1212 return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
1213}
1215 const Vec512<double> b) {
1216 return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1217}
1218
1220 return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
1221}
1223 const Vec512<double> b) {
1224 return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
1225}
1226
1227// Approximate reciprocal
1229 return Vec512<float>{_mm512_rcp14_ps(v.raw)};
1230}
1231
1232// Absolute value of difference.
1234 return Abs(a - b);
1235}
1236
1237// ------------------------------ Floating-point multiply-add variants
1238
1239// Returns mul * x + add
1241 const Vec512<float> add) {
1242 return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
1243}
1245 const Vec512<double> add) {
1246 return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
1247}
1248
1249// Returns add - mul * x
1251 const Vec512<float> add) {
1252 return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
1253}
1255 const Vec512<double> x,
1256 const Vec512<double> add) {
1257 return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
1258}
1259
1260// Returns mul * x - sub
1262 const Vec512<float> sub) {
1263 return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
1264}
1266 const Vec512<double> sub) {
1267 return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
1268}
1269
1270// Returns -mul * x - sub
1272 const Vec512<float> sub) {
1273 return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1274}
1276 const Vec512<double> x,
1277 const Vec512<double> sub) {
1278 return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1279}
1280
1281// ------------------------------ Floating-point square root
1282
1283// Full precision square root
1285 return Vec512<float>{_mm512_sqrt_ps(v.raw)};
1286}
1288 return Vec512<double>{_mm512_sqrt_pd(v.raw)};
1289}
1290
1291// Approximate reciprocal square root
1293 return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
1294}
1295
1296// ------------------------------ Floating-point rounding
1297
1298// Work around warnings in the intrinsic definitions (passing -1 as a mask).
1299HWY_DIAGNOSTICS(push)
1300HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1301
1302// Toward nearest integer, tie to even
1303HWY_API Vec512<float> Round(const Vec512<float> v) {
1304 return Vec512<float>{_mm512_roundscale_ps(
1305 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1306}
1308 return Vec512<double>{_mm512_roundscale_pd(
1309 v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1310}
1311
1312// Toward zero, aka truncate
1314 return Vec512<float>{
1315 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1316}
1318 return Vec512<double>{
1319 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1320}
1321
1322// Toward +infinity, aka ceiling
1324 return Vec512<float>{
1325 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1326}
1328 return Vec512<double>{
1329 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1330}
1331
1332// Toward -infinity, aka floor
1334 return Vec512<float>{
1335 _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1336}
1338 return Vec512<double>{
1339 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1340}
1341
1342HWY_DIAGNOSTICS(pop)
1343
1344// ================================================== COMPARE
1345
1346// Comparisons set a mask bit to 1 if the condition is true, else 0.
1347
1348template <typename TFrom, typename TTo>
1350 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1351 return Mask512<TTo>{m.raw};
1352}
1353
1354namespace detail {
1355
1356template <typename T>
1358 const Vec512<T> bit) {
1359 return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
1360}
1361template <typename T>
1363 const Vec512<T> bit) {
1364 return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
1365}
1366template <typename T>
1368 const Vec512<T> bit) {
1369 return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
1370}
1371template <typename T>
1373 const Vec512<T> bit) {
1374 return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
1375}
1376
1377} // namespace detail
1378
1379template <typename T>
1381 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1382 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1383}
1384
1385// ------------------------------ Equality
1386
1387template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1389 return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
1390}
1391template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1392HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1393 return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
1394}
1395template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1396HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1397 return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
1398}
1399template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1400HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1401 return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
1402}
1403
1405 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1406}
1407
1409 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1410}
1411
1412// ------------------------------ Inequality
1413
1414template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1416 return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
1417}
1418template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1419HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1420 return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
1421}
1422template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1423HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1424 return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
1425}
1426template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1427HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1428 return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
1429}
1430
1432 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1433}
1434
1436 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1437}
1438
1439// ------------------------------ Strict inequality
1440
1442 return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)};
1443}
1445 return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)};
1446}
1448 return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)};
1449}
1451 return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)};
1452}
1453
1455 return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
1456}
1458 return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
1459}
1461 return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
1462}
1464 return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
1465}
1466
1468 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1469}
1471 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1472}
1473
1474// ------------------------------ Weak inequality
1475
1477 return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1478}
1480 return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1481}
1482
1483// ------------------------------ Reversed comparisons
1484
1485template <typename T>
1487 return b > a;
1488}
1489
1490template <typename T>
1492 return b >= a;
1493}
1494
1495// ------------------------------ Mask
1496
1497namespace detail {
1498
1499template <typename T>
1501 return Mask512<T>{_mm512_movepi8_mask(v.raw)};
1502}
1503template <typename T>
1505 return Mask512<T>{_mm512_movepi16_mask(v.raw)};
1506}
1507template <typename T>
1509 return Mask512<T>{_mm512_movepi32_mask(v.raw)};
1510}
1511template <typename T>
1513 return Mask512<T>{_mm512_movepi64_mask(v.raw)};
1514}
1515
1516} // namespace detail
1517
1518template <typename T>
1520 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1521}
1522// There do not seem to be native floating-point versions of these instructions.
1523HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) {
1524 return Mask512<float>{MaskFromVec(BitCast(Full512<int32_t>(), v)).raw};
1525}
1526HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) {
1527 return Mask512<double>{MaskFromVec(BitCast(Full512<int64_t>(), v)).raw};
1528}
1529
1531 return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
1532}
1534 return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
1535}
1536
1538 return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
1539}
1541 return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
1542}
1543
1545 return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
1546}
1548 return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
1549}
1551 return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
1552}
1553
1555 return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
1556}
1558 return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
1559}
1561 return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
1562}
1563
1564template <typename T>
1566 return VecFromMask(v);
1567}
1568
1569// ------------------------------ Mask logical
1570
1571namespace detail {
1572
1573template <typename T>
1575#if HWY_COMPILER_HAS_MASK_INTRINSICS
1576 return Mask512<T>{_knot_mask64(m.raw)};
1577#else
1578 return Mask512<T>{~m.raw};
1579#endif
1580}
1581template <typename T>
1583#if HWY_COMPILER_HAS_MASK_INTRINSICS
1584 return Mask512<T>{_knot_mask32(m.raw)};
1585#else
1586 return Mask512<T>{~m.raw};
1587#endif
1588}
1589template <typename T>
1591#if HWY_COMPILER_HAS_MASK_INTRINSICS
1592 return Mask512<T>{_knot_mask16(m.raw)};
1593#else
1594 return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
1595#endif
1596}
1597template <typename T>
1599#if HWY_COMPILER_HAS_MASK_INTRINSICS
1600 return Mask512<T>{_knot_mask8(m.raw)};
1601#else
1602 return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
1603#endif
1604}
1605
1606template <typename T>
1608 const Mask512<T> b) {
1609#if HWY_COMPILER_HAS_MASK_INTRINSICS
1610 return Mask512<T>{_kand_mask64(a.raw, b.raw)};
1611#else
1612 return Mask512<T>{a.raw & b.raw};
1613#endif
1614}
1615template <typename T>
1617 const Mask512<T> b) {
1618#if HWY_COMPILER_HAS_MASK_INTRINSICS
1619 return Mask512<T>{_kand_mask32(a.raw, b.raw)};
1620#else
1621 return Mask512<T>{a.raw & b.raw};
1622#endif
1623}
1624template <typename T>
1626 const Mask512<T> b) {
1627#if HWY_COMPILER_HAS_MASK_INTRINSICS
1628 return Mask512<T>{_kand_mask16(a.raw, b.raw)};
1629#else
1630 return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
1631#endif
1632}
1633template <typename T>
1635 const Mask512<T> b) {
1636#if HWY_COMPILER_HAS_MASK_INTRINSICS
1637 return Mask512<T>{_kand_mask8(a.raw, b.raw)};
1638#else
1639 return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
1640#endif
1641}
1642
1643template <typename T>
1645 const Mask512<T> b) {
1646#if HWY_COMPILER_HAS_MASK_INTRINSICS
1647 return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
1648#else
1649 return Mask512<T>{~a.raw & b.raw};
1650#endif
1651}
1652template <typename T>
1654 const Mask512<T> b) {
1655#if HWY_COMPILER_HAS_MASK_INTRINSICS
1656 return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
1657#else
1658 return Mask512<T>{~a.raw & b.raw};
1659#endif
1660}
1661template <typename T>
1663 const Mask512<T> b) {
1664#if HWY_COMPILER_HAS_MASK_INTRINSICS
1665 return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
1666#else
1667 return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
1668#endif
1669}
1670template <typename T>
1672 const Mask512<T> b) {
1673#if HWY_COMPILER_HAS_MASK_INTRINSICS
1674 return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
1675#else
1676 return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
1677#endif
1678}
1679
1680template <typename T>
1682 const Mask512<T> b) {
1683#if HWY_COMPILER_HAS_MASK_INTRINSICS
1684 return Mask512<T>{_kor_mask64(a.raw, b.raw)};
1685#else
1686 return Mask512<T>{a.raw | b.raw};
1687#endif
1688}
1689template <typename T>
1691 const Mask512<T> b) {
1692#if HWY_COMPILER_HAS_MASK_INTRINSICS
1693 return Mask512<T>{_kor_mask32(a.raw, b.raw)};
1694#else
1695 return Mask512<T>{a.raw | b.raw};
1696#endif
1697}
1698template <typename T>
1700 const Mask512<T> b) {
1701#if HWY_COMPILER_HAS_MASK_INTRINSICS
1702 return Mask512<T>{_kor_mask16(a.raw, b.raw)};
1703#else
1704 return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
1705#endif
1706}
1707template <typename T>
1709 const Mask512<T> b) {
1710#if HWY_COMPILER_HAS_MASK_INTRINSICS
1711 return Mask512<T>{_kor_mask8(a.raw, b.raw)};
1712#else
1713 return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
1714#endif
1715}
1716
1717template <typename T>
1719 const Mask512<T> b) {
1720#if HWY_COMPILER_HAS_MASK_INTRINSICS
1721 return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
1722#else
1723 return Mask512<T>{a.raw ^ b.raw};
1724#endif
1725}
1726template <typename T>
1728 const Mask512<T> b) {
1729#if HWY_COMPILER_HAS_MASK_INTRINSICS
1730 return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
1731#else
1732 return Mask512<T>{a.raw ^ b.raw};
1733#endif
1734}
1735template <typename T>
1737 const Mask512<T> b) {
1738#if HWY_COMPILER_HAS_MASK_INTRINSICS
1739 return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
1740#else
1741 return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
1742#endif
1743}
1744template <typename T>
1746 const Mask512<T> b) {
1747#if HWY_COMPILER_HAS_MASK_INTRINSICS
1748 return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
1749#else
1750 return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
1751#endif
1752}
1753
1754} // namespace detail
1755
1756template <typename T>
1758 return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1759}
1760
1761template <typename T>
1763 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1764}
1765
1766template <typename T>
1768 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1769}
1770
1771template <typename T>
1773 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1774}
1775
1776template <typename T>
1778 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1779}
1780
1781// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1782
1784 return VecFromMask(v < Zero(Full512<int8_t>()));
1785}
1786
1788 return ShiftRight<15>(v);
1789}
1790
1792 return ShiftRight<31>(v);
1793}
1794
1796 return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
1797}
1798
1799// ------------------------------ Floating-point classification (Not)
1800
1802 return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x81)};
1803}
1805 return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x81)};
1806}
1807
1809 return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x18)};
1810}
1812 return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x18)};
1813}
1814
1815// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
1816// positive, so we have to check for inf/NaN and negate.
1818 return Not(Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x99)});
1819}
1821 return Not(Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x99)});
1822}
1823
1824// ================================================== MEMORY
1825
1826// ------------------------------ Load
1827
1828template <typename T>
1829HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
1830 return Vec512<T>{_mm512_load_si512(aligned)};
1831}
1833 const float* HWY_RESTRICT aligned) {
1834 return Vec512<float>{_mm512_load_ps(aligned)};
1835}
1837 const double* HWY_RESTRICT aligned) {
1838 return Vec512<double>{_mm512_load_pd(aligned)};
1839}
1840
1841template <typename T>
1843 return Vec512<T>{_mm512_loadu_si512(p)};
1844}
1846 const float* HWY_RESTRICT p) {
1847 return Vec512<float>{_mm512_loadu_ps(p)};
1848}
1850 const double* HWY_RESTRICT p) {
1851 return Vec512<double>{_mm512_loadu_pd(p)};
1852}
1853
1854// ------------------------------ MaskedLoad
1855
1856template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1858 const T* HWY_RESTRICT p) {
1859 return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)};
1860}
1861
1862template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1863HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1864 const T* HWY_RESTRICT p) {
1865 return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
1866}
1867
1868template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1869HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1870 const T* HWY_RESTRICT p) {
1871 return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
1872}
1873
1874template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1875HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1876 const T* HWY_RESTRICT p) {
1877 return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
1878}
1879
1881 const float* HWY_RESTRICT p) {
1882 return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
1883}
1884
1886 const double* HWY_RESTRICT p) {
1887 return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
1888}
1889
1890// ------------------------------ LoadDup128
1891
1892// Loads 128 bits and duplicates them into all four 128-bit blocks. This
1893// avoids the 3-cycle cost of moving data between 128-bit halves and port 5.
1894template <typename T>
1896 const T* const HWY_RESTRICT p) {
1897 const auto x4 = LoadU(Full128<T>(), p);
1898 return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
1899}
1901 const float* const HWY_RESTRICT p) {
1902 const __m128 x4 = _mm_loadu_ps(p);
1903 return Vec512<float>{_mm512_broadcast_f32x4(x4)};
1904}
1905
1907 const double* const HWY_RESTRICT p) {
1908 const __m128d x2 = _mm_loadu_pd(p);
1909 return Vec512<double>{_mm512_broadcast_f64x2(x2)};
1910}
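// Usage sketch (editor's addition, not from the original source): LoadDup128
// suits per-block lookup tables, e.g. for TableLookupBytes. Assuming highway.h
// is included and we are inside HWY_NAMESPACE:
//   const Full512<uint8_t> d;
//   alignas(16) static constexpr uint8_t kTbl[16] = {0, 2, 4, 6, 8, 10, 12, 14,
//                                                    1, 3, 5, 7, 9, 11, 13, 15};
//   const auto tbl = LoadDup128(d, kTbl);  // same 16 bytes in every block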
1911
1912// ------------------------------ Store
1913
1914template <typename T>
1915HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
1916 T* HWY_RESTRICT aligned) {
1917 _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1918}
1920 float* HWY_RESTRICT aligned) {
1921 _mm512_store_ps(aligned, v.raw);
1922}
1924 double* HWY_RESTRICT aligned) {
1925 _mm512_store_pd(aligned, v.raw);
1926}
1927
1928template <typename T>
1929HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
1930 T* HWY_RESTRICT p) {
1931 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
1932}
1934 float* HWY_RESTRICT p) {
1935 _mm512_storeu_ps(p, v.raw);
1936}
1938 double* HWY_RESTRICT p) {
1939 _mm512_storeu_pd(p, v.raw);
1940}
1941
1942// ------------------------------ BlendedStore
1943
1944template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1946 T* HWY_RESTRICT p) {
1947 _mm512_mask_storeu_epi8(p, m.raw, v.raw);
1948}
1949
1950template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1951HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1952 T* HWY_RESTRICT p) {
1953 _mm512_mask_storeu_epi16(p, m.raw, v.raw);
1954}
1955
1956template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1957HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1958 T* HWY_RESTRICT p) {
1959 _mm512_mask_storeu_epi32(p, m.raw, v.raw);
1960}
1961
1962template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1963HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1964 T* HWY_RESTRICT p) {
1965 _mm512_mask_storeu_epi64(p, m.raw, v.raw);
1966}
1967
1969 Full512<float> /* tag */, float* HWY_RESTRICT p) {
1970 _mm512_mask_storeu_ps(p, m.raw, v.raw);
1971}
1972
1974 Full512<double> /* tag */, double* HWY_RESTRICT p) {
1975 _mm512_mask_storeu_pd(p, m.raw, v.raw);
1976}
1977
1978// ------------------------------ Non-temporal stores
1979
1980template <typename T>
1981HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
1982 T* HWY_RESTRICT aligned) {
1983 _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1984}
1986 float* HWY_RESTRICT aligned) {
1987 _mm512_stream_ps(aligned, v.raw);
1988}
1990 double* HWY_RESTRICT aligned) {
1991 _mm512_stream_pd(aligned, v.raw);
1992}
1993
1994// ------------------------------ Scatter
1995
1996// Work around warnings in the intrinsic definitions (passing -1 as a mask).
1997HWY_DIAGNOSTICS(push)
1998HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1999
2000namespace detail {
2001
2002template <typename T>
2004 Full512<T> /* tag */, T* HWY_RESTRICT base,
2005 const Vec512<int32_t> offset) {
2006 _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
2007}
2008template <typename T>
2010 Full512<T> /* tag */, T* HWY_RESTRICT base,
2011 const Vec512<int32_t> index) {
2012 _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
2013}
2014
2015template <typename T>
2017 Full512<T> /* tag */, T* HWY_RESTRICT base,
2018 const Vec512<int64_t> offset) {
2019 _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
2020}
2021template <typename T>
2023 Full512<T> /* tag */, T* HWY_RESTRICT base,
2024 const Vec512<int64_t> index) {
2025 _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
2026}
2027
2028} // namespace detail
2029
2030template <typename T, typename Offset>
2032 const Vec512<Offset> offset) {
2033 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2034 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2035}
2036template <typename T, typename Index>
2038 const Vec512<Index> index) {
2039 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2040 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2041}
2042
2044 float* HWY_RESTRICT base,
2045 const Vec512<int32_t> offset) {
2046 _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
2047}
2049 float* HWY_RESTRICT base,
2050 const Vec512<int32_t> index) {
2051 _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
2052}
2053
2055 double* HWY_RESTRICT base,
2056 const Vec512<int64_t> offset) {
2057 _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
2058}
2060 double* HWY_RESTRICT base,
2061 const Vec512<int64_t> index) {
2062 _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
2063}
2064
2065// ------------------------------ Gather
2066
2067namespace detail {
2068
2069template <typename T>
2071 Full512<T> /* tag */,
2072 const T* HWY_RESTRICT base,
2073 const Vec512<int32_t> offset) {
2074 return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
2075}
2076template <typename T>
2078 Full512<T> /* tag */,
2079 const T* HWY_RESTRICT base,
2080 const Vec512<int32_t> index) {
2081 return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
2082}
2083
2084template <typename T>
2086 Full512<T> /* tag */,
2087 const T* HWY_RESTRICT base,
2088 const Vec512<int64_t> offset) {
2089 return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
2090}
2091template <typename T>
2093 Full512<T> /* tag */,
2094 const T* HWY_RESTRICT base,
2095 const Vec512<int64_t> index) {
2096 return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
2097}
2098
2099} // namespace detail
2100
2101template <typename T, typename Offset>
2103 const Vec512<Offset> offset) {
2104 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2105 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2106}
2107template <typename T, typename Index>
2109 const Vec512<Index> index) {
2110 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2111 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2112}
2113
2115 const float* HWY_RESTRICT base,
2116 const Vec512<int32_t> offset) {
2117 return Vec512<float>{_mm512_i32gather_ps(offset.raw, base, 1)};
2118}
2120 const float* HWY_RESTRICT base,
2121 const Vec512<int32_t> index) {
2122 return Vec512<float>{_mm512_i32gather_ps(index.raw, base, 4)};
2123}
2124
2126 const double* HWY_RESTRICT base,
2127 const Vec512<int64_t> offset) {
2128 return Vec512<double>{_mm512_i64gather_pd(offset.raw, base, 1)};
2129}
2131 const double* HWY_RESTRICT base,
2132 const Vec512<int64_t> index) {
2133 return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
2134}
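// Usage sketch (editor's addition, not from the original source): gathering
// every other float from a hypothetical `base` array, assuming highway.h is
// included and we are inside HWY_NAMESPACE:
//   const Full512<float> d;
//   const Full512<int32_t> di;
//   const auto idx = ShiftLeft<1>(Iota(di, 0));   // 0, 2, 4, ... (indices)
//   const auto v = GatherIndex(d, base, idx);     // base[0], base[2], ...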
2135
2136HWY_DIAGNOSTICS(pop)
2137
2138// ================================================== SWIZZLE
2139
2140// ------------------------------ LowerHalf
2141
2142template <typename T>
2144 return Vec256<T>{_mm512_castsi512_si256(v.raw)};
2145}
2147 return Vec256<float>{_mm512_castps512_ps256(v.raw)};
2148}
2150 return Vec256<double>{_mm512_castpd512_pd256(v.raw)};
2151}
2152
2153template <typename T>
2155 return LowerHalf(Full256<T>(), v);
2156}
2157
2158// ------------------------------ UpperHalf
2159
2160template <typename T>
2162 return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
2163}
2165 return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)};
2166}
2168 return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)};
2169}
2170
2171// ------------------------------ ExtractLane (Store)
2172template <typename T>
2173HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
2174 const Full512<T> d;
2175 HWY_DASSERT(i < Lanes(d));
2176 alignas(64) T lanes[64 / sizeof(T)];
2177 Store(v, d, lanes);
2178 return lanes[i];
2179}
2180
2181// ------------------------------ InsertLane (Store)
2182template <typename T>
2183HWY_API Vec512<T> InsertLane(const Vec512<T> v, size_t i, T t) {
2184 const Full512<T> d;
2185 HWY_DASSERT(i < Lanes(d));
2186 alignas(64) T lanes[64 / sizeof(T)];
2187 Store(v, d, lanes);
2188 lanes[i] = t;
2189 return Load(d, lanes);
2190}
2191
2192// ------------------------------ GetLane (LowerHalf)
2193template <typename T>
2195 return GetLane(LowerHalf(v));
2196}
2197
2198// ------------------------------ ZeroExtendVector
2199
2200template <typename T>
2202#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h.
2203 return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
2204#else
2205 return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
2206#endif
2207}
2209 Vec256<float> lo) {
2210#if HWY_HAVE_ZEXT
2211 return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
2212#else
2213 return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
2214#endif
2215}
2217 Vec256<double> lo) {
2218#if HWY_HAVE_ZEXT
2219 return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
2220#else
2221 return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
2222#endif
2223}
2224
2225// ------------------------------ Combine
2226
2227template <typename T>
2229 const auto lo512 = ZeroExtendVector(d, lo);
2230 return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
2231}
2233 Vec256<float> lo) {
2234 const auto lo512 = ZeroExtendVector(d, lo);
2235 return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)};
2236}
2238 Vec256<double> lo) {
2239 const auto lo512 = ZeroExtendVector(d, lo);
2240 return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)};
2241}
2242
2243// ------------------------------ ShiftLeftBytes
2244
2245template <int kBytes, typename T>
2247 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2248 return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
2249}
2250
2251template <int kBytes, typename T>
2253 return ShiftLeftBytes<kBytes>(Full512<T>(), v);
2254}
2255
2256// ------------------------------ ShiftLeftLanes
2257
2258template <int kLanes, typename T>
2260 const Repartition<uint8_t, decltype(d)> d8;
2261 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2262}
2263
2264template <int kLanes, typename T>
2266 return ShiftLeftLanes<kLanes>(Full512<T>(), v);
2267}
2268
2269// ------------------------------ ShiftRightBytes
2270template <int kBytes, typename T>
2272 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2273 return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
2274}
2275
2276// ------------------------------ ShiftRightLanes
2277template <int kLanes, typename T>
2279 const Repartition<uint8_t, decltype(d)> d8;
2280 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2281}
2282
2283// ------------------------------ CombineShiftRightBytes
2284
2285template <int kBytes, typename T, class V = Vec512<T>>
2287 const Repartition<uint8_t, decltype(d)> d8;
2288 return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
2289 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2290}
2291
2292// ------------------------------ Broadcast/splat any lane
2293
2294// Unsigned
2295template <int kLane>
2297 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2298 if (kLane < 4) {
2299 const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2300 return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
2301 } else {
2302 const __m512i hi =
2303 _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2304 return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
2305 }
2306}
2307template <int kLane>
2309 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2310 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2311 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2312}
2313template <int kLane>
2315 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2316 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2317 return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2318}
2319
2320// Signed
2321template <int kLane>
2323 static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2324 if (kLane < 4) {
2325 const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2326 return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
2327 } else {
2328 const __m512i hi =
2329 _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2330 return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
2331 }
2332}
2333template <int kLane>
2335 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2336 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2337 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2338}
2339template <int kLane>
2341 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2342 constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2343 return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2344}
2345
2346// Float
2347template <int kLane>
2349 static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2350 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2351 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
2352}
2353template <int kLane>
2355 static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2356 constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
2357 return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
2358}
2359
2360// ------------------------------ Hard-coded shuffles
2361
2362// Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2363// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2364// right (the previous least-significant lane is now most-significant =>
2365// 47650321). These could also be implemented via CombineShiftRightBytes but
2366// the shuffle_abcd notation is more convenient.
2367
2368// Swap 32-bit halves in 64-bit halves.
2369template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2371 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2372}
2374 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
2375}
2376
2377namespace detail {
2378
2379template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2381 const Full512<T> d;
2382 const RebindToFloat<decltype(d)> df;
2383 return BitCast(
2384 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2385 _MM_PERM_CDAB)});
2386}
2387template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2389 const Full512<T> d;
2390 const RebindToFloat<decltype(d)> df;
2391 return BitCast(
2392 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2393 _MM_PERM_BCDA)});
2394}
2395template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2397 const Full512<T> d;
2398 const RebindToFloat<decltype(d)> df;
2399 return BitCast(
2400 d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
2401 _MM_PERM_DABC)});
2402}
2403
2404} // namespace detail
2405
2406// Swap 64-bit halves
2408 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2409}
2411 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2412}
2414 // Shorter encoding than _mm512_permute_ps.
2415 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
2416}
2418 return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2419}
2421 return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2422}
2424 // Shorter encoding than _mm512_permute_pd.
2425 return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
2426}
2427
2428// Rotate right 32 bits
2430 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2431}
2433 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2434}
2436 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
2437}
2438// Rotate left 32 bits
2440 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2441}
2443 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2444}
2446 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
2447}
2448
2449// Reverse
2451 return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2452}
2454 return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2455}
2457 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
2458}
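
// [Editorial sketch, not part of the original header] Illustrates the
// per-128-bit-block behavior of the hard-coded shuffles in the notation used
// above (lanes listed most- to least-significant within each block).
// ExampleBlockShuffles is a placeholder name.
static inline void ExampleBlockShuffles(Vec512<uint32_t> v) {
  // For input lanes 3,2,1,0 within each block:
  const Vec512<uint32_t> rotated = Shuffle0321(v);   // 0,3,2,1 per block
  const Vec512<uint32_t> swapped = Shuffle1032(v);   // 1,0,3,2 per block
  const Vec512<uint32_t> reversed = Shuffle0123(v);  // 0,1,2,3 per block
  (void)rotated;
  (void)swapped;
  (void)reversed;
}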
2459
2460// ------------------------------ TableLookupLanes
2461
2462// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2463template <typename T>
2465 __m512i raw;
2466};
2467
2468template <typename T, typename TI>
2470 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2471#if HWY_IS_DEBUG_BUILD
2472 const Full512<TI> di;
2473 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2474 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));
2475#endif
2476 return Indices512<T>{vec.raw};
2477}
2478
2479template <typename T, typename TI>
2481 const Rebind<TI, decltype(d)> di;
2482 return IndicesFromVec(d, LoadU(di, idx));
2483}
2484
2485template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2487 return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
2488}
2489
2490template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2491HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
2492 return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
2493}
2494
2496 return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
2497}
2498
2500 Indices512<double> idx) {
2501 return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)};
2502}
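
// [Editorial sketch, not part of the original header] Unlike the Shuffle*
// ops above, TableLookupLanes indices may cross 128-bit blocks. This rotates
// all 16 float lanes down by one; ExampleRotateLanesDown1 and kRotate are
// placeholder names.
static inline Vec512<float> ExampleRotateLanesDown1(Vec512<float> v) {
  const Full512<float> d;
  alignas(64) static constexpr int32_t kRotate[16] = {
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0};
  return TableLookupLanes(v, SetTableIndices(d, kRotate));
}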
2503
2504// ------------------------------ Reverse
2505
2506template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2508 const RebindToSigned<decltype(d)> di;
2509 alignas(64) constexpr int16_t kReverse[32] = {
2510 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
2511 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2512 const Vec512<int16_t> idx = Load(di, kReverse);
2513 return BitCast(d, Vec512<int16_t>{
2514 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2515}
2516
2517template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2518HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2519 alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2520 7, 6, 5, 4, 3, 2, 1, 0};
2521 return TableLookupLanes(v, SetTableIndices(d, kReverse));
2522}
2523
2524template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2525HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2526 alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2527 return TableLookupLanes(v, SetTableIndices(d, kReverse));
2528}
2529
2530// ------------------------------ Reverse2
2531
2532template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2534 const Full512<uint32_t> du32;
2535 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2536}
2537
2538template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2539HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2540 return Shuffle2301(v);
2541}
2542
2543template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2544HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2545 return Shuffle01(v);
2546}
2547
2548// ------------------------------ Reverse4
2549
2550template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2552 const RebindToSigned<decltype(d)> di;
2553 alignas(64) constexpr int16_t kReverse4[32] = {
2554 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
2555 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
2556 const Vec512<int16_t> idx = Load(di, kReverse4);
2557 return BitCast(d, Vec512<int16_t>{
2558 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2559}
2560
2561template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2562HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2563 return Shuffle0123(v);
2564}
2565
2566template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2567HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2568 return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2569}
2571 return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2572}
2573
2574// ------------------------------ Reverse8
2575
2576template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2578 const RebindToSigned<decltype(d)> di;
2579 alignas(64) constexpr int16_t kReverse8[32] = {
2580 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
2581 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
2582 const Vec512<int16_t> idx = Load(di, kReverse8);
2583 return BitCast(d, Vec512<int16_t>{
2584 _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2585}
2586
2587template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2588HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2589 const RebindToSigned<decltype(d)> di;
2590 alignas(64) constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2591 15, 14, 13, 12, 11, 10, 9, 8};
2592 const Vec512<int32_t> idx = Load(di, kReverse8);
2593 return BitCast(d, Vec512<int32_t>{
2594 _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
2595}
2596
2597template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2598HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2599 return Reverse(d, v);
2600}
2601
2602// ------------------------------ InterleaveLower
2603
2604// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2605// the least-significant lane) and "b". To concatenate two half-width integers
2606// into one, use ZipLower/Upper instead (also works with scalar).
2607
2609 const Vec512<uint8_t> b) {
2610 return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2611}
2613 const Vec512<uint16_t> b) {
2614 return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2615}
2617 const Vec512<uint32_t> b) {
2618 return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2619}
2621 const Vec512<uint64_t> b) {
2622 return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2623}
2624
2626 const Vec512<int8_t> b) {
2627 return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2628}
2630 const Vec512<int16_t> b) {
2631 return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2632}
2634 const Vec512<int32_t> b) {
2635 return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2636}
2638 const Vec512<int64_t> b) {
2639 return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2640}
2641
2643 const Vec512<float> b) {
2644 return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
2645}
2647 const Vec512<double> b) {
2648 return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
2649}
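
// [Editorial sketch, not part of the original header] InterleaveLower is
// block-local: within each 128-bit block the result is b1,a1,b0,a0 (most- to
// least-significant), so lanes never move across blocks. ExampleInterleave is
// a placeholder name.
static inline Vec512<uint32_t> ExampleInterleave(Vec512<uint32_t> a,
                                                 Vec512<uint32_t> b) {
  // Lanes from the upper half of each block are produced by InterleaveUpper.
  return InterleaveLower(a, b);
}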
2650
2651// ------------------------------ InterleaveUpper
2652
2653// All functions inside detail lack the required D parameter.
2654namespace detail {
2655
2657 const Vec512<uint8_t> b) {
2658 return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2659}
2661 const Vec512<uint16_t> b) {
2662 return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2663}
2665 const Vec512<uint32_t> b) {
2666 return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2667}
2669 const Vec512<uint64_t> b) {
2670 return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2671}
2672
2674 const Vec512<int8_t> b) {
2675 return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2676}
2678 const Vec512<int16_t> b) {
2679 return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2680}
2682 const Vec512<int32_t> b) {
2683 return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2684}
2686 const Vec512<int64_t> b) {
2687 return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2688}
2689
2691 const Vec512<float> b) {
2692 return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)};
2693}
2695 const Vec512<double> b) {
2696 return Vec512<double>{_mm512_unpackhi_pd(a.raw, b.raw)};
2697}
2698
2699} // namespace detail
2700
2701template <typename T, class V = Vec512<T>>
2702HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
2703 return detail::InterleaveUpper(a, b);
2704}
2705
2706// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2707
2708// Same as Interleave*, except that the return lanes are double-width integers;
2709// this is necessary because the single-lane scalar cannot return two values.
2710template <typename T, typename TW = MakeWide<T>>
2712 return BitCast(Full512<TW>(), InterleaveLower(a, b));
2713}
2714template <typename T, typename TW = MakeWide<T>>
2716 return BitCast(Full512<TW>(), InterleaveLower(a, b));
2717}
2718
2719template <typename T, typename TW = MakeWide<T>>
2721 return BitCast(Full512<TW>(), InterleaveUpper(d, a, b));
2722}
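
// [Editorial sketch, not part of the original header] ZipLower reinterprets
// the interleaved pairs as double-width lanes: each u16 result lane holds a
// byte from `a` in its lower 8 bits and the matching byte from `b` in its
// upper 8 bits (within 128-bit blocks). ExampleZipBytes is a placeholder.
static inline Vec512<uint16_t> ExampleZipBytes(Vec512<uint8_t> a,
                                               Vec512<uint8_t> b) {
  return ZipLower(a, b);
}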
2723
2724// ------------------------------ Concat* halves
2725
2726// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2727template <typename T>
2729 const Vec512<T> lo) {
2730 return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2731}
2733 const Vec512<float> hi,
2734 const Vec512<float> lo) {
2735 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2736}
2738 const Vec512<double> hi,
2739 const Vec512<double> lo) {
2740 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
2741}
2742
2743// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2744template <typename T>
2746 const Vec512<T> lo) {
2747 return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2748}
2750 const Vec512<float> hi,
2751 const Vec512<float> lo) {
2752 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2753}
2755 const Vec512<double> hi,
2756 const Vec512<double> lo) {
2757 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
2758}
2759
2760// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2761template <typename T>
2763 const Vec512<T> lo) {
2764 return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2765}
2767 const Vec512<float> hi,
2768 const Vec512<float> lo) {
2769 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2770}
2772 const Vec512<double> hi,
2773 const Vec512<double> lo) {
2774 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
2775}
2776
2777// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2778template <typename T>
2780 const Vec512<T> lo) {
2781 // There is no imm8 blend in AVX512. Use a 16-bit blend because 32-bit
2782 // masks are efficiently loaded from 32-bit regs.
2783 const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
2784 return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
2785}
2787 const Vec512<float> hi,
2788 const Vec512<float> lo) {
2789 const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
2790 return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
2791}
2793 const Vec512<double> hi,
2794 const Vec512<double> lo) {
2795 const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
2796 return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
2797}
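
// [Editorial sketch, not part of the original header] Following the
// hiH,hiL loH,loL diagrams above: the result's lower 256 bits are the lower
// half of `lo` and its upper 256 bits are the upper half of `hi` (the
// "outer halves"). ExampleOuterHalves is a placeholder name.
static inline Vec512<double> ExampleOuterHalves(Vec512<double> hi,
                                                Vec512<double> lo) {
  return ConcatUpperLower(Full512<double>(), hi, lo);
}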
2798
2799// ------------------------------ ConcatOdd
2800
2801template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2803 const RebindToUnsigned<decltype(d)> du;
2804#if HWY_TARGET == HWY_AVX3_DL
2805 alignas(64) constexpr uint8_t kIdx[64] = {
2806 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
2807 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51,
2808 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
2809 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103,
2810 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
2811 return BitCast(d,
2812 Vec512<uint8_t>{_mm512_mask2_permutex2var_epi8(
2813 BitCast(du, lo).raw, Load(du, kIdx).raw,
2814 __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
2815#else
2816 const RepartitionToWide<decltype(du)> dw;
2817 // Right-shift 8 bits per u16 so we can pack.
2818 const Vec512<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
2819 const Vec512<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
2820 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
2821 // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
2822 const Full512<uint64_t> du64;
2823 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2824 return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
2825#endif
2826}
2827
2828template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2829HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2830 const RebindToUnsigned<decltype(d)> du;
2831 alignas(64) constexpr uint16_t kIdx[32] = {
2832 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2833 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
2834 return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
2835 BitCast(du, lo).raw, Load(du, kIdx).raw,
2836 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
2837}
2838
2839template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2840HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2841 const RebindToUnsigned<decltype(d)> du;
2842 alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2843 17, 19, 21, 23, 25, 27, 29, 31};
2844 return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2845 BitCast(du, lo).raw, Load(du, kIdx).raw,
2846 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2847}
2848
2850 Vec512<float> lo) {
2851 const RebindToUnsigned<decltype(d)> du;
2852 alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2853 17, 19, 21, 23, 25, 27, 29, 31};
2854 return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2855 __mmask16{0xFFFF}, hi.raw)};
2856}
2857
2858template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2859HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2860 const RebindToUnsigned<decltype(d)> du;
2861 alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2862 return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2863 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2864 BitCast(du, hi).raw)});
2865}
2866
2868 Vec512<double> lo) {
2869 const RebindToUnsigned<decltype(d)> du;
2870 alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2871 return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2872 __mmask8{0xFF}, hi.raw)};
2873}
2874
2875// ------------------------------ ConcatEven
2876
2877template <typename T, HWY_IF_LANE_SIZE(T, 1)>
2879 const RebindToUnsigned<decltype(d)> du;
2880#if HWY_TARGET == HWY_AVX3_DL
2881 alignas(64) constexpr uint8_t kIdx[64] = {
2882 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
2883 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
2884 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
2885 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
2886 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
2887 return BitCast(d,
2888 Vec512<uint32_t>{_mm512_mask2_permutex2var_epi8(
2889 BitCast(du, lo).raw, Load(du, kIdx).raw,
2890 __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
2891#else
2892 const RepartitionToWide<decltype(du)> dw;
2893 // Isolate lower 8 bits per u16 so we can pack.
2894 const Vec512<uint16_t> mask = Set(dw, 0x00FF);
2895 const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
2896 const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
2897 const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
2898 // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
2899 const Full512<uint64_t> du64;
2900 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
2901 return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
2902#endif
2903}
2904
2905template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2906HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2907 const RebindToUnsigned<decltype(d)> du;
2908 alignas(64) constexpr uint16_t kIdx[32] = {
2909 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
2910 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
2911 return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi16(
2912 BitCast(du, lo).raw, Load(du, kIdx).raw,
2913 __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
2914}
2915
2916template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2917HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2918 const RebindToUnsigned<decltype(d)> du;
2919 alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2920 16, 18, 20, 22, 24, 26, 28, 30};
2921 return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2922 BitCast(du, lo).raw, Load(du, kIdx).raw,
2923 __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2924}
2925
2927 Vec512<float> lo) {
2928 const RebindToUnsigned<decltype(d)> du;
2929 alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2930 16, 18, 20, 22, 24, 26, 28, 30};
2931 return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2932 __mmask16{0xFFFF}, hi.raw)};
2933}
2934
2935template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2936HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2937 const RebindToUnsigned<decltype(d)> du;
2938 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2939 return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2940 BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2941 BitCast(du, hi).raw)});
2942}
2943
2945 Vec512<double> lo) {
2946 const RebindToUnsigned<decltype(d)> du;
2947 alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2948 return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2949 __mmask8{0xFF}, hi.raw)};
2950}
2951
2952// ------------------------------ DupEven (InterleaveLower)
2953
2954template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2956 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
2957}
2959 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
2960}
2961
2962template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2963HWY_API Vec512<T> DupEven(const Vec512<T> v) {
2964 return InterleaveLower(Full512<T>(), v, v);
2965}
2966
2967// ------------------------------ DupOdd (InterleaveUpper)
2968
2969template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2971 return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
2972}
2974 return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
2975}
2976
2977template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2978HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
2979 return InterleaveUpper(Full512<T>(), v, v);
2980}
2981
2982// ------------------------------ OddEven
2983
2984template <typename T>
2986 constexpr size_t s = sizeof(T);
2987 constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
2988 return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
2989}
2990
2991// ------------------------------ OddEvenBlocks
2992
2993template <typename T>
2995 return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
2996}
2997
2999 return Vec512<float>{
3000 _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
3001}
3002
3004 return Vec512<double>{
3005 _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
3006}
3007
3008// ------------------------------ SwapAdjacentBlocks
3009
3010template <typename T>
3012 return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
3013}
3014
3016 return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
3017}
3018
3020 return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
3021}
3022
3023// ------------------------------ ReverseBlocks
3024
3025template <typename T>
3027 return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
3028}
3030 return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
3031}
3033 Vec512<double> v) {
3034 return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
3035}
3036
3037// ------------------------------ TableLookupBytes (ZeroExtendVector)
3038
3039// Both full
3040template <typename T, typename TI>
3042 return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
3043}
3044
3045// Partial index vector
3046template <typename T, typename TI, size_t NI>
3048 const Full512<TI> d512;
3049 const Half<decltype(d512)> d256;
3050 const Half<decltype(d256)> d128;
3051 // First expand to full 128, then 256, then 512.
3052 const Vec128<TI> from_full{from.raw};
3053 const auto from_512 =
3054 ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
3055 const auto tbl_full = TableLookupBytes(bytes, from_512);
3056 // Shrink to 256, then 128, then partial.
3057 return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
3058}
3059template <typename T, typename TI>
3061 const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
3062 return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
3063}
3064
3065// Partial table vector
3066template <typename T, size_t N, typename TI>
3068 const Full512<TI> d512;
3069 const Half<decltype(d512)> d256;
3070 const Half<decltype(d256)> d128;
3071 // First expand to full 128, then 256, then 512.
3072 const Vec128<T> bytes_full{bytes.raw};
3073 const auto bytes_512 =
3074 ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
3075 return TableLookupBytes(bytes_512, from);
3076}
3077template <typename T, typename TI>
3079 const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
3080 return TableLookupBytes(bytes_512, from);
3081}
3082
3083// Partial both are handled by x86_128/256.
3084
3085// ================================================== CONVERT
3086
3087// ------------------------------ Promotions (part w/ narrow lanes -> full)
3088
3089// Unsigned: zero-extend.
3090// Note: these have 3-cycle latency; if inputs are already split across the
3091// 128-bit blocks (in their upper/lower halves), then Zip* would be faster.
3094 return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)};
3095}
3098 return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)};
3099}
3102 return Vec512<int16_t>{_mm512_cvtepu8_epi16(v.raw)};
3103}
3106 return Vec512<int32_t>{_mm512_cvtepu8_epi32(v.raw)};
3107}
3110 return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)};
3111}
3114 return Vec512<int32_t>{_mm512_cvtepu16_epi32(v.raw)};
3115}
3118 return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)};
3119}
3120
3121// Signed: replicate sign bit.
3122// Note: these have 3-cycle latency; if inputs are already split across the
3123// 128-bit blocks (in their upper/lower halves), then ZipUpper/Lower followed
3124// by a signed shift would be faster.
3126 Vec256<int8_t> v) {
3127 return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)};
3128}
3130 Vec128<int8_t> v) {
3131 return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)};
3132}
3135 return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)};
3136}
3139 return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)};
3140}
3141
3142// Float
3144 const Vec256<float16_t> v) {
3145 return Vec512<float>{_mm512_cvtph_ps(v.raw)};
3146}
3147
3149 const Vec256<bfloat16_t> v) {
3150 const Rebind<uint16_t, decltype(df32)> du16;
3151 const RebindToSigned<decltype(df32)> di32;
3152 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3153}
3154
3156 return Vec512<double>{_mm512_cvtps_pd(v.raw)};
3157}
3158
3160 return Vec512<double>{_mm512_cvtepi32_pd(v.raw)};
3161}
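
// [Editorial sketch, not part of the original header] PromoteTo widens every
// lane of the narrower input into a full 512-bit vector, e.g. 32 u8 lanes are
// zero-extended to 32 u16 lanes. ExamplePromoteU8 is a placeholder name.
static inline Vec512<uint16_t> ExamplePromoteU8(Vec256<uint8_t> v) {
  return PromoteTo(Full512<uint16_t>(), v);
}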
3162
3163// ------------------------------ Demotions (full -> part w/ narrow lanes)
3164
3166 const Vec512<int32_t> v) {
3167 const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3168
3169 // Compress even u64 lanes into 256 bit.
3170 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3171 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3172 const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
3173 return LowerHalf(even);
3174}
3175
3177 const Vec512<int32_t> v) {
3178 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3179
3180 // Compress even u64 lanes into 256 bit.
3181 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3182 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3183 const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
3184 return LowerHalf(even);
3185}
3186
3188 const Vec512<int32_t> v) {
3189 const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3190 // packus treats the input as signed; we want unsigned. Clear the MSB to get
3191 // unsigned saturation to u8.
3192 const Vec512<int16_t> i16{
3193 _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
3194 const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
3195
3196 alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
3197 const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3198 const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
3199 return LowerHalf(LowerHalf(fixed));
3200}
3201
3203 const Vec512<int16_t> v) {
3204 const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
3205
3206 // Compress even u64 lanes into 256 bit.
3207 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3208 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3209 const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3210 return LowerHalf(even);
3211}
3212
3214 const Vec512<int32_t> v) {
3215 const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3216 const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
3217
3218 alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
3219 0, 4, 8, 12, 0, 4, 8, 12};
3220 const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3221 const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
3222 return LowerHalf(LowerHalf(fixed));
3223}
3224
3226 const Vec512<int16_t> v) {
3227 const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)};
3228
3229 // Compress even u64 lanes into 256 bit.
3230 alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3231 const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3232 const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3233 return LowerHalf(even);
3234}
3235
3237 const Vec512<float> v) {
3238 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3239 HWY_DIAGNOSTICS(push)
3240 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3241 return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3242 HWY_DIAGNOSTICS(pop)
3243}
3244
3246 const Vec512<float> v) {
3247 // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
3248 const Rebind<int32_t, decltype(dbf16)> di32;
3249 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3250 const Rebind<uint16_t, decltype(dbf16)> du16;
3251 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3252 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3253}
3254
3257 // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
3258 const RebindToUnsigned<decltype(dbf16)> du16;
3259 const Repartition<uint32_t, decltype(dbf16)> du32;
3260 const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
3261 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3262}
3263
3265 const Vec512<double> v) {
3266 return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
3267}
3268
3270 const Vec512<double> v) {
3271 const auto clamped = detail::ClampF64ToI32Max(Full512<double>(), v);
3272 return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)};
3273}
3274
3275// For already range-limited input [0, 255].
3277 const Full512<uint32_t> d32;
3278 // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
3279 // lowest 4 bytes.
3280 alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3281 ~0u};
3282 const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
3283 // Gather the lowest 4 bytes of 4 128-bit blocks.
3284 alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3285 const Vec512<uint8_t> bytes{
3286 _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
3287 return LowerHalf(LowerHalf(bytes));
3288}
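
// [Editorial sketch, not part of the original header] DemoteTo saturates to
// the target range (here [0, 255]) before packing, whereas U8FromU32 above
// assumes the input is already in range and only gathers the low bytes.
// ExampleDemoteToU8 is a placeholder name.
static inline Vec128<uint8_t> ExampleDemoteToU8(Vec512<int32_t> v) {
  return DemoteTo(Full128<uint8_t>(), v);  // 16 i32 lanes -> 16 saturated u8
}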
3289
3290// ------------------------------ Convert integer <=> floating point
3291
3293 const Vec512<int32_t> v) {
3294 return Vec512<float>{_mm512_cvtepi32_ps(v.raw)};
3295}
3296
3298 const Vec512<int64_t> v) {
3299 return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
3300}
3301
3302// Truncates (rounds toward zero).
3304 return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
3305}
3307 return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw));
3308}
3309
3311 const Full512<int32_t> di;
3312 return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw));
3313}
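
// [Editorial sketch, not part of the original header] ConvertTo truncates
// toward zero while NearestInt rounds to the nearest integer; both route the
// raw result through detail::FixConversionOverflow to handle out-of-range
// inputs. ExampleFloatToInt is a placeholder name.
static inline void ExampleFloatToInt(Vec512<float> v) {
  const Full512<int32_t> di;
  const Vec512<int32_t> truncated = ConvertTo(di, v);  // 2.7f -> 2
  const Vec512<int32_t> rounded = NearestInt(v);       // 2.7f -> 3
  (void)truncated;
  (void)rounded;
}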
3314
3315// ================================================== CRYPTO
3316
3317#if !defined(HWY_DISABLE_PCLMUL_AES)
3318
3319// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3320#ifdef HWY_NATIVE_AES
3321#undef HWY_NATIVE_AES
3322#else
3323#define HWY_NATIVE_AES
3324#endif
3325
3327 Vec512<uint8_t> round_key) {
3328#if HWY_TARGET == HWY_AVX3_DL
3329 return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
3330#else
3331 const Full512<uint8_t> d;
3332 const Half<decltype(d)> d2;
3333 return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3334 AESRound(LowerHalf(state), LowerHalf(round_key)));
3335#endif
3336}
3337
3339 Vec512<uint8_t> round_key) {
3340#if HWY_TARGET == HWY_AVX3_DL
3341 return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
3342#else
3343 const Full512<uint8_t> d;
3344 const Half<decltype(d)> d2;
3345 return Combine(d,
3346 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3347 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
3348#endif
3349}
3350
3352#if HWY_TARGET == HWY_AVX3_DL
3353 return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
3354#else
3355 alignas(64) uint64_t a[8];
3356 alignas(64) uint64_t b[8];
3357 const Full512<uint64_t> d;
3358 const Full128<uint64_t> d128;
3359 Store(va, d, a);
3360 Store(vb, d, b);
3361 for (size_t i = 0; i < 8; i += 2) {
3362 const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
3363 Store(mul, d128, a + i);
3364 }
3365 return Load(d, a);
3366#endif
3367}
3368
3370#if HWY_TARGET == HWY_AVX3_DL
3371 return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
3372#else
3373 alignas(64) uint64_t a[8];
3374 alignas(64) uint64_t b[8];
3375 const Full512<uint64_t> d;
3376 const Full128<uint64_t> d128;
3377 Store(va, d, a);
3378 Store(vb, d, b);
3379 for (size_t i = 0; i < 8; i += 2) {
3380 const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
3381 Store(mul, d128, a + i);
3382 }
3383 return Load(d, a);
3384#endif
3385}
3386
3387#endif // HWY_DISABLE_PCLMUL_AES
3388
3389// ================================================== MISC
3390
3391// Returns a vector with lane i=[0, N) set to "first" + i.
3392template <typename T, typename T2>
3393Vec512<T> Iota(const Full512<T> d, const T2 first) {
3394 HWY_ALIGN T lanes[64 / sizeof(T)];
3395 for (size_t i = 0; i < 64 / sizeof(T); ++i) {
3396 lanes[i] = static_cast<T>(first + static_cast<T2>(i));
3397 }
3398 return Load(d, lanes);
3399}
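
// [Editorial sketch, not part of the original header] Typical Iota use: an
// index vector for comparisons against a loop counter. ExampleIota is a
// placeholder name.
static inline Vec512<int32_t> ExampleIota() {
  const Full512<int32_t> d;
  return Iota(d, 100);  // lanes 100, 101, ..., 115 (lane 0 least-significant)
}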
3400
3401// ------------------------------ Mask testing
3402
3403// Beware: the suffix indicates the number of mask bits, not lane size!
3404
3405namespace detail {
3406
3407template <typename T>
3409#if HWY_COMPILER_HAS_MASK_INTRINSICS
3410 return _kortestz_mask64_u8(mask.raw, mask.raw);
3411#else
3412 return mask.raw == 0;
3413#endif
3414}
3415template <typename T>
3417#if HWY_COMPILER_HAS_MASK_INTRINSICS
3418 return _kortestz_mask32_u8(mask.raw, mask.raw);
3419#else
3420 return mask.raw == 0;
3421#endif
3422}
3423template <typename T>
3425#if HWY_COMPILER_HAS_MASK_INTRINSICS
3426 return _kortestz_mask16_u8(mask.raw, mask.raw);
3427#else
3428 return mask.raw == 0;
3429#endif
3430}
3431template <typename T>
3433#if HWY_COMPILER_HAS_MASK_INTRINSICS
3434 return _kortestz_mask8_u8(mask.raw, mask.raw);
3435#else
3436 return mask.raw == 0;
3437#endif
3438}
3439
3440} // namespace detail
3441
3442template <typename T>
3443HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
3444 return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
3445}
3446
3447namespace detail {
3448
3449template <typename T>
3451#if HWY_COMPILER_HAS_MASK_INTRINSICS
3452 return _kortestc_mask64_u8(mask.raw, mask.raw);
3453#else
3454 return mask.raw == 0xFFFFFFFFFFFFFFFFull;
3455#endif
3456}
3457template <typename T>
3459#if HWY_COMPILER_HAS_MASK_INTRINSICS
3460 return _kortestc_mask32_u8(mask.raw, mask.raw);
3461#else
3462 return mask.raw == 0xFFFFFFFFull;
3463#endif
3464}
3465template <typename T>
3467#if HWY_COMPILER_HAS_MASK_INTRINSICS
3468 return _kortestc_mask16_u8(mask.raw, mask.raw);
3469#else
3470 return mask.raw == 0xFFFFull;
3471#endif
3472}
3473template <typename T>
3475#if HWY_COMPILER_HAS_MASK_INTRINSICS
3476 return _kortestc_mask8_u8(mask.raw, mask.raw);
3477#else
3478 return mask.raw == 0xFFull;
3479#endif
3480}
3481
3482} // namespace detail
3483
3484template <typename T>
3485HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3486 return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
3487}
3488
3489// `p` points to at least 8 readable bytes, not all of which need be valid.
3490template <typename T>
3492 const uint8_t* HWY_RESTRICT bits) {
3493 Mask512<T> mask;
3494 CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
3495 // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3496 return mask;
3497}
3498
3499// `p` points to at least 8 writable bytes.
3500template <typename T>
3501HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
3502 uint8_t* bits) {
3503 const size_t kNumBytes = 8 / sizeof(T);
3504 CopyBytes<kNumBytes>(&mask.raw, bits);
3505 // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3506 return kNumBytes;
3507}
3508
3509template <typename T>
3510HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3511 return PopCount(static_cast<uint64_t>(mask.raw));
3512}
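
// [Editorial sketch, not part of the original header] Round-trips a mask
// through its bit representation; for 16 float lanes StoreMaskBits writes
// 8 / sizeof(float) = 2 bytes. ExampleMaskBitsRoundTrip is a placeholder.
static inline bool ExampleMaskBitsRoundTrip(Mask512<float> m) {
  const Full512<float> d;
  alignas(8) uint8_t bits[8] = {0};
  (void)StoreMaskBits(d, m, bits);
  const Mask512<float> m2 = LoadMaskBits(d, bits);
  return CountTrue(d, m2) == CountTrue(d, m);
}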
3513
3514template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
3515HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3516 const Mask512<T> mask) {
3517 return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
3518}
3519
3520template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3521HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3522 const Mask512<T> mask) {
3523 return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
3524}
3525
3526// ------------------------------ Compress
3527
3528template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3530 return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
3531}
3532
3534 return Vec512<float>{_mm512_maskz_compress_ps(mask.raw, v.raw)};
3535}
3536
3537template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3538HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
3539 // See CompressIsPartition. u64 is faster than u32.
3540 alignas(16) constexpr uint64_t packed_array[256] = {
3541 // PrintCompress32x8Tables
3542 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
3543 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
3544 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
3545 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
3546 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
3547 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
3548 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
3549 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
3550 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
3551 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
3552 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
3553 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
3554 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
3555 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
3556 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
3557 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
3558 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
3559 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
3560 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
3561 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
3562 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
3563 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
3564 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
3565 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
3566 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
3567 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
3568 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
3569 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
3570 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
3571 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
3572 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
3573 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
3574 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
3575 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
3576 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
3577 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
3578 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
3579 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
3580 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
3581 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
3582 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
3583 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
3584 0x10765432, 0x17654320, 0x07654321, 0x76543210};
3585
3586 // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
3587 // _mm512_permutexvar_epi64 will ignore the upper bits.
3588 const Full512<T> d;
3589 const RebindToUnsigned<decltype(d)> du64;
3590 const auto packed = Set(du64, packed_array[mask.raw]);
3591 alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
3592 const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
3593 return TableLookupLanes(v, indices);
3594}
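
// [Editorial sketch, not part of the original header] Compress moves the
// selected lanes to the front; the remaining lanes hold the rejected elements
// (see CompressIsPartition). ExampleCompressSmall is a placeholder name.
static inline Vec512<int64_t> ExampleCompressSmall(Vec512<int64_t> v) {
  const Full512<int64_t> d;
  const Mask512<int64_t> keep = Lt(v, Set(d, int64_t{100}));
  return Compress(v, keep);
}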
3595
3596// 16-bit may use the 32-bit Compress and must be defined after it.
3597//
3598// Ignore IDE redefinition error - this is not actually defined in x86_256 if
3599// we are including x86_512-inl.h.
3600template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3601HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
3602 const Full256<T> d;
3603 const Rebind<uint16_t, decltype(d)> du;
3604 const auto vu = BitCast(du, v); // (required for float16_t inputs)
3605
3606#if HWY_TARGET == HWY_AVX3_DL // VBMI2
3607 const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
3608#else
3609 // Promote to i32 (512-bit vector!) so we can use the native Compress.
3610 const auto vw = PromoteTo(Rebind<int32_t, decltype(d)>(), vu);
3611 const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
3612 const auto cu = DemoteTo(du, Compress(vw, mask32));
3613#endif // HWY_TARGET == HWY_AVX3_DL
3614
3615 return BitCast(d, cu);
3616}
3617
3618// Expands to 32-bit, compresses, then concatenates the demoted halves.
3619template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3620HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
3621 const Full512<T> d;
3622 const Rebind<uint16_t, decltype(d)> du;
3623 const auto vu = BitCast(du, v); // (required for float16_t inputs)
3624
3625#if HWY_TARGET == HWY_AVX3_DL // VBMI2
3626 const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};
3627#else
3628 const Repartition<int32_t, decltype(d)> dw;
3629 const Half<decltype(du)> duh;
3630 const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3631 const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3632
3633 const uint32_t mask_bits{mask.raw};
3634 const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
3635 const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
3636 const auto compressed0 = Compress(promoted0, mask0);
3637 const auto compressed1 = Compress(promoted1, mask1);
3638
3639 const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
3640 const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
3641
3642 // Concatenate into single vector by shifting upper with writemask.
3643 const size_t num0 = CountTrue(dw, mask0);
3644 const __mmask32 m_upper = ~((1u << num0) - 1);
3645 alignas(64) uint16_t iota[64] = {
3646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3648 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3649 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
3650 const auto idx = LoadU(du, iota + 32 - num0);
3651 const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
3652 demoted0.raw, m_upper, idx.raw, demoted1.raw)};
3653#endif // HWY_TARGET == HWY_AVX3_DL
3654
3655 return BitCast(d, cu);
3656}
3657
3658// ------------------------------ CompressNot
3659
3660template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
3662 return Compress(v, Not(mask));
3663}
3664
3665template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3666HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
3667 // See CompressIsPartition. u64 is faster than u32.
3668 alignas(16) constexpr uint64_t packed_array[256] = {
3669 // PrintCompressNot32x8Tables
3670 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
3671 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
3672 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
3673 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
3674 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
3675 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
3676 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
3677 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
3678 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
3679 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
3680 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
3681 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
3682 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
3683 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
3684 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
3685 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
3686 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
3687 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
3688 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
3689 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
3690 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
3691 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
3692 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
3693 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
3694 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
3695 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
3696 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
3697 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
3698 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
3699 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
3700 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
3701 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
3702 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
3703 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
3704 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
3705 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
3706 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
3707 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
3708 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
3709 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
3710 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
3711 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
3712 0x76543210, 0x76543201, 0x76543210, 0x76543210};
3713
3714 // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
3715 // _mm512_permutexvar_epi64 will ignore the upper bits.
3716 const Full512<T> d;
3717 const RebindToUnsigned<decltype(d)> du64;
3718 const auto packed = Set(du64, packed_array[mask.raw]);
3719 alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
3720 const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
3721 return TableLookupLanes(v, indices);
3722}
3723
3725 Mask512<uint64_t> mask) {
3726 return CompressNot(v, mask);
3727}
3728
3729// ------------------------------ CompressBits
3730template <typename T>
3732 return Compress(v, LoadMaskBits(Full512<T>(), bits));
3733}
3734
3735// ------------------------------ CompressStore
3736
3737template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3739 T* HWY_RESTRICT unaligned) {
3740 const Rebind<uint16_t, decltype(d)> du;
3741 const auto vu = BitCast(du, v); // (required for float16_t inputs)
3742
3743 const uint64_t mask_bits{mask.raw};
3744
3745#if HWY_TARGET == HWY_AVX3_DL // VBMI2
3746 _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
3747#else
3748 const Repartition<int32_t, decltype(d)> dw;
3749 const Half<decltype(du)> duh;
3750 const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3751 const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3752
3753 const uint64_t maskL = mask_bits & 0xFFFF;
3754 const uint64_t maskH = mask_bits >> 16;
3755 const Mask512<int32_t> mask0{static_cast<__mmask16>(maskL)};
3756 const Mask512<int32_t> mask1{static_cast<__mmask16>(maskH)};
3757 const auto compressed0 = Compress(promoted0, mask0);
3758 const auto compressed1 = Compress(promoted1, mask1);
3759
3760 const Half<decltype(d)> dh;
3761 const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0));
3762 const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1));
3763
3764 // Store 256-bit halves
3765 StoreU(demoted0, dh, unaligned);
3766 StoreU(demoted1, dh, unaligned + PopCount(maskL));
3767#endif
3768
3769 return PopCount(mask_bits);
3770}
3771
3772template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3773HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3774 T* HWY_RESTRICT unaligned) {
3775 _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3776 const size_t count = PopCount(uint64_t{mask.raw});
3777// Workaround for MSAN not marking output as initialized (b/233326619)
3778#if HWY_IS_MSAN
3779 __msan_unpoison(unaligned, count * sizeof(T));
3780#endif
3781 return count;
3782}
3783
3784template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3785HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3786 T* HWY_RESTRICT unaligned) {
3787 _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3788 const size_t count = PopCount(uint64_t{mask.raw});
3789// Workaround for MSAN not marking output as initialized (b/233326619)
3790#if HWY_IS_MSAN
3791 __msan_unpoison(unaligned, count * sizeof(T));
3792#endif
3793 return count;
3794}
3795
3797 Full512<float> /* tag */,
3798 float* HWY_RESTRICT unaligned) {
3799 _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
3800 const size_t count = PopCount(uint64_t{mask.raw});
3801// Workaround for MSAN not marking output as initialized (b/233326619)
3802#if HWY_IS_MSAN
3803 __msan_unpoison(unaligned, count * sizeof(float));
3804#endif
3805 return count;
3806}
3807
3809 Full512<double> /* tag */,
3810 double* HWY_RESTRICT unaligned) {
3811 _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
3812 const size_t count = PopCount(uint64_t{mask.raw});
3813// Workaround for MSAN not marking output as initialized (b/233326619)
3814#if HWY_IS_MSAN
3815 __msan_unpoison(unaligned, count * sizeof(double));
3816#endif
3817 return count;
3818}
3819
3820// ------------------------------ CompressBlendedStore
3821template <typename T>
3823 T* HWY_RESTRICT unaligned) {
3824 // AVX-512 already does the blending at no extra cost (latency 11,
3825 // reciprocal throughput 2 - same as compress plus store).
3826 if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
3827 return CompressStore(v, m, d, unaligned);
3828 } else {
3829 const size_t count = CountTrue(d, m);
3830 BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
3831// Workaround for MSAN not marking output as initialized (b/233326619)
3832#if HWY_IS_MSAN
3833 __msan_unpoison(unaligned, count * sizeof(T));
3834#endif
3835 return count;
3836 }
3837}
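
// [Editorial sketch, not part of the original header] A typical filtering
// loop: CompressBlendedStore writes only the selected lanes and returns how
// many were stored. Handling of the final partial vector is omitted here;
// ExampleFilterPositive is a placeholder name.
static inline size_t ExampleFilterPositive(const float* HWY_RESTRICT in,
                                           size_t count,
                                           float* HWY_RESTRICT out) {
  const Full512<float> d;
  const size_t N = Lanes(d);
  size_t num_out = 0;
  for (size_t i = 0; i + N <= count; i += N) {
    const Vec512<float> v = LoadU(d, in + i);
    num_out += CompressBlendedStore(v, Lt(Zero(d), v), d, out + num_out);
  }
  return num_out;
}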
3838
3839// ------------------------------ CompressBitsStore
3840template <typename T>
3842 Full512<T> d, T* HWY_RESTRICT unaligned) {
3843 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
3844}
3845
3846// ------------------------------ LoadInterleaved4
3847
3848// Actually implemented in generic_ops; we just overload LoadTransposedBlocks3/4.
3849namespace detail {
3850
3851// Type-safe wrapper.
3852template <_MM_PERM_ENUM kPerm, typename T>
3854 return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
3855}
3856template <_MM_PERM_ENUM kPerm>
3858 return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)};
3859}
3860template <_MM_PERM_ENUM kPerm>
3862 return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)};
3863}
3864
3865// Input (128-bit blocks):
3866// 3 2 1 0 (<- first block in unaligned)
3867// 7 6 5 4
3868// b a 9 8
3869// Output:
3870// 9 6 3 0 (LSB of A)
3871// a 7 4 1
3872// b 8 5 2
3873template <typename T>
3875 const T* HWY_RESTRICT unaligned,
3876 Vec512<T>& A, Vec512<T>& B, Vec512<T>& C) {
3877 constexpr size_t N = 64 / sizeof(T);
3878 const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
3879 const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
3880 const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
3881
3882 const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
3883 const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
3884
3885 A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
3886 B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
3887 C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
3888}
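// The user-facing op built on this helper is LoadInterleaved3 (generic_ops),
// e.g. de-interleaving packed RGB bytes into planar channel vectors (sketch;
// "rgb" is a hypothetical source pointer):
//   const Full512<uint8_t> d8;
//   Vec512<uint8_t> r, g, b;
//   LoadInterleaved3(d8, rgb, r, g, b);  // each vector holds 64 channel values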
3889
3890// Input (128-bit blocks):
3891// 3 2 1 0 (<- first block in unaligned)
3892// 7 6 5 4
3893// b a 9 8
3894// f e d c
3895// Output:
3896// c 8 4 0 (LSB of A)
3897// d 9 5 1
3898// e a 6 2
3899// f b 7 3
3900template <typename T>
3901HWY_API void LoadTransposedBlocks4(Full512<T> d,
3902 const T* HWY_RESTRICT unaligned,
3903 Vec512<T>& A, Vec512<T>& B, Vec512<T>& C,
3904 Vec512<T>& D) {
3905 constexpr size_t N = 64 / sizeof(T);
3906 const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
3907 const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
3908 const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
3909 const Vec512<T> vfedc = LoadU(d, unaligned + 3 * N);
3910
3911 const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
3912 const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
3913 const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
3914 const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
3915 A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
3916 B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
3917 C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
3918 D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
3919}
3920
3921} // namespace detail
3922
3923// ------------------------------ StoreInterleaved2
3924
3925// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
3926
3927namespace detail {
3928
3929// Input (128-bit blocks):
3930// 6 4 2 0 (LSB of i)
3931// 7 5 3 1
3932// Output:
3933// 3 2 1 0
3934// 7 6 5 4
3935template <typename T>
3936HWY_API void StoreTransposedBlocks2(const Vec512<T> i, const Vec512<T> j,
3937 const Full512<T> d,
3938 T* HWY_RESTRICT unaligned) {
3939 constexpr size_t N = 64 / sizeof(T);
3940 const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
3941 const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
3942 const auto j1_i1_j0_i0 =
3943 detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
3944 const auto j3_i3_j2_i2 =
3945 detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
3946 StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
3947 StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);
3948}
3949
3950// Input (128-bit blocks):
3951// 9 6 3 0 (LSB of i)
3952// a 7 4 1
3953// b 8 5 2
3954// Output:
3955// 3 2 1 0
3956// 7 6 5 4
3957// b a 9 8
3958template <typename T>
3959HWY_API void StoreTransposedBlocks3(const Vec512<T> i, const Vec512<T> j,
3960 const Vec512<T> k, Full512<T> d,
3961 T* HWY_RESTRICT unaligned) {
3962 constexpr size_t N = 64 / sizeof(T);
3963 const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
3964 const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
3965 const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
3966
3967 const Vec512<T> out0 = // i1 k0 j0 i0
3968 detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
3969 const Vec512<T> out1 = // j2 i2 k1 j1
3970 detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
3971 const Vec512<T> out2 = // k3 j3 i3 k2
3972 detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
3973
3974 StoreU(out0, d, unaligned + 0 * N);
3975 StoreU(out1, d, unaligned + 1 * N);
3976 StoreU(out2, d, unaligned + 2 * N);
3977}
3978
3979// Input (128-bit blocks):
3980// c 8 4 0 (LSB of i)
3981// d 9 5 1
3982// e a 6 2
3983// f b 7 3
3984// Output:
3985// 3 2 1 0
3986// 7 6 5 4
3987// b a 9 8
3988// f e d c
3989template <typename T>
3990HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
3991 const Vec512<T> k, const Vec512<T> l,
3992 Full512<T> d, T* HWY_RESTRICT unaligned) {
3993 constexpr size_t N = 64 / sizeof(T);
3994 const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
3995 const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
3996 const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
3997 const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
3998 const Vec512<T> out0 =
3999 detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
4000 const Vec512<T> out1 =
4001 detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
4002 const Vec512<T> out2 =
4003 detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
4004 const Vec512<T> out3 =
4005 detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
4006 StoreU(out0, d, unaligned + 0 * N);
4007 StoreU(out1, d, unaligned + 1 * N);
4008 StoreU(out2, d, unaligned + 2 * N);
4009 StoreU(out3, d, unaligned + 3 * N);
4010}
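// The user-facing ops built on these helpers are StoreInterleaved2/3/4
// (generic_ops), e.g. re-interleaving planar RGBA channels into packed pixels
// (sketch; r/g/b/a are Vec512<uint8_t> channel vectors and "rgba" is a
// hypothetical destination pointer):
//   const Full512<uint8_t> d8;
//   StoreInterleaved4(r, g, b, a, d8, rgba);  // writes 4 * 64 bytes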
4011
4012} // namespace detail
4013
4014// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
4015
4016HWY_API Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
4017 const Vec512<uint64_t> b) {
4018 const DFromV<decltype(a)> du64;
4019 const RepartitionToNarrow<decltype(du64)> du32;
4020 const auto maskL = Set(du64, 0xFFFFFFFFULL);
4021 const auto a32 = BitCast(du32, a);
4022 const auto b32 = BitCast(du32, b);
4023 // Inputs for MulEven: we only need the lower 32 bits
4024 const auto aH = Shuffle2301(a32);
4025 const auto bH = Shuffle2301(b32);
4026
4027 // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
4028 // the even (lower 64 bits of every 128-bit block) results. See
4029 // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt.
4030 const auto aLbL = MulEven(a32, b32);
4031 const auto w3 = aLbL & maskL;
4032
4033 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
4034 const auto w2 = t2 & maskL;
4035 const auto w1 = ShiftRight<32>(t2);
4036
4037 const auto t = MulEven(a32, bH) + w2;
4038 const auto k = ShiftRight<32>(t);
4039
4040 const auto mulH = MulEven(aH, bH) + w1 + k;
4041 const auto mulL = ShiftLeft<32>(t) + w3;
4042 return InterleaveLower(mulL, mulH);
4043}
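// Scalar model of the per-128-bit-block computation above (a sketch for
// exposition only, not used by the library; it mirrors the aLbL/w3/t2/w2/w1/
// t/k temporaries): the 64x64 -> 128-bit product, built from 32-bit halves.
static inline void MulWide64(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT hi,
                             uint64_t* HWY_RESTRICT lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t w3 = aLbL & 0xFFFFFFFFu;
  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;
  *hi = aH * bH + w1 + k;  // upper u64 of the block result (mulH above)
  *lo = (t << 32) + w3;    // lower u64 of the block result (mulL above)
}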
4044
4045HWY_API Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
4046 const Vec512<uint64_t> b) {
4047 const DFromV<decltype(a)> du64;
4048 const RepartitionToNarrow<decltype(du64)> du32;
4049 const auto maskL = Set(du64, 0xFFFFFFFFULL);
4050 const auto a32 = BitCast(du32, a);
4051 const auto b32 = BitCast(du32, b);
4052 // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
4053 const auto aH = Shuffle2301(a32);
4054 const auto bH = Shuffle2301(b32);
4055
4056 // Same as above, but we're using the odd results (upper 64 bits per block).
4057 const auto aLbL = MulEven(a32, b32);
4058 const auto w3 = aLbL & maskL;
4059
4060 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
4061 const auto w2 = t2 & maskL;
4062 const auto w1 = ShiftRight<32>(t2);
4063
4064 const auto t = MulEven(a32, bH) + w2;
4065 const auto k = ShiftRight<32>(t);
4066
4067 const auto mulH = MulEven(aH, bH) + w1 + k;
4068 const auto mulL = ShiftLeft<32>(t) + w3;
4069 return InterleaveUpper(du64, mulL, mulH);
4070}
4071
4072// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4073
4074HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
4075 Vec512<bfloat16_t> a,
4076 Vec512<bfloat16_t> b,
4077 const Vec512<float> sum0,
4078 Vec512<float>& sum1) {
4079 // TODO(janwas): _mm512_dpbf16_ps when available
4080 const Repartition<uint16_t, decltype(df32)> du16;
4081 const RebindToUnsigned<decltype(df32)> du32;
4082 const Vec512<uint16_t> zero = Zero(du16);
4083 // Lane order within sum0/1 is undefined, hence we can avoid the
4084 // longer-latency lane-crossing PromoteTo.
4085 const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
4086 const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4087 const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
4088 const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4089 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4090 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4091}
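// Example (illustrative sketch; "pa", "pb" and "count" are hypothetical, with
// count assumed to be a multiple of the vector length): accumulating a bf16
// dot product. The undefined lane order is harmless because all lanes of
// sum0 and sum1 are summed at the end.
//   const Full512<float> df32;
//   const Repartition<bfloat16_t, Full512<float>> dbf16;
//   Vec512<float> sum0 = Zero(df32), sum1 = Zero(df32);
//   for (size_t i = 0; i < count; i += Lanes(dbf16)) {
//     sum0 = ReorderWidenMulAccumulate(df32, LoadU(dbf16, pa + i),
//                                      LoadU(dbf16, pb + i), sum0, sum1);
//   }
//   const float dot = GetLane(SumOfLanes(df32, sum0 + sum1));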
4092
4093// ------------------------------ Reductions
4094
4095// Returns the sum in each lane.
4096HWY_API Vec512<int32_t> SumOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4097 return Set(d, _mm512_reduce_add_epi32(v.raw));
4098}
4099HWY_API Vec512<int64_t> SumOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4100 return Set(d, _mm512_reduce_add_epi64(v.raw));
4101}
4102HWY_API Vec512<uint32_t> SumOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4103 return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
4104}
4105HWY_API Vec512<uint64_t> SumOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4106 return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
4107}
4108HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
4109 return Set(d, _mm512_reduce_add_ps(v.raw));
4110}
4111HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
4112 return Set(d, _mm512_reduce_add_pd(v.raw));
4113}
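// Example (illustrative sketch): the reduction result is broadcast to every
// lane, so a scalar value is obtained by extracting any one lane.
static inline float HorizontalSumF32(Vec512<float> v) {
  const Full512<float> d;
  return GetLane(SumOfLanes(d, v));
}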
4114
4115// Returns the minimum in each lane.
4116HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4117 return Set(d, _mm512_reduce_min_epi32(v.raw));
4118}
4119HWY_API Vec512<int64_t> MinOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4120 return Set(d, _mm512_reduce_min_epi64(v.raw));
4121}
4122HWY_API Vec512<uint32_t> MinOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4123 return Set(d, _mm512_reduce_min_epu32(v.raw));
4124}
4125HWY_API Vec512<uint64_t> MinOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4126 return Set(d, _mm512_reduce_min_epu64(v.raw));
4127}
4128HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
4129 return Set(d, _mm512_reduce_min_ps(v.raw));
4130}
4131HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
4132 return Set(d, _mm512_reduce_min_pd(v.raw));
4133}
4134template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4135HWY_API Vec512<T> MinOfLanes(Full512<T> d, Vec512<T> v) {
4136 const Repartition<int32_t, decltype(d)> d32;
4137 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4138 const auto odd = ShiftRight<16>(BitCast(d32, v));
4139 const auto min = MinOfLanes(d32, Min(even, odd));
4140 // Also broadcast into odd lanes.
4141 return BitCast(d, Or(min, ShiftLeft<16>(min)));
4142}
4143
4144// Returns the maximum in each lane.
4145HWY_API Vec512<int32_t> MaxOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
4146 return Set(d, _mm512_reduce_max_epi32(v.raw));
4147}
4148HWY_API Vec512<int64_t> MaxOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
4149 return Set(d, _mm512_reduce_max_epi64(v.raw));
4150}
4151HWY_API Vec512<uint32_t> MaxOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
4152 return Set(d, _mm512_reduce_max_epu32(v.raw));
4153}
4154HWY_API Vec512<uint64_t> MaxOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
4155 return Set(d, _mm512_reduce_max_epu64(v.raw));
4156}
4157HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
4158 return Set(d, _mm512_reduce_max_ps(v.raw));
4159}
4160HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
4161 return Set(d, _mm512_reduce_max_pd(v.raw));
4162}
4163template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4164HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
4165 const Repartition<int32_t, decltype(d)> d32;
4166 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
4167 const auto odd = ShiftRight<16>(BitCast(d32, v));
4168 const auto max = MaxOfLanes(d32, Max(even, odd));
4169 // Also broadcast into odd lanes.
4170 return BitCast(d, Or(max, ShiftLeft<16>(max)));
4171}
4172
4173// NOLINTNEXTLINE(google-readability-namespace-comments)
4174} // namespace HWY_NAMESPACE
4175} // namespace hwy
4176HWY_AFTER_NAMESPACE();
4177
4178// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
4179// the warning seems to be issued at the call site of intrinsics, i.e. our code.
4180HWY_DIAGNOSTICS(pop)