x86_512-inl.h
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 512-bit AVX512 vectors and operations.
17 // External include guard in highway.h - see comment there.
18 
19 // WARNING: most operations do not cross 128-bit block boundaries. In
20 // particular, "Broadcast", pack and zip behavior may be surprising.
21 
22 #include <immintrin.h> // AVX2+
23 
24 #include "hwy/base.h"
25 
26 #if HWY_IS_MSAN
27 #include <sanitizer/msan_interface.h>
28 #endif
29 
30 #if defined(_MSC_VER) && defined(__clang__)
31 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
32 // including these headers when _MSC_VER is defined, like when using clang-cl.
33 // Include these directly here.
34 // clang-format off
35 #include <smmintrin.h>
36 
37 #include <avxintrin.h>
38 #include <avx2intrin.h>
39 #include <f16cintrin.h>
40 #include <fmaintrin.h>
41 
42 #include <avx512fintrin.h>
43 #include <avx512vlintrin.h>
44 #include <avx512bwintrin.h>
45 #include <avx512dqintrin.h>
46 #include <avx512vlbwintrin.h>
47 #include <avx512vldqintrin.h>
48 #include <avx512bitalgintrin.h>
49 #include <avx512vlbitalgintrin.h>
50 #include <avx512vpopcntdqintrin.h>
51 #include <avx512vpopcntdqvlintrin.h>
52 // clang-format on
53 #endif
54 
55 #include <stddef.h>
56 #include <stdint.h>
57 
58 // For half-width vectors. Already includes base.h and shared-inl.h.
59 #include "hwy/ops/x86_256-inl.h"
60 
61 HWY_BEFORE_NAMESPACE();
62 namespace hwy {
63 namespace HWY_NAMESPACE {
64 
65 namespace detail {
66 
67 template <typename T>
68 struct Raw512 {
69  using type = __m512i;
70 };
71 template <>
72 struct Raw512<float> {
73  using type = __m512;
74 };
75 template <>
76 struct Raw512<double> {
77  using type = __m512d;
78 };
79 
80 // Template arg: sizeof(lane type)
81 template <size_t size>
82 struct RawMask512 {};
83 template <>
84 struct RawMask512<1> {
85  using type = __mmask64;
86 };
87 template <>
88 struct RawMask512<2> {
89  using type = __mmask32;
90 };
91 template <>
92 struct RawMask512<4> {
93  using type = __mmask16;
94 };
95 template <>
96 struct RawMask512<8> {
97  using type = __mmask8;
98 };
99 
100 } // namespace detail
101 
102 template <typename T>
103 class Vec512 {
104  using Raw = typename detail::Raw512<T>::type;
105 
106  public:
107  // Compound assignment. Only usable if there is a corresponding non-member
108  // binary operator overload. For example, only f32 and f64 support division.
109  HWY_INLINE Vec512& operator*=(const Vec512 other) {
110  return *this = (*this * other);
111  }
112  HWY_INLINE Vec512& operator/=(const Vec512 other) {
113  return *this = (*this / other);
114  }
115  HWY_INLINE Vec512& operator+=(const Vec512 other) {
116  return *this = (*this + other);
117  }
118  HWY_INLINE Vec512& operator-=(const Vec512 other) {
119  return *this = (*this - other);
120  }
121  HWY_INLINE Vec512& operator&=(const Vec512 other) {
122  return *this = (*this & other);
123  }
124  HWY_INLINE Vec512& operator|=(const Vec512 other) {
125  return *this = (*this | other);
126  }
127  HWY_INLINE Vec512& operator^=(const Vec512 other) {
128  return *this = (*this ^ other);
129  }
130 
131  Raw raw;
132 };
133 
134 // Mask register: one bit per lane.
135 template <typename T>
136 struct Mask512 {
137  typename detail::RawMask512<sizeof(T)>::type raw;
138 };
139 
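// Example (editorial sketch, not part of the original header): the compound
// assignment operators of Vec512 forward to the non-member overloads defined
// later in this file, so they are only usable where those exist. In user code:
//
//   const Full512<float> d;
//   Vec512<float> v = Set(d, 2.0f);
//   v += Set(d, 1.0f);  // OK: operator+ is defined for all arithmetic types
//   v /= Set(d, 4.0f);  // OK for f32/f64 only; there is no integer operator/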
140 // ------------------------------ BitCast
141 
142 namespace detail {
143 
144 HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
145 HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
146 HWY_INLINE __m512i BitCastToInteger(__m512d v) {
147  return _mm512_castpd_si512(v);
148 }
149 
150 template <typename T>
151 HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
152  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
153 }
154 
155 // Cannot rely on function overloading because return types differ.
156 template <typename T>
157 struct BitCastFromInteger512 {
158  HWY_INLINE __m512i operator()(__m512i v) { return v; }
159 };
160 template <>
161 struct BitCastFromInteger512<float> {
162  HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
163 };
164 template <>
165 struct BitCastFromInteger512<double> {
166  HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
167 };
168 
169 template <typename T>
170 HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
171  return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
172 }
173 
174 } // namespace detail
175 
176 template <typename T, typename FromT>
177 HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
178  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
179 }
180 
181 // ------------------------------ Set
182 
183 // Returns an all-zero vector.
184 template <typename T>
185 HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
186  return Vec512<T>{_mm512_setzero_si512()};
187 }
188 HWY_API Vec512<float> Zero(Full512<float> /* tag */) {
189  return Vec512<float>{_mm512_setzero_ps()};
190 }
191 HWY_API Vec512<double> Zero(Full512<double> /* tag */) {
192  return Vec512<double>{_mm512_setzero_pd()};
193 }
194 
195 // Returns a vector with all lanes set to "t".
196 HWY_API Vec512<uint8_t> Set(Full512<uint8_t> /* tag */, const uint8_t t) {
197  return Vec512<uint8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
198 }
199 HWY_API Vec512<uint16_t> Set(Full512<uint16_t> /* tag */, const uint16_t t) {
200  return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
201 }
202 HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
203  return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
204 }
205 HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
206  return Vec512<uint64_t>{
207  _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
208 }
209 HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
210  return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
211 }
212 HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
213  return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
214 }
215 HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
216  return Vec512<int32_t>{_mm512_set1_epi32(t)};
217 }
218 HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
219  return Vec512<int64_t>{
220  _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
221 }
222 HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
223  return Vec512<float>{_mm512_set1_ps(t)};
224 }
225 HWY_API Vec512<double> Set(Full512<double> /* tag */, const double t) {
226  return Vec512<double>{_mm512_set1_pd(t)};
227 }
228 
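// Example (editorial sketch): the factories above fill all lanes of a vector.
// In user code:
//
//   const Full512<int32_t> d;          // descriptor: 16 lanes of int32_t
//   const auto zeros = Zero(d);        // every lane 0
//   const auto ones = Set(d, 1);       // every lane 1
//   // Undefined(d) leaves lanes unspecified; only use it when every lane is
//   // overwritten before being read.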
229 HWY_DIAGNOSTICS(push)
230 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
231 
232 // Returns a vector with uninitialized elements.
233 template <typename T>
234 HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
235  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
236  // generate an XOR instruction.
237  return Vec512<T>{_mm512_undefined_epi32()};
238 }
239 HWY_API Vec512<float> Undefined(Full512<float> /* tag */) {
240  return Vec512<float>{_mm512_undefined_ps()};
241 }
242 HWY_API Vec512<double> Undefined(Full512<double> /* tag */) {
243  return Vec512<double>{_mm512_undefined_pd()};
244 }
245 
246 HWY_DIAGNOSTICS(pop)
247 
248 // ================================================== LOGICAL
249 
250 // ------------------------------ Not
251 
252 template <typename T>
253 HWY_API Vec512<T> Not(const Vec512<T> v) {
254  using TU = MakeUnsigned<T>;
255  const __m512i vu = BitCast(Full512<TU>(), v).raw;
256  return BitCast(Full512<T>(),
257  Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
258 }
259 
260 // ------------------------------ And
261 
262 template <typename T>
263 HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
264  return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
265 }
266 
267 HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
268  return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
269 }
270 HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) {
271  return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
272 }
273 
274 // ------------------------------ AndNot
275 
276 // Returns ~not_mask & mask.
277 template <typename T>
278 HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
279  return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
280 }
281 HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
282  const Vec512<float> mask) {
283  return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
284 }
285 HWY_API Vec512<double> AndNot(const Vec512<double> not_mask,
286  const Vec512<double> mask) {
287  return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
288 }
289 
290 // ------------------------------ Or
291 
292 template <typename T>
293 HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
294  return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
295 }
296 
297 HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
298  return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
299 }
300 HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) {
301  return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
302 }
303 
304 // ------------------------------ Xor
305 
306 template <typename T>
307 HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
308  return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
309 }
310 
311 HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
312  return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
313 }
314 HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
315  return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
316 }
317 
318 // ------------------------------ OrAnd
319 
320 template <typename T>
321 HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
322  const Full512<T> d;
323  const RebindToUnsigned<decltype(d)> du;
324  using VU = VFromD<decltype(du)>;
325  const __m512i ret = _mm512_ternarylogic_epi64(
326  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
327  return BitCast(d, VU{ret});
328 }
329 
330 // ------------------------------ IfVecThenElse
331 
332 template <typename T>
333 HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
334  const Full512<T> d;
335  const RebindToUnsigned<decltype(d)> du;
336  using VU = VFromD<decltype(du)>;
337  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
338  BitCast(du, yes).raw,
339  BitCast(du, no).raw, 0xCA)});
340 }
341 
342 // ------------------------------ Operator overloads (internal-only if float)
343 
344 template <typename T>
345 HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
346  return And(a, b);
347 }
348 
349 template <typename T>
350 HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
351  return Or(a, b);
352 }
353 
354 template <typename T>
355 HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
356  return Xor(a, b);
357 }
358 
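// Example (editorial sketch): OrAnd and IfVecThenElse map three bitwise inputs
// onto a single VPTERNLOG (the 0xF8/0xCA immediates are the truth tables),
// whereas composing And/Or/AndNot issues one instruction per operator:
//
//   const Full512<uint32_t> d;
//   const auto m = Set(d, 0xFF00FF00u);
//   const auto a = Set(d, 0x12345678u);
//   const auto b = Set(d, 0x9ABCDEF0u);
//   const auto sel1 = IfVecThenElse(m, a, b);       // one ternary-logic op
//   const auto sel2 = Or(And(m, a), AndNot(m, b));  // same bits, three ops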
359 // ------------------------------ PopulationCount
360 
361 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
362 #if HWY_TARGET == HWY_AVX3_DL
363 
364 #ifdef HWY_NATIVE_POPCNT
365 #undef HWY_NATIVE_POPCNT
366 #else
367 #define HWY_NATIVE_POPCNT
368 #endif
369 
370 namespace detail {
371 
372 template <typename T>
373 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
374  return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
375 }
376 template <typename T>
377 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
378  return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
379 }
380 template <typename T>
381 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
382  return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
383 }
384 template <typename T>
385 HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
386  return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
387 }
388 
389 } // namespace detail
390 
391 template <typename T>
392 HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
393  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
394 }
395 
396 #endif // HWY_TARGET == HWY_AVX3_DL
397 
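// Example (editorial sketch): PopulationCount is only compiled here for the
// HWY_AVX3_DL target (BITALG/VPOPCNTDQ); other targets obtain it from
// Highway's generic implementation. In user code:
//
//   const Full512<uint64_t> d;
//   const auto counts = PopulationCount(Set(d, 0xF0F0F0F0F0F0F0F0ull));
//   // every lane now holds 32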
398 // ================================================== SIGN
399 
400 // ------------------------------ CopySign
401 
402 template <typename T>
403 HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
404  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
405 
406  const Full512<T> d;
407  const auto msb = SignBit(d);
408 
409  const Rebind<MakeUnsigned<T>, decltype(d)> du;
410  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
411  // 0 0 0 | 0
412  // 0 0 1 | 0
413  // 0 1 0 | 1
414  // 0 1 1 | 1
415  // 1 0 0 | 0
416  // 1 0 1 | 1
417  // 1 1 0 | 0
418  // 1 1 1 | 1
419  // The lane size does not matter because we are not using predication.
420  const __m512i out = _mm512_ternarylogic_epi32(
421  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
422  return BitCast(d, decltype(Zero(du)){out});
423 }
424 
425 template <typename T>
426 HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) {
427  // AVX3 can also handle abs < 0, so no extra action needed.
428  return CopySign(abs, sign);
429 }
430 
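// Editorial note on the ternary-logic immediate: the imm8 of VPTERNLOG is the
// truth table itself, with the first operand as bit 2, the second as bit 1 and
// the third as bit 0 of the row index. Reading the table above from inputs
// 111 down to 000 gives 1,0,1,0,1,1,0,0 = 0b10101100 = 0xAC, i.e.
// "msb ? sign : magn" in a single instruction.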
431 // ================================================== MASK
432 
433 // ------------------------------ FirstN
434 
435 // Possibilities for constructing a bitmask of N ones:
436 // - kshift* only consider the lowest byte of the shift count, so they would
437 // not correctly handle large n.
438 // - Scalar shifts >= 64 are UB.
439 // - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
440 // we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
441 
442 #if HWY_ARCH_X86_32
443 namespace detail {
444 
445 // 32 bit mask is sufficient for lane size >= 2.
446 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
447 HWY_INLINE Mask512<T> FirstN(size_t n) {
448  Mask512<T> m;
449  const uint32_t all = ~uint32_t(0);
450  // BZHI only looks at the lower 8 bits of n!
451  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
452  return m;
453 }
454 
455 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
456 HWY_INLINE Mask512<T> FirstN(size_t n) {
457  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
458  return Mask512<T>{static_cast<__mmask64>(bits)};
459 }
460 
461 } // namespace detail
462 #endif // HWY_ARCH_X86_32
463 
464 template <typename T>
465 HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
466 #if HWY_ARCH_X86_64
467  Mask512<T> m;
468  const uint64_t all = ~uint64_t(0);
469  // BZHI only looks at the lower 8 bits of n!
470  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
471  return m;
472 #else
473  return detail::FirstN<T>(n);
474 #endif // HWY_ARCH_X86_64
475 }
476 
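// Example (editorial sketch): FirstN builds the mask for a loop remainder.
// In user code:
//
//   const Full512<int32_t> d;
//   const auto m = FirstN(d, 5);                  // lanes 0..4 true, rest false
//   const auto v = IfThenElseZero(m, Set(d, 1));  // 1,1,1,1,1,0,...,0
//
// (IfThenElseZero is defined in the next section; MaskedLoad/BlendedStore
// further below accept the same mask for bounds-safe tails.)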
477 // ------------------------------ IfThenElse
478 
479 // Returns mask ? b : a.
480 
481 namespace detail {
482 
483 // Templates for signed/unsigned integer of a particular size.
484 template <typename T>
486  const Mask512<T> mask, const Vec512<T> yes,
487  const Vec512<T> no) {
488  return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
489 }
490 template <typename T>
492  const Mask512<T> mask, const Vec512<T> yes,
493  const Vec512<T> no) {
494  return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
495 }
496 template <typename T>
498  const Mask512<T> mask, const Vec512<T> yes,
499  const Vec512<T> no) {
500  return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
501 }
502 template <typename T>
504  const Mask512<T> mask, const Vec512<T> yes,
505  const Vec512<T> no) {
506  return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
507 }
508 
509 } // namespace detail
510 
511 template <typename T>
512 HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
513  const Vec512<T> no) {
514  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
515 }
517  const Vec512<float> yes,
518  const Vec512<float> no) {
519  return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)};
520 }
522  const Vec512<double> yes,
523  const Vec512<double> no) {
524  return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)};
525 }
526 
527 namespace detail {
528 
529 template <typename T>
531  const Mask512<T> mask,
532  const Vec512<T> yes) {
533  return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
534 }
535 template <typename T>
537  const Mask512<T> mask,
538  const Vec512<T> yes) {
539  return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
540 }
541 template <typename T>
543  const Mask512<T> mask,
544  const Vec512<T> yes) {
545  return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
546 }
547 template <typename T>
549  const Mask512<T> mask,
550  const Vec512<T> yes) {
551  return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
552 }
553 
554 } // namespace detail
555 
556 template <typename T>
557 HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
558  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
559 }
561  const Vec512<float> yes) {
562  return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
563 }
565  const Vec512<double> yes) {
566  return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
567 }
568 
569 namespace detail {
570 
571 template <typename T>
573  const Mask512<T> mask, const Vec512<T> no) {
574  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
575  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
576 }
577 template <typename T>
579  const Mask512<T> mask, const Vec512<T> no) {
580  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
581 }
582 template <typename T>
584  const Mask512<T> mask, const Vec512<T> no) {
585  return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
586 }
587 template <typename T>
589  const Mask512<T> mask, const Vec512<T> no) {
590  return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
591 }
592 
593 } // namespace detail
594 
595 template <typename T>
596 HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
597  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
598 }
600  const Vec512<float> no) {
601  return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
602 }
604  const Vec512<double> no) {
605  return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
606 }
607 
608 template <typename T>
609 HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
610  static_assert(IsSigned<T>(), "Only works for signed/float");
611  // AVX3 MaskFromVec only looks at the MSB
612  return IfThenElse(MaskFromVec(v), yes, no);
613 }
614 
615 template <typename T, HWY_IF_FLOAT(T)>
616 HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
617  // AVX3 MaskFromVec only looks at the MSB
618  return IfThenZeroElse(MaskFromVec(v), v);
619 }
620 
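// Example (editorial sketch): masks from comparisons feed directly into the
// predicated selects above, e.g. a branch-free clamp of negative lanes:
//
//   const Full512<float> d;
//   const auto v = Set(d, -1.0f);
//   const auto a = IfThenZeroElse(v < Zero(d), v);  // 0.0f in every lane
//   const auto b = ZeroIfNegative(v);               // equivalent for floats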
621 // ================================================== ARITHMETIC
622 
623 // ------------------------------ Addition
624 
625 // Unsigned
627  const Vec512<uint8_t> b) {
628  return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
629 }
631  const Vec512<uint16_t> b) {
632  return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
633 }
635  const Vec512<uint32_t> b) {
636  return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
637 }
639  const Vec512<uint64_t> b) {
640  return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
641 }
642 
643 // Signed
645  const Vec512<int8_t> b) {
646  return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
647 }
649  const Vec512<int16_t> b) {
650  return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
651 }
653  const Vec512<int32_t> b) {
654  return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
655 }
657  const Vec512<int64_t> b) {
658  return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
659 }
660 
661 // Float
663  return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
664 }
666  const Vec512<double> b) {
667  return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
668 }
669 
670 // ------------------------------ Subtraction
671 
672 // Unsigned
674  const Vec512<uint8_t> b) {
675  return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
676 }
678  const Vec512<uint16_t> b) {
679  return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
680 }
682  const Vec512<uint32_t> b) {
683  return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
684 }
686  const Vec512<uint64_t> b) {
687  return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
688 }
689 
690 // Signed
692  const Vec512<int8_t> b) {
693  return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
694 }
696  const Vec512<int16_t> b) {
697  return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
698 }
700  const Vec512<int32_t> b) {
701  return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
702 }
704  const Vec512<int64_t> b) {
705  return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
706 }
707 
708 // Float
710  return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
711 }
713  const Vec512<double> b) {
714  return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
715 }
716 
717 // ------------------------------ SumsOf8
719  return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
720 }
721 
722 // ------------------------------ SaturatedAdd
723 
724 // Returns a + b clamped to the destination range.
725 
726 // Unsigned
728  const Vec512<uint8_t> b) {
729  return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
730 }
732  const Vec512<uint16_t> b) {
733  return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
734 }
735 
736 // Signed
738  const Vec512<int8_t> b) {
739  return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
740 }
742  const Vec512<int16_t> b) {
743  return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
744 }
745 
746 // ------------------------------ SaturatedSub
747 
748 // Returns a - b clamped to the destination range.
749 
750 // Unsigned
752  const Vec512<uint8_t> b) {
753  return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
754 }
756  const Vec512<uint16_t> b) {
757  return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
758 }
759 
760 // Signed
762  const Vec512<int8_t> b) {
763  return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
764 }
766  const Vec512<int16_t> b) {
767  return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
768 }
769 
770 // ------------------------------ Average
771 
772 // Returns (a + b + 1) / 2
773 
774 // Unsigned
776  const Vec512<uint8_t> b) {
777  return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
778 }
780  const Vec512<uint16_t> b) {
781  return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
782 }
783 
784 // ------------------------------ Abs (Sub)
785 
786 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
788 #if HWY_COMPILER_MSVC
789  // Workaround for incorrect codegen? (untested due to internal compiler error)
790  const auto zero = Zero(Full512<int8_t>());
791  return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
792 #else
793  return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
794 #endif
795 }
797  return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
798 }
800  return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
801 }
803  return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
804 }
805 
806 // These aren't native instructions, they also involve AND with constant.
808  return Vec512<float>{_mm512_abs_ps(v.raw)};
809 }
811  return Vec512<double>{_mm512_abs_pd(v.raw)};
812 }
813 // ------------------------------ ShiftLeft
814 
815 template <int kBits>
817  return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
818 }
819 
820 template <int kBits>
822  return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
823 }
824 
825 template <int kBits>
827  return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
828 }
829 
830 template <int kBits>
832  return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
833 }
834 
835 template <int kBits>
837  return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
838 }
839 
840 template <int kBits>
842  return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
843 }
844 
845 template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
847  const Full512<T> d8;
848  const RepartitionToWide<decltype(d8)> d16;
849  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
850  return kBits == 1
851  ? (v + v)
852  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
853 }
854 
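// Editorial note on the 8-bit ShiftLeft above: AVX-512 has no 8-bit shift, so
// lanes are shifted as 16-bit pairs and bits that crossed into the neighboring
// byte are cleared by the mask (0xFF << kBits) & 0xFF. For kBits=3 the mask is
// 0xF8, so a lane holding 0x81 yields 0x08 instead of leaking the high bits.
// kBits == 1 uses v + v, which produces the same result with a cheaper add.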
855 // ------------------------------ ShiftRight
856 
857 template <int kBits>
859  return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
860 }
861 
862 template <int kBits>
864  return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
865 }
866 
867 template <int kBits>
869  return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
870 }
871 
872 template <int kBits>
874  const Full512<uint8_t> d8;
875  // Use raw instead of BitCast to support N=1.
876  const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
877  return shifted & Set(d8, 0xFF >> kBits);
878 }
879 
880 template <int kBits>
882  return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
883 }
884 
885 template <int kBits>
887  return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
888 }
889 
890 template <int kBits>
892  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
893 }
894 
895 template <int kBits>
897  const Full512<int8_t> di;
898  const Full512<uint8_t> du;
899  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
900  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
901  return (shifted ^ shifted_sign) - shifted_sign;
902 }
903 
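// Editorial note on the i8 ShiftRight above: the lanes are shifted as
// unsigned, so the sign bit is not replicated. XOR-ing with 0x80 >> kBits and
// then subtracting that constant sign-extends the result: for int8_t(-2) and
// kBits=1, the logical shift gives 0x7F, 0x7F ^ 0x40 = 0x3F, and
// 0x3F - 0x40 = -1, which matches an arithmetic shift.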
904 // ------------------------------ RotateRight
905 
906 template <int kBits>
908  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
909  return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)};
910 }
911 
912 template <int kBits>
914  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
915  return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
916 }
917 
918 // ------------------------------ ShiftLeftSame
919 
921  const int bits) {
922  return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
923 }
925  const int bits) {
926  return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
927 }
929  const int bits) {
930  return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
931 }
932 
934  return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
935 }
936 
938  return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
939 }
940 
942  return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
943 }
944 
945 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
946 HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
947  const Full512<T> d8;
948  const RepartitionToWide<decltype(d8)> d16;
949  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
950  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
951 }
952 
953 // ------------------------------ ShiftRightSame
954 
956  const int bits) {
957  return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
958 }
960  const int bits) {
961  return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
962 }
964  const int bits) {
965  return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
966 }
967 
969  const Full512<uint8_t> d8;
970  const RepartitionToWide<decltype(d8)> d16;
971  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
972  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
973 }
974 
976  const int bits) {
977  return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
978 }
979 
981  const int bits) {
982  return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
983 }
985  const int bits) {
986  return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
987 }
988 
990  const Full512<int8_t> di;
991  const Full512<uint8_t> du;
992  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
993  const auto shifted_sign =
994  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
995  return (shifted ^ shifted_sign) - shifted_sign;
996 }
997 
998 // ------------------------------ Shl
999 
1001  const Vec512<uint16_t> bits) {
1002  return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
1003 }
1004 
1006  const Vec512<uint32_t> bits) {
1007  return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
1008 }
1009 
1011  const Vec512<uint64_t> bits) {
1012  return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
1013 }
1014 
1015 // Signed left shift is the same as unsigned.
1016 template <typename T, HWY_IF_SIGNED(T)>
1018  const Full512<T> di;
1019  const Full512<MakeUnsigned<T>> du;
1020  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
1021 }
1022 
1023 // ------------------------------ Shr
1024 
1026  const Vec512<uint16_t> bits) {
1027  return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
1028 }
1029 
1031  const Vec512<uint32_t> bits) {
1032  return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
1033 }
1034 
1036  const Vec512<uint64_t> bits) {
1037  return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
1038 }
1039 
1041  const Vec512<int16_t> bits) {
1042  return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
1043 }
1044 
1046  const Vec512<int32_t> bits) {
1047  return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
1048 }
1049 
1051  const Vec512<int64_t> bits) {
1052  return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
1053 }
1054 
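// Example (editorial sketch): unlike ShiftLeft (compile-time count) and
// ShiftLeftSame (one runtime count for all lanes), the operator<< / operator>>
// overloads above take an independent count per lane:
//
//   const Full512<uint32_t> d;
//   // Iota (from the Highway API) produces 0,1,2,...,15.
//   const auto powers = Set(d, 1u) << Iota(d, 0);  // 1,2,4,...,32768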
1055 // ------------------------------ Minimum
1056 
1057 // Unsigned
1059  return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
1060 }
1062  const Vec512<uint16_t> b) {
1063  return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
1064 }
1066  const Vec512<uint32_t> b) {
1067  return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
1068 }
1070  const Vec512<uint64_t> b) {
1071  return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
1072 }
1073 
1074 // Signed
1076  return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
1077 }
1079  return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
1080 }
1082  return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
1083 }
1085  return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
1086 }
1087 
1088 // Float
1090  return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
1091 }
1093  return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
1094 }
1095 
1096 // ------------------------------ Maximum
1097 
1098 // Unsigned
1100  return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
1101 }
1103  const Vec512<uint16_t> b) {
1104  return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
1105 }
1107  const Vec512<uint32_t> b) {
1108  return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
1109 }
1111  const Vec512<uint64_t> b) {
1112  return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
1113 }
1114 
1115 // Signed
1117  return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
1118 }
1120  return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
1121 }
1123  return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
1124 }
1126  return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
1127 }
1128 
1129 // Float
1131  return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
1132 }
1134  return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1135 }
1136 
1137 // ------------------------------ Integer multiplication
1138 
1139 // Unsigned
1141  return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1142 }
1144  return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1145 }
1146 
1147 // Signed
1149  return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
1150 }
1152  return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
1153 }
1154 
1155 // Returns the upper 16 bits of a * b in each lane.
1157  return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
1158 }
1160  return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
1161 }
1162 
1164  return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)};
1165 }
1166 
1167 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
1168 // even and the upper half into its odd neighbor lane.
1170  return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
1171 }
1173  return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
1174 }
1175 
1176 // ------------------------------ Neg (Sub)
1177 
1178 template <typename T, HWY_IF_FLOAT(T)>
1180  return Xor(v, SignBit(Full512<T>()));
1181 }
1182 
1183 template <typename T, HWY_IF_NOT_FLOAT(T)>
1184 HWY_API Vec512<T> Neg(const Vec512<T> v) {
1185  return Zero(Full512<T>()) - v;
1186 }
1187 
1188 // ------------------------------ Floating-point mul / div
1189 
1191  return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
1192 }
1194  const Vec512<double> b) {
1195  return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1196 }
1197 
1199  return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
1200 }
1202  const Vec512<double> b) {
1203  return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
1204 }
1205 
1206 // Approximate reciprocal
1208  return Vec512<float>{_mm512_rcp14_ps(v.raw)};
1209 }
1210 
1211 // Absolute value of difference.
1213  return Abs(a - b);
1214 }
1215 
1216 // ------------------------------ Floating-point multiply-add variants
1217 
1218 // Returns mul * x + add
1220  const Vec512<float> add) {
1221  return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
1222 }
1224  const Vec512<double> add) {
1225  return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
1226 }
1227 
1228 // Returns add - mul * x
1230  const Vec512<float> add) {
1231  return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
1232 }
1234  const Vec512<double> x,
1235  const Vec512<double> add) {
1236  return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
1237 }
1238 
1239 // Returns mul * x - sub
1241  const Vec512<float> sub) {
1242  return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
1243 }
1245  const Vec512<double> sub) {
1246  return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
1247 }
1248 
1249 // Returns -mul * x - sub
1251  const Vec512<float> sub) {
1252  return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1253 }
1255  const Vec512<double> x,
1256  const Vec512<double> sub) {
1257  return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
1258 }
1259 
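// Example (editorial sketch): the fused forms above round once per step, e.g.
// Horner evaluation of c2*x*x + c1*x + c0 (c0..c2 are placeholder scalars):
//
//   const Full512<float> d;
//   const auto x = Set(d, 0.5f);
//   auto acc = Set(d, c2);
//   acc = MulAdd(acc, x, Set(d, c1));  // c2*x + c1
//   acc = MulAdd(acc, x, Set(d, c0));  // (c2*x + c1)*x + c0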
1260 // ------------------------------ Floating-point square root
1261 
1262 // Full precision square root
1264  return Vec512<float>{_mm512_sqrt_ps(v.raw)};
1265 }
1267  return Vec512<double>{_mm512_sqrt_pd(v.raw)};
1268 }
1269 
1270 // Approximate reciprocal square root
1272  return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
1273 }
1274 
1275 // ------------------------------ Floating-point rounding
1276 
1277 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1278 HWY_DIAGNOSTICS(push)
1279 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1280 
1281 // Toward nearest integer, tie to even
1282 HWY_API Vec512<float> Round(const Vec512<float> v) {
1283  return Vec512<float>{_mm512_roundscale_ps(
1284  v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1285 }
1287  return Vec512<double>{_mm512_roundscale_pd(
1288  v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1289 }
1290 
1291 // Toward zero, aka truncate
1293  return Vec512<float>{
1294  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1295 }
1297  return Vec512<double>{
1298  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1299 }
1300 
1301 // Toward +infinity, aka ceiling
1303  return Vec512<float>{
1304  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1305 }
1307  return Vec512<double>{
1308  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1309 }
1310 
1311 // Toward -infinity, aka floor
1313  return Vec512<float>{
1314  _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1315 }
1317  return Vec512<double>{
1318  _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
1319 }
1320 
1321 HWY_DIAGNOSTICS(pop)
1322 
1323 // ================================================== COMPARE
1324 
1325 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1326 
1327 template <typename TFrom, typename TTo>
1329  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1330  return Mask512<TTo>{m.raw};
1331 }
1332 
1333 namespace detail {
1334 
1335 template <typename T>
1337  const Vec512<T> bit) {
1338  return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
1339 }
1340 template <typename T>
1342  const Vec512<T> bit) {
1343  return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
1344 }
1345 template <typename T>
1347  const Vec512<T> bit) {
1348  return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
1349 }
1350 template <typename T>
1352  const Vec512<T> bit) {
1353  return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
1354 }
1355 
1356 } // namespace detail
1357 
1358 template <typename T>
1360  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1361  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1362 }
1363 
1364 // ------------------------------ Equality
1365 
1366 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1368  return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
1369 }
1370 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1371 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1372  return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
1373 }
1374 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1375 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1376  return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
1377 }
1378 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1379 HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
1380  return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
1381 }
1382 
1384  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1385 }
1386 
1388  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1389 }
1390 
1391 // ------------------------------ Inequality
1392 
1393 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1395  return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
1396 }
1397 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1398 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1399  return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
1400 }
1401 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1402 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1403  return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
1404 }
1405 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1406 HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
1407  return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
1408 }
1409 
1411  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1412 }
1413 
1415  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1416 }
1417 
1418 // ------------------------------ Strict inequality
1419 
1421  return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)};
1422 }
1424  return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)};
1425 }
1427  return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)};
1428 }
1430  return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)};
1431 }
1432 
1434  return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
1435 }
1437  return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
1438 }
1440  return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
1441 }
1443  return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
1444 }
1445 
1447  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1448 }
1450  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1451 }
1452 
1453 // ------------------------------ Weak inequality
1454 
1456  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1457 }
1459  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1460 }
1461 
1462 // ------------------------------ Reversed comparisons
1463 
1464 template <typename T>
1466  return b > a;
1467 }
1468 
1469 template <typename T>
1471  return b >= a;
1472 }
1473 
1474 // ------------------------------ Mask
1475 
1476 namespace detail {
1477 
1478 template <typename T>
1480  return Mask512<T>{_mm512_movepi8_mask(v.raw)};
1481 }
1482 template <typename T>
1484  return Mask512<T>{_mm512_movepi16_mask(v.raw)};
1485 }
1486 template <typename T>
1488  return Mask512<T>{_mm512_movepi32_mask(v.raw)};
1489 }
1490 template <typename T>
1492  return Mask512<T>{_mm512_movepi64_mask(v.raw)};
1493 }
1494 
1495 } // namespace detail
1496 
1497 template <typename T>
1499  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1500 }
1501 // There do not seem to be native floating-point versions of these instructions.
1502 HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) {
1503  return Mask512<float>{MaskFromVec(BitCast(Full512<int32_t>(), v)).raw};
1504 }
1505 HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) {
1506  return Mask512<double>{MaskFromVec(BitCast(Full512<int64_t>(), v)).raw};
1507 }
1508 
1510  return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
1511 }
1513  return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
1514 }
1515 
1517  return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
1518 }
1520  return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
1521 }
1522 
1524  return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
1525 }
1527  return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
1528 }
1530  return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
1531 }
1532 
1534  return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
1535 }
1537  return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
1538 }
1540  return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
1541 }
1542 
1543 template <typename T>
1545  return VecFromMask(v);
1546 }
1547 
1548 // ------------------------------ Mask logical
1549 
1550 namespace detail {
1551 
1552 template <typename T>
1554 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1555  return Mask512<T>{_knot_mask64(m.raw)};
1556 #else
1557  return Mask512<T>{~m.raw};
1558 #endif
1559 }
1560 template <typename T>
1562 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1563  return Mask512<T>{_knot_mask32(m.raw)};
1564 #else
1565  return Mask512<T>{~m.raw};
1566 #endif
1567 }
1568 template <typename T>
1570 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1571  return Mask512<T>{_knot_mask16(m.raw)};
1572 #else
1573  return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
1574 #endif
1575 }
1576 template <typename T>
1578 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1579  return Mask512<T>{_knot_mask8(m.raw)};
1580 #else
1581  return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
1582 #endif
1583 }
1584 
1585 template <typename T>
1587  const Mask512<T> b) {
1588 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1589  return Mask512<T>{_kand_mask64(a.raw, b.raw)};
1590 #else
1591  return Mask512<T>{a.raw & b.raw};
1592 #endif
1593 }
1594 template <typename T>
1596  const Mask512<T> b) {
1597 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1598  return Mask512<T>{_kand_mask32(a.raw, b.raw)};
1599 #else
1600  return Mask512<T>{a.raw & b.raw};
1601 #endif
1602 }
1603 template <typename T>
1605  const Mask512<T> b) {
1606 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1607  return Mask512<T>{_kand_mask16(a.raw, b.raw)};
1608 #else
1609  return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
1610 #endif
1611 }
1612 template <typename T>
1614  const Mask512<T> b) {
1615 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1616  return Mask512<T>{_kand_mask8(a.raw, b.raw)};
1617 #else
1618  return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
1619 #endif
1620 }
1621 
1622 template <typename T>
1624  const Mask512<T> b) {
1625 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1626  return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
1627 #else
1628  return Mask512<T>{~a.raw & b.raw};
1629 #endif
1630 }
1631 template <typename T>
1633  const Mask512<T> b) {
1634 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1635  return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
1636 #else
1637  return Mask512<T>{~a.raw & b.raw};
1638 #endif
1639 }
1640 template <typename T>
1642  const Mask512<T> b) {
1643 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1644  return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
1645 #else
1646  return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
1647 #endif
1648 }
1649 template <typename T>
1651  const Mask512<T> b) {
1652 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1653  return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
1654 #else
1655  return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
1656 #endif
1657 }
1658 
1659 template <typename T>
1661  const Mask512<T> b) {
1662 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1663  return Mask512<T>{_kor_mask64(a.raw, b.raw)};
1664 #else
1665  return Mask512<T>{a.raw | b.raw};
1666 #endif
1667 }
1668 template <typename T>
1670  const Mask512<T> b) {
1671 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1672  return Mask512<T>{_kor_mask32(a.raw, b.raw)};
1673 #else
1674  return Mask512<T>{a.raw | b.raw};
1675 #endif
1676 }
1677 template <typename T>
1679  const Mask512<T> b) {
1680 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1681  return Mask512<T>{_kor_mask16(a.raw, b.raw)};
1682 #else
1683  return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
1684 #endif
1685 }
1686 template <typename T>
1688  const Mask512<T> b) {
1689 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1690  return Mask512<T>{_kor_mask8(a.raw, b.raw)};
1691 #else
1692  return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
1693 #endif
1694 }
1695 
1696 template <typename T>
1698  const Mask512<T> b) {
1699 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1700  return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
1701 #else
1702  return Mask512<T>{a.raw ^ b.raw};
1703 #endif
1704 }
1705 template <typename T>
1707  const Mask512<T> b) {
1708 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1709  return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
1710 #else
1711  return Mask512<T>{a.raw ^ b.raw};
1712 #endif
1713 }
1714 template <typename T>
1716  const Mask512<T> b) {
1717 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1718  return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
1719 #else
1720  return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
1721 #endif
1722 }
1723 template <typename T>
1725  const Mask512<T> b) {
1726 #if HWY_COMPILER_HAS_MASK_INTRINSICS
1727  return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
1728 #else
1729  return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
1730 #endif
1731 }
1732 
1733 } // namespace detail
1734 
1735 template <typename T>
1737  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
1738 }
1739 
1740 template <typename T>
1742  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1743 }
1744 
1745 template <typename T>
1747  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1748 }
1749 
1750 template <typename T>
1752  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1753 }
1754 
1755 template <typename T>
1757  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1758 }
1759 
1760 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1761 
1763  return VecFromMask(v < Zero(Full512<int8_t>()));
1764 }
1765 
1767  return ShiftRight<15>(v);
1768 }
1769 
1771  return ShiftRight<31>(v);
1772 }
1773 
1775  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
1776 }
1777 
1778 // ================================================== MEMORY
1779 
1780 // ------------------------------ Load
1781 
1782 template <typename T>
1783 HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
1784  return Vec512<T>{_mm512_load_si512(aligned)};
1785 }
1787  const float* HWY_RESTRICT aligned) {
1788  return Vec512<float>{_mm512_load_ps(aligned)};
1789 }
1791  const double* HWY_RESTRICT aligned) {
1792  return Vec512<double>{_mm512_load_pd(aligned)};
1793 }
1794 
1795 template <typename T>
1797  return Vec512<T>{_mm512_loadu_si512(p)};
1798 }
1800  const float* HWY_RESTRICT p) {
1801  return Vec512<float>{_mm512_loadu_ps(p)};
1802 }
1804  const double* HWY_RESTRICT p) {
1805  return Vec512<double>{_mm512_loadu_pd(p)};
1806 }
1807 
1808 // ------------------------------ MaskedLoad
1809 
1810 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1812  const T* HWY_RESTRICT p) {
1813  return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)};
1814 }
1815 
1816 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1817 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1818  const T* HWY_RESTRICT p) {
1819  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
1820 }
1821 
1822 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1823 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1824  const T* HWY_RESTRICT p) {
1825  return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
1826 }
1827 
1828 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1829 HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
1830  const T* HWY_RESTRICT p) {
1831  return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
1832 }
1833 
1834 HWY_API Vec512<float> MaskedLoad(Mask512<float> m, Full512<float> /* tag */,
1835  const float* HWY_RESTRICT p) {
1836  return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
1837 }
1838 
1839 HWY_API Vec512<double> MaskedLoad(Mask512<double> m, Full512<double> /* tag */,
1840  const double* HWY_RESTRICT p) {
1841  return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
1842 }
1843 
1844 // ------------------------------ LoadDup128
1845 
1846 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
1847 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
1848 template <typename T>
1850  const T* const HWY_RESTRICT p) {
1851  // Clang 3.9 generates VINSERTF128 which is slower, but inline assembly leads
1852  // to "invalid output size for constraint" without -mavx512:
1853  // https://gcc.godbolt.org/z/-Jt_-F
1854 #if HWY_LOADDUP_ASM
1855  __m512i out;
1856  asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
1857  return Vec512<T>{out};
1858 #else
1859  const auto x4 = LoadU(Full128<T>(), p);
1860  return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
1861 #endif
1862 }
1864  const float* const HWY_RESTRICT p) {
1865 #if HWY_LOADDUP_ASM
1866  __m512 out;
1867  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
1868  return Vec512<float>{out};
1869 #else
1870  const __m128 x4 = _mm_loadu_ps(p);
1871  return Vec512<float>{_mm512_broadcast_f32x4(x4)};
1872 #endif
1873 }
1874 
1876  const double* const HWY_RESTRICT p) {
1877 #if HWY_LOADDUP_ASM
1878  __m512d out;
1879  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
1880  return Vec512<double>{out};
1881 #else
1882  const __m128d x2 = _mm_loadu_pd(p);
1883  return Vec512<double>{_mm512_broadcast_f64x2(x2)};
1884 #endif
1885 }
1886 
1887 // ------------------------------ Store
1888 
1889 template <typename T>
1890 HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
1891  T* HWY_RESTRICT aligned) {
1892  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1893 }
1895  float* HWY_RESTRICT aligned) {
1896  _mm512_store_ps(aligned, v.raw);
1897 }
1899  double* HWY_RESTRICT aligned) {
1900  _mm512_store_pd(aligned, v.raw);
1901 }
1902 
1903 template <typename T>
1904 HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
1905  T* HWY_RESTRICT p) {
1906  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
1907 }
1909  float* HWY_RESTRICT p) {
1910  _mm512_storeu_ps(p, v.raw);
1911 }
1913  double* HWY_RESTRICT p) {
1914  _mm512_storeu_pd(p, v.raw);
1915 }
1916 
1917 // ------------------------------ BlendedStore
1918 
1919 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1920 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1921  T* HWY_RESTRICT p) {
1922  _mm512_mask_storeu_epi8(p, m.raw, v.raw);
1923 }
1924 
1925 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1926 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1927  T* HWY_RESTRICT p) {
1928  _mm512_mask_storeu_epi16(p, m.raw, v.raw);
1929 }
1930 
1931 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1932 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1933  T* HWY_RESTRICT p) {
1934  _mm512_mask_storeu_epi32(p, m.raw, v.raw);
1935 }
1936 
1937 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
1938 HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
1939  T* HWY_RESTRICT p) {
1940  _mm512_mask_storeu_epi64(p, m.raw, v.raw);
1941 }
1942 
1943 HWY_API void BlendedStore(Vec512<float> v, Mask512<float> m,
1944  Full512<float> /* tag */, float* HWY_RESTRICT p) {
1945  _mm512_mask_storeu_ps(p, m.raw, v.raw);
1946 }
1947 
1948 HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m,
1949  Full512<double> /* tag */, double* HWY_RESTRICT p) {
1950  _mm512_mask_storeu_pd(p, m.raw, v.raw);
1951 }
1952 
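// Example (editorial sketch, function name hypothetical): a remainder loop
// combining FirstN, MaskedLoad and BlendedStore so that no element beyond
// `count` is read or written:
//
//   void PlusOne(const int32_t* HWY_RESTRICT in, int32_t* HWY_RESTRICT out,
//                size_t count) {
//     const Full512<int32_t> d;
//     const size_t N = Lanes(d);
//     size_t i = 0;
//     for (; i + N <= count; i += N) {
//       StoreU(LoadU(d, in + i) + Set(d, 1), d, out + i);
//     }
//     if (i != count) {
//       const auto m = FirstN(d, count - i);
//       BlendedStore(MaskedLoad(m, d, in + i) + Set(d, 1), m, d, out + i);
//     }
//   }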
1953 // ------------------------------ Non-temporal stores
1954 
1955 template <typename T>
1956 HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
1957  T* HWY_RESTRICT aligned) {
1958  _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
1959 }
1961  float* HWY_RESTRICT aligned) {
1962  _mm512_stream_ps(aligned, v.raw);
1963 }
1965  double* HWY_RESTRICT aligned) {
1966  _mm512_stream_pd(aligned, v.raw);
1967 }
1968 
1969 // ------------------------------ Scatter
1970 
1971 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1972 HWY_DIAGNOSTICS(push)
1973 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1974 
1975 namespace detail {
1976 
1977 template <typename T>
1979  Full512<T> /* tag */, T* HWY_RESTRICT base,
1980  const Vec512<int32_t> offset) {
1981  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
1982 }
1983 template <typename T>
1985  Full512<T> /* tag */, T* HWY_RESTRICT base,
1986  const Vec512<int32_t> index) {
1987  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
1988 }
1989 
1990 template <typename T>
1992  Full512<T> /* tag */, T* HWY_RESTRICT base,
1993  const Vec512<int64_t> offset) {
1994  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
1995 }
1996 template <typename T>
1998  Full512<T> /* tag */, T* HWY_RESTRICT base,
1999  const Vec512<int64_t> index) {
2000  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
2001 }
2002 
2003 } // namespace detail
2004 
2005 template <typename T, typename Offset>
2007  const Vec512<Offset> offset) {
2008  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2009  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2010 }
2011 template <typename T, typename Index>
2013  const Vec512<Index> index) {
2014  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2015  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2016 }
2017 
2019  float* HWY_RESTRICT base,
2020  const Vec512<int32_t> offset) {
2021  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
2022 }
2024  float* HWY_RESTRICT base,
2025  const Vec512<int32_t> index) {
2026  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
2027 }
2028 
2030  double* HWY_RESTRICT base,
2031  const Vec512<int64_t> offset) {
2032  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
2033 }
2035  double* HWY_RESTRICT base,
2036  const Vec512<int64_t> index) {
2037  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
2038 }
2039 
2040 // ------------------------------ Gather
2041 
2042 namespace detail {
2043 
2044 template <typename T>
2046  Full512<T> /* tag */,
2047  const T* HWY_RESTRICT base,
2048  const Vec512<int32_t> offset) {
2049  return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
2050 }
2051 template <typename T>
2053  Full512<T> /* tag */,
2054  const T* HWY_RESTRICT base,
2055  const Vec512<int32_t> index) {
2056  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
2057 }
2058 
2059 template <typename T>
2061  Full512<T> /* tag */,
2062  const T* HWY_RESTRICT base,
2063  const Vec512<int64_t> offset) {
2064  return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
2065 }
2066 template <typename T>
2068  Full512<T> /* tag */,
2069  const T* HWY_RESTRICT base,
2070  const Vec512<int64_t> index) {
2071  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
2072 }
2073 
2074 } // namespace detail
2075 
2076 template <typename T, typename Offset>
2077 HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
2078  const Vec512<Offset> offset) {
2079  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2080  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
2081 }
2082 template <typename T, typename Index>
2083 HWY_API Vec512<T> GatherIndex(Full512<T> d, const T* HWY_RESTRICT base,
2084  const Vec512<Index> index) {
2085  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2086  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
2087 }
2088 
2090  const float* HWY_RESTRICT base,
2091  const Vec512<int32_t> offset) {
2092  return Vec512<float>{_mm512_i32gather_ps(offset.raw, base, 1)};
2093 }
2095  const float* HWY_RESTRICT base,
2096  const Vec512<int32_t> index) {
2097  return Vec512<float>{_mm512_i32gather_ps(index.raw, base, 4)};
2098 }
2099 
2101  const double* HWY_RESTRICT base,
2102  const Vec512<int64_t> offset) {
2103  return Vec512<double>{_mm512_i64gather_pd(offset.raw, base, 1)};
2104 }
2106  const double* HWY_RESTRICT base,
2107  const Vec512<int64_t> index) {
2108  return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
2109 }
2110 
2111 HWY_DIAGNOSTICS(pop)
2112 
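// Example (editorial sketch, array names hypothetical): GatherIndex loads one
// lane per element index; GatherOffset is the same but takes byte offsets:
//
//   const Full512<float> d;
//   const Full512<int32_t> di;
//   const auto idx = LoadU(di, indices);              // int32_t indices[16]
//   const auto gathered = GatherIndex(d, base, idx);  // lane i = base[indices[i]]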
2113 // ================================================== SWIZZLE
2114 
2115 // ------------------------------ LowerHalf
2116 
2117 template <typename T>
2119  return Vec256<T>{_mm512_castsi512_si256(v.raw)};
2120 }
2122  return Vec256<float>{_mm512_castps512_ps256(v.raw)};
2123 }
2125  return Vec256<double>{_mm512_castpd512_pd256(v.raw)};
2126 }
2127 
2128 template <typename T>
2130  return LowerHalf(Full256<T>(), v);
2131 }
2132 
2133 // ------------------------------ UpperHalf
2134 
2135 template <typename T>
2137  return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
2138 }
2140  return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)};
2141 }
2143  return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)};
2144 }
2145 
2146 // ------------------------------ GetLane (LowerHalf)
2147 template <typename T>
2149  return GetLane(LowerHalf(v));
2150 }
2151 
2152 // ------------------------------ ZeroExtendVector
2153 
2154 // Unfortunately the initial _mm512_castsi256_si512 intrinsic leaves the upper
2155 // bits undefined. Although it makes sense for them to be zero (EVEX encoded
2156 // instructions have that effect), a compiler could decide to optimize out code
2157 // that relies on this.
2158 //
2159 // The newer _mm512_zextsi256_si512 intrinsic fixes this by specifying the
2160 // zeroing, but it is not available on GCC until 10.1. For older GCC, we can
2161 // still obtain the desired code thanks to pattern recognition; note that the
2162 // expensive insert instruction is not actually generated, see
2163 // https://gcc.godbolt.org/z/1MKGaP.
2164 
2165 template <typename T>
2167 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2168  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
2169 #else
2170  return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
2171 #endif
2172 }
2173 HWY_API Vec512<float> ZeroExtendVector(Full512<float> /* tag */,
2174  Vec256<float> lo) {
2175 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2176  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
2177 #else
2178  return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
2179 #endif
2180 }
2181 HWY_API Vec512<double> ZeroExtendVector(Full512<double> /* tag */,
2182  Vec256<double> lo) {
2183 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2184  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
2185 #else
2186  return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
2187 #endif
2188 }
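// [Editorial example, not part of the original header] Illustrative use of
// ZeroExtendVector: unlike the plain cast intrinsic discussed above, the upper
// 256 bits of the result are guaranteed to be zero.
//   const Full512<int32_t> d;
//   const Full256<int32_t> dh;
//   const Vec256<int32_t> lo = Set(dh, 1);
//   const Vec512<int32_t> wide = ZeroExtendVector(d, lo);  // lanes 8..15 == 0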
2189 
2190 // ------------------------------ Combine
2191 
2192 template <typename T>
2193 HWY_API Vec512<T> Combine(Full512<T> d, Vec256<T> hi, Vec256<T> lo) {
2194  const auto lo512 = ZeroExtendVector(d, lo);
2195  return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
2196 }
2197 HWY_API Vec512<float> Combine(Full512<float> d, Vec256<float> hi,
2198  Vec256<float> lo) {
2199  const auto lo512 = ZeroExtendVector(d, lo);
2200  return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)};
2201 }
2202 HWY_API Vec512<double> Combine(Full512<double> d, Vec256<double> hi,
2203  Vec256<double> lo) {
2204  const auto lo512 = ZeroExtendVector(d, lo);
2205  return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)};
2206 }
2207 
2208 // ------------------------------ ShiftLeftBytes
2209 
2210 template <int kBytes, typename T>
2211 HWY_API Vec512<T> ShiftLeftBytes(Full512<T> /* tag */, const Vec512<T> v) {
2212  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2213  return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
2214 }
2215 
2216 template <int kBytes, typename T>
2217 HWY_API Vec512<T> ShiftLeftBytes(const Vec512<T> v) {
2218  return ShiftLeftBytes<kBytes>(Full512<T>(), v);
2219 }
2220 
2221 // ------------------------------ ShiftLeftLanes
2222 
2223 template <int kLanes, typename T>
2224 HWY_API Vec512<T> ShiftLeftLanes(Full512<T> d, const Vec512<T> v) {
2225  const Repartition<uint8_t, decltype(d)> d8;
2226  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2227 }
2228 
2229 template <int kLanes, typename T>
2230 HWY_API Vec512<T> ShiftLeftLanes(const Vec512<T> v) {
2231  return ShiftLeftLanes<kLanes>(Full512<T>(), v);
2232 }
2233 
2234 // ------------------------------ ShiftRightBytes
2235 template <int kBytes, typename T>
2236 HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) {
2237  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2238  return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
2239 }
2240 
2241 // ------------------------------ ShiftRightLanes
2242 template <int kLanes, typename T>
2243 HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) {
2244  const Repartition<uint8_t, decltype(d)> d8;
2245  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
2246 }
2247 
2248 // ------------------------------ CombineShiftRightBytes
2249 
2250 template <int kBytes, typename T, class V = Vec512<T>>
2251 HWY_API V CombineShiftRightBytes(Full512<T> d, V hi, V lo) {
2252  const Repartition<uint8_t, decltype(d)> d8;
2253  return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
2254  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
2255 }
2256 
2257 // ------------------------------ Broadcast/splat any lane
2258 
2259 // Unsigned
2260 template <int kLane>
2261 HWY_API Vec512<uint16_t> Broadcast(const Vec512<uint16_t> v) {
2262  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2263  if (kLane < 4) {
2264  const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2265  return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
2266  } else {
2267  const __m512i hi =
2268  _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2269  return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
2270  }
2271 }
2272 template <int kLane>
2273 HWY_API Vec512<uint32_t> Broadcast(const Vec512<uint32_t> v) {
2274  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2275  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2276  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2277 }
2278 template <int kLane>
2279 HWY_API Vec512<uint64_t> Broadcast(const Vec512<uint64_t> v) {
2280  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2281  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2282  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2283 }
2284 
2285 // Signed
2286 template <int kLane>
2287 HWY_API Vec512<int16_t> Broadcast(const Vec512<int16_t> v) {
2288  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2289  if (kLane < 4) {
2290  const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
2291  return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
2292  } else {
2293  const __m512i hi =
2294  _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
2295  return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
2296  }
2297 }
2298 template <int kLane>
2299 HWY_API Vec512<int32_t> Broadcast(const Vec512<int32_t> v) {
2300  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2301  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2302  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
2303 }
2304 template <int kLane>
2305 HWY_API Vec512<int64_t> Broadcast(const Vec512<int64_t> v) {
2306  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2307  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
2308  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
2309 }
2310 
2311 // Float
2312 template <int kLane>
2313 HWY_API Vec512<float> Broadcast(const Vec512<float> v) {
2314  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2315  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
2316  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
2317 }
2318 template <int kLane>
2319 HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
2320  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2321  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
2322  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
2323 }
2324 
2325 // ------------------------------ Hard-coded shuffles
2326 
2327 // Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2328 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2329 // right (the previous least-significant lane is now most-significant =>
2330 // 47650321). These could also be implemented via CombineShiftRightBytes but
2331 // the shuffle_abcd notation is more convenient.
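// [Editorial worked example] With lane values equal to their indices
// (0..15 for int32_t), Shuffle0321 yields 1,2,3,0 in lanes 0..3, 5,6,7,4 in
// lanes 4..7, and so on: each 128-bit block rotates independently, exactly as
// described above.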
2332 
2333 // Swap 32-bit halves in 64-bit halves.
2335  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2336 }
2338  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
2339 }
2341  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
2342 }
2343 
2344 // Swap 64-bit halves
2346  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2347 }
2349  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2350 }
2352  // Shorter encoding than _mm512_permute_ps.
2353  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
2354 }
2356  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2357 }
2359  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
2360 }
2362  // Shorter encoding than _mm512_permute_pd.
2363  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
2364 }
2365 
2366 // Rotate right 32 bits
2368  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2369 }
2371  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
2372 }
2374  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
2375 }
2376 // Rotate left 32 bits
2378  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2379 }
2381  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
2382 }
2384  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
2385 }
2386 
2387 // Reverse
2389  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2390 }
2392  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
2393 }
2395  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
2396 }
2397 
2398 // ------------------------------ TableLookupLanes
2399 
2400 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2401 template <typename T>
2402 struct Indices512 {
2403  __m512i raw;
2404 };
2405 
2406 template <typename T, typename TI>
2407 HWY_API Indices512<T> IndicesFromVec(Full512<T> /* tag */, Vec512<TI> vec) {
2408  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
2409 #if HWY_IS_DEBUG_BUILD
2410  const Full512<TI> di;
2411  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2412  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));
2413 #endif
2414  return Indices512<T>{vec.raw};
2415 }
2416 
2417 template <typename T, typename TI>
2418 HWY_API Indices512<T> SetTableIndices(const Full512<T> d, const TI* idx) {
2419  const Rebind<TI, decltype(d)> di;
2420  return IndicesFromVec(d, LoadU(di, idx));
2421 }
2422 
2423 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2424 HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
2425  return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
2426 }
2427 
2428 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2429 HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
2430  return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
2431 }
2432 
2433 HWY_API Vec512<float> TableLookupLanes(Vec512<float> v, Indices512<float> idx) {
2434  return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
2435 }
2436 
2437 HWY_API Vec512<double> TableLookupLanes(Vec512<double> v,
2438  Indices512<double> idx) {
2439  return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)};
2440 }
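// [Editorial example, not part of the original header] Typical use of
// SetTableIndices/TableLookupLanes: swapping adjacent 32-bit lanes across the
// whole vector (indices must be < 16 for 4-byte lanes; v is an existing
// Vec512<float>).
//   const Full512<float> d;
//   alignas(64) constexpr int32_t kIdx[16] = {1, 0, 3,  2,  5,  4,  7,  6,
//                                             9, 8, 11, 10, 13, 12, 15, 14};
//   const auto swapped = TableLookupLanes(v, SetTableIndices(d, kIdx));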
2441 
2442 // ------------------------------ Reverse
2443 
2444 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2445 HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2446  const RebindToSigned<decltype(d)> di;
2447  alignas(64) constexpr int16_t kReverse[32] = {
2448  31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
2449  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2450  const Vec512<int16_t> idx = Load(di, kReverse);
2451  return BitCast(d, Vec512<int16_t>{
2452  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2453 }
2454 
2455 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2456 HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2457  alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2458  7, 6, 5, 4, 3, 2, 1, 0};
2459  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2460 }
2461 
2462 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2463 HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
2464  alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2465  return TableLookupLanes(v, SetTableIndices(d, kReverse));
2466 }
2467 
2468 // ------------------------------ Reverse2
2469 
2470 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2472  const Full512<uint32_t> du32;
2473  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
2474 }
2475 
2476 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2477 HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2478  return Shuffle2301(v);
2479 }
2480 
2481 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2482 HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
2483  return Shuffle01(v);
2484 }
2485 
2486 // ------------------------------ Reverse4
2487 
2488 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2490  const RebindToSigned<decltype(d)> di;
2491  alignas(64) constexpr int16_t kReverse4[32] = {
2492  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
2493  19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
2494  const Vec512<int16_t> idx = Load(di, kReverse4);
2495  return BitCast(d, Vec512<int16_t>{
2496  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2497 }
2498 
2499 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2500 HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2501  return Shuffle0123(v);
2502 }
2503 
2504 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2505 HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
2506  return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2507 }
2508 HWY_API Vec512<double> Reverse4(Full512<double> /* tag */, Vec512<double> v) {
2509  return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2510 }
2511 
2512 // ------------------------------ Reverse8
2513 
2514 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
2516  const RebindToSigned<decltype(d)> di;
2517  alignas(64) constexpr int16_t kReverse8[32] = {
2518  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
2519  23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
2520  const Vec512<int16_t> idx = Load(di, kReverse8);
2521  return BitCast(d, Vec512<int16_t>{
2522  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
2523 }
2524 
2525 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2526 HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2527  const RebindToSigned<decltype(d)> di;
2528  alignas(64) constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2529  15, 14, 13, 12, 11, 10, 9, 8};
2530  const Vec512<int32_t> idx = Load(di, kReverse8);
2531  return BitCast(d, Vec512<int32_t>{
2532  _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
2533 }
2534 
2535 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2536 HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
2537  return Reverse(d, v);
2538 }
2539 
2540 // ------------------------------ InterleaveLower
2541 
2542 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2543 // the least-significant lane) and "b". To concatenate two half-width integers
2544 // into one, use ZipLower/Upper instead (also works with scalar).
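// [Editorial worked example] For 32-bit lanes, each 128-bit block of the
// result holds a0,b0,a1,b1 (lowest lane first), e.g. (Iota is defined in the
// MISC section below):
//   const Full512<uint32_t> d;
//   const auto lo = InterleaveLower(Iota(d, 0), Iota(d, 100));
//   // block 0 lanes: 0,100,1,101; block 1 lanes: 4,104,5,105; ...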
2545 
2546 HWY_API Vec512<uint8_t> InterleaveLower(const Vec512<uint8_t> a,
2547  const Vec512<uint8_t> b) {
2548  return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2549 }
2551  const Vec512<uint16_t> b) {
2552  return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2553 }
2554 HWY_API Vec512<uint32_t> InterleaveLower(const Vec512<uint32_t> a,
2555  const Vec512<uint32_t> b) {
2556  return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2557 }
2559  const Vec512<uint64_t> b) {
2560  return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2561 }
2562 
2564  const Vec512<int8_t> b) {
2565  return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
2566 }
2568  const Vec512<int16_t> b) {
2569  return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
2570 }
2572  const Vec512<int32_t> b) {
2573  return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
2574 }
2576  const Vec512<int64_t> b) {
2577  return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
2578 }
2579 
2581  const Vec512<float> b) {
2582  return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
2583 }
2585  const Vec512<double> b) {
2586  return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
2587 }
2588 
2589 // ------------------------------ InterleaveUpper
2590 
2591 // All functions inside detail lack the required D parameter.
2592 namespace detail {
2593 
2595  const Vec512<uint8_t> b) {
2596  return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2597 }
2599  const Vec512<uint16_t> b) {
2600  return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2601 }
2603  const Vec512<uint32_t> b) {
2604  return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2605 }
2607  const Vec512<uint64_t> b) {
2608  return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2609 }
2610 
2612  const Vec512<int8_t> b) {
2613  return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
2614 }
2616  const Vec512<int16_t> b) {
2617  return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
2618 }
2620  const Vec512<int32_t> b) {
2621  return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
2622 }
2624  const Vec512<int64_t> b) {
2625  return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
2626 }
2627 
2629  const Vec512<float> b) {
2630  return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)};
2631 }
2633  const Vec512<double> b) {
2634  return Vec512<double>{_mm512_unpackhi_pd(a.raw, b.raw)};
2635 }
2636 
2637 } // namespace detail
2638 
2639 template <typename T, class V = Vec512<T>>
2640 HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
2641  return detail::InterleaveUpper(a, b);
2642 }
2643 
2644 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2645 
2646 // Same as Interleave*, except that the return lanes are double-width integers;
2647 // this is necessary because the single-lane scalar cannot return two values.
2648 template <typename T, typename TW = MakeWide<T>>
2649 HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
2650  return BitCast(Full512<TW>(), InterleaveLower(a, b));
2651 }
2652 template <typename T, typename TW = MakeWide<T>>
2653 HWY_API Vec512<TW> ZipLower(Full512<TW> /* tag */, Vec512<T> a, Vec512<T> b) {
2654  return BitCast(Full512<TW>(), InterleaveLower(a, b));
2655 }
2656 
2657 template <typename T, typename TW = MakeWide<T>>
2658 HWY_API Vec512<TW> ZipUpper(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
2659  return BitCast(Full512<TW>(), InterleaveUpper(d, a, b));
2660 }
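// [Editorial example, not part of the original header] Zipping bytes into
// 16-bit lanes; on little-endian x86 the lane from "a" becomes the low byte.
//   const Full512<uint8_t> d8;
//   const RepartitionToWide<decltype(d8)> d16;
//   const Vec512<uint16_t> z = ZipLower(d16, Set(d8, 0x01), Set(d8, 0x02));
//   // every u16 lane of z is 0x0201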
2661 
2662 // ------------------------------ Concat* halves
2663 
2664 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2665 template <typename T>
2667  const Vec512<T> lo) {
2668  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2669 }
2671  const Vec512<float> hi,
2672  const Vec512<float> lo) {
2673  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
2674 }
2676  const Vec512<double> hi,
2677  const Vec512<double> lo) {
2678  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
2679 }
2680 
2681 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2682 template <typename T>
2684  const Vec512<T> lo) {
2685  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2686 }
2688  const Vec512<float> hi,
2689  const Vec512<float> lo) {
2690  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
2691 }
2693  const Vec512<double> hi,
2694  const Vec512<double> lo) {
2695  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
2696 }
2697 
2698 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2699 template <typename T>
2701  const Vec512<T> lo) {
2702  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2703 }
2705  const Vec512<float> hi,
2706  const Vec512<float> lo) {
2707  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
2708 }
2710  const Vec512<double> hi,
2711  const Vec512<double> lo) {
2712  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
2713 }
2714 
2715 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2716 template <typename T>
2717 HWY_API Vec512<T> ConcatUpperLower(Full512<T> /* tag */, const Vec512<T> hi,
2718  const Vec512<T> lo) {
2719  // There is no imm8 blend in AVX512. Use blend16 because 32-bit masks
2720  // are efficiently loaded from 32-bit regs.
2721  const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
2722  return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
2723 }
2725  const Vec512<float> hi,
2726  const Vec512<float> lo) {
2727  const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
2728  return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
2729 }
2731  const Vec512<double> hi,
2732  const Vec512<double> lo) {
2733  const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
2734  return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
2735 }
2736 
2737 // ------------------------------ ConcatOdd
2738 
2739 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2741  const RebindToUnsigned<decltype(d)> du;
2742  alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2743  17, 19, 21, 23, 25, 27, 29, 31};
2744  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2745  BitCast(du, lo).raw, Load(du, kIdx).raw,
2746  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2747 }
2748 
2750  Vec512<float> lo) {
2751  const RebindToUnsigned<decltype(d)> du;
2752  alignas(64) constexpr uint32_t kIdx[16] = {1, 3, 5, 7, 9, 11, 13, 15,
2753  17, 19, 21, 23, 25, 27, 29, 31};
2754  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2755  __mmask16{0xFFFF}, hi.raw)};
2756 }
2757 
2758 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2759 HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2760  const RebindToUnsigned<decltype(d)> du;
2761  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2762  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2763  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2764  BitCast(du, hi).raw)});
2765 }
2766 
2768  Vec512<double> lo) {
2769  const RebindToUnsigned<decltype(d)> du;
2770  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
2771  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2772  __mmask8{0xFF}, hi.raw)};
2773 }
2774 
2775 // ------------------------------ ConcatEven
2776 
2777 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2779  const RebindToUnsigned<decltype(d)> du;
2780  alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2781  16, 18, 20, 22, 24, 26, 28, 30};
2782  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
2783  BitCast(du, lo).raw, Load(du, kIdx).raw,
2784  __mmask16{0xFFFF}, BitCast(du, hi).raw)});
2785 }
2786 
2788  Vec512<float> lo) {
2789  const RebindToUnsigned<decltype(d)> du;
2790  alignas(64) constexpr uint32_t kIdx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
2791  16, 18, 20, 22, 24, 26, 28, 30};
2792  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
2793  __mmask16{0xFFFF}, hi.raw)};
2794 }
2795 
2796 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2797 HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
2798  const RebindToUnsigned<decltype(d)> du;
2799  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2800  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
2801  BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
2802  BitCast(du, hi).raw)});
2803 }
2804 
2806  Vec512<double> lo) {
2807  const RebindToUnsigned<decltype(d)> du;
2808  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
2809  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
2810  __mmask8{0xFF}, hi.raw)};
2811 }
2812 
2813 // ------------------------------ DupEven (InterleaveLower)
2814 
2815 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2817  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
2818 }
2820  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
2821 }
2822 
2823 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2824 HWY_API Vec512<T> DupEven(const Vec512<T> v) {
2825  return InterleaveLower(Full512<T>(), v, v);
2826 }
2827 
2828 // ------------------------------ DupOdd (InterleaveUpper)
2829 
2830 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
2832  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
2833 }
2835  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
2836 }
2837 
2838 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
2839 HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
2840  return InterleaveUpper(Full512<T>(), v, v);
2841 }
2842 
2843 // ------------------------------ OddEven
2844 
2845 template <typename T>
2846 HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
2847  constexpr size_t s = sizeof(T);
2848  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
2849  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
2850 }
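// [Editorial worked example] For sizeof(T) == 4 there are 16 lanes, so
// shift == 48 and 0x5555555555555555 >> 48 == 0x5555: a __mmask16 with the
// even bits set, so IfThenElse takes the even lanes from "b" and the odd
// lanes from "a".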
2851 
2852 // ------------------------------ OddEvenBlocks
2853 
2854 template <typename T>
2856  return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
2857 }
2858 
2860  return Vec512<float>{
2861  _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
2862 }
2863 
2865  return Vec512<double>{
2866  _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
2867 }
2868 
2869 // ------------------------------ SwapAdjacentBlocks
2870 
2871 template <typename T>
2873  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
2874 }
2875 
2877  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
2878 }
2879 
2881  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
2882 }
2883 
2884 // ------------------------------ ReverseBlocks
2885 
2886 template <typename T>
2888  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
2889 }
2891  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
2892 }
2894  Vec512<double> v) {
2895  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
2896 }
2897 
2898 // ------------------------------ TableLookupBytes (ZeroExtendVector)
2899 
2900 // Both full
2901 template <typename T, typename TI>
2903  return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
2904 }
2905 
2906 // Partial index vector
2907 template <typename T, typename TI, size_t NI>
2909  const Full512<TI> d512;
2910  const Half<decltype(d512)> d256;
2911  const Half<decltype(d256)> d128;
2912  // First expand to full 128, then 256, then 512.
2913  const Vec128<TI> from_full{from.raw};
2914  const auto from_512 =
2915  ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
2916  const auto tbl_full = TableLookupBytes(bytes, from_512);
2917  // Shrink to 256, then 128, then partial.
2918  return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
2919 }
2920 template <typename T, typename TI>
2922  const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
2923  return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
2924 }
2925 
2926 // Partial table vector
2927 template <typename T, size_t N, typename TI>
2929  const Full512<TI> d512;
2930  const Half<decltype(d512)> d256;
2931  const Half<decltype(d256)> d128;
2932  // First expand to full 128, then 256, then 512.
2933  const Vec128<T> bytes_full{bytes.raw};
2934  const auto bytes_512 =
2935  ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
2936  return TableLookupBytes(bytes_512, from);
2937 }
2938 template <typename T, typename TI>
2940  const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
2941  return TableLookupBytes(bytes_512, from);
2942 }
2943 
2944 // Partial both are handled by x86_128/256.
2945 
2946 // ================================================== CONVERT
2947 
2948 // ------------------------------ Promotions (part w/ narrow lanes -> full)
2949 
2950 // Unsigned: zero-extend.
2951 // Note: these have 3 cycle latency; if inputs are already split across the
2952 // 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
2953 HWY_API Vec512<uint16_t> PromoteTo(Full512<uint16_t> /* tag */,
2954  Vec256<uint8_t> v) {
2955  return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)};
2956 }
2957 HWY_API Vec512<uint32_t> PromoteTo(Full512<uint32_t> /* tag */,
2958  Vec128<uint8_t> v) {
2959  return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)};
2960 }
2962  Vec256<uint8_t> v) {
2963  return Vec512<int16_t>{_mm512_cvtepu8_epi16(v.raw)};
2964 }
2966  Vec128<uint8_t> v) {
2967  return Vec512<int32_t>{_mm512_cvtepu8_epi32(v.raw)};
2968 }
2970  Vec256<uint16_t> v) {
2971  return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)};
2972 }
2974  Vec256<uint16_t> v) {
2975  return Vec512<int32_t>{_mm512_cvtepu16_epi32(v.raw)};
2976 }
2978  Vec256<uint32_t> v) {
2979  return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)};
2980 }
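// [Editorial example, not part of the original header] Widening 16 bytes to
// sixteen 32-bit lanes with zero extension:
//   const Full512<uint32_t> d32;
//   const Full128<uint8_t> d8;
//   const Vec512<uint32_t> wide = PromoteTo(d32, Set(d8, uint8_t{200}));
//   // every lane of "wide" is 200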
2981 
2982 // Signed: replicate sign bit.
2983 // Note: these have 3 cycle latency; if inputs are already split across the
2984 // 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
2985 // signed shift would be faster.
2987  Vec256<int8_t> v) {
2988  return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)};
2989 }
2991  Vec128<int8_t> v) {
2992  return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)};
2993 }
2995  Vec256<int16_t> v) {
2996  return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)};
2997 }
2999  Vec256<int32_t> v) {
3000  return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)};
3001 }
3002 
3003 // Float
3005  const Vec256<float16_t> v) {
3006  return Vec512<float>{_mm512_cvtph_ps(v.raw)};
3007 }
3008 
3010  const Vec256<bfloat16_t> v) {
3011  const Rebind<uint16_t, decltype(df32)> du16;
3012  const RebindToSigned<decltype(df32)> di32;
3013  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
3014 }
3015 
3017  return Vec512<double>{_mm512_cvtps_pd(v.raw)};
3018 }
3019 
3021  return Vec512<double>{_mm512_cvtepi32_pd(v.raw)};
3022 }
3023 
3024 // ------------------------------ Demotions (full -> part w/ narrow lanes)
3025 
3027  const Vec512<int32_t> v) {
3028  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3029 
3030  // Compress even u64 lanes into 256 bit.
3031  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3032  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3033  const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
3034  return LowerHalf(even);
3035 }
3036 
3038  const Vec512<int32_t> v) {
3039  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3040 
3041  // Compress even u64 lanes into 256 bit.
3042  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3043  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3044  const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
3045  return LowerHalf(even);
3046 }
3047 
3049  const Vec512<int32_t> v) {
3050  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
3051  // packus treats the input as signed; we want unsigned. Clear the MSB to get
3052  // unsigned saturation to u8.
3053  const Vec512<int16_t> i16{
3054  _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
3055  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
3056 
3057  alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
3058  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3059  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
3060  return LowerHalf(LowerHalf(fixed));
3061 }
3062 
3064  const Vec512<int16_t> v) {
3065  const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
3066 
3067  // Compress even u64 lanes into 256 bit.
3068  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3069  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3070  const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3071  return LowerHalf(even);
3072 }
3073 
3075  const Vec512<int32_t> v) {
3076  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
3077  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
3078 
3079  alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
3080  0, 4, 8, 12, 0, 4, 8, 12};
3081  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
3082  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
3083  return LowerHalf(LowerHalf(fixed));
3084 }
3085 
3087  const Vec512<int16_t> v) {
3088  const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)};
3089 
3090  // Compress even u64 lanes into 256 bit.
3091  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3092  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
3093  const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
3094  return LowerHalf(even);
3095 }
3096 
3098  const Vec512<float> v) {
3099  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3100  HWY_DIAGNOSTICS(push)
3101  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3102  return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
3103  HWY_DIAGNOSTICS(pop)
3104 }
3105 
3107  const Vec512<float> v) {
3108  // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
3109  const Rebind<int32_t, decltype(dbf16)> di32;
3110  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
3111  const Rebind<uint16_t, decltype(dbf16)> du16;
3112  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
3113  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
3114 }
3115 
3118  // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
3119  const RebindToUnsigned<decltype(dbf16)> du16;
3120  const Repartition<uint32_t, decltype(dbf16)> du32;
3121  const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
3122  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
3123 }
3124 
3126  const Vec512<double> v) {
3127  return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
3128 }
3129 
3131  const Vec512<double> v) {
3132  const auto clamped = detail::ClampF64ToI32Max(Full512<double>(), v);
3133  return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)};
3134 }
3135 
3136 // For already range-limited input [0, 255].
3138  const Full512<uint32_t> d32;
3139  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
3140  // lowest 4 bytes.
3141  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
3142  ~0u};
3143  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
3144  // Gather the lowest 4 bytes of 4 128-bit blocks.
3145  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
3146  const Vec512<uint8_t> bytes{
3147  _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
3148  return LowerHalf(LowerHalf(bytes));
3149 }
3150 
3151 // ------------------------------ Convert integer <=> floating point
3152 
3154  const Vec512<int32_t> v) {
3155  return Vec512<float>{_mm512_cvtepi32_ps(v.raw)};
3156 }
3157 
3159  const Vec512<int64_t> v) {
3160  return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
3161 }
3162 
3163 // Truncates (rounds toward zero).
3164 HWY_API Vec512<int32_t> ConvertTo(Full512<int32_t> d, const Vec512<float> v) {
3165  return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
3166 }
3167 HWY_API Vec512<int64_t> ConvertTo(Full512<int64_t> di, const Vec512<double> v) {
3168  return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw));
3169 }
3170 
3171 HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
3172  const Full512<int32_t> di;
3173  return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw));
3174 }
3175 
3176 // ================================================== CRYPTO
3177 
3178 #if !defined(HWY_DISABLE_PCLMUL_AES)
3179 
3180 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3181 #ifdef HWY_NATIVE_AES
3182 #undef HWY_NATIVE_AES
3183 #else
3184 #define HWY_NATIVE_AES
3185 #endif
3186 
3187 HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
3188  Vec512<uint8_t> round_key) {
3189 #if HWY_TARGET == HWY_AVX3_DL
3190  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
3191 #else
3192  const Full512<uint8_t> d;
3193  const Half<decltype(d)> d2;
3194  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3195  AESRound(LowerHalf(state), LowerHalf(round_key)));
3196 #endif
3197 }
3198 
3199 HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
3200  Vec512<uint8_t> round_key) {
3201 #if HWY_TARGET == HWY_AVX3_DL
3202  return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
3203 #else
3204  const Full512<uint8_t> d;
3205  const Half<decltype(d)> d2;
3206  return Combine(d,
3207  AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
3208  AESLastRound(LowerHalf(state), LowerHalf(round_key)));
3209 #endif
3210 }
3211 
3212 HWY_API Vec512<uint64_t> CLMulLower(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
3213 #if HWY_TARGET == HWY_AVX3_DL
3214  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
3215 #else
3216  alignas(64) uint64_t a[8];
3217  alignas(64) uint64_t b[8];
3218  const Full512<uint64_t> d;
3219  const Full128<uint64_t> d128;
3220  Store(va, d, a);
3221  Store(vb, d, b);
3222  for (size_t i = 0; i < 8; i += 2) {
3223  const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
3224  Store(mul, d128, a + i);
3225  }
3226  return Load(d, a);
3227 #endif
3228 }
3229 
3230 HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
3231 #if HWY_TARGET == HWY_AVX3_DL
3232  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
3233 #else
3234  alignas(64) uint64_t a[8];
3235  alignas(64) uint64_t b[8];
3236  const Full512<uint64_t> d;
3237  const Full128<uint64_t> d128;
3238  Store(va, d, a);
3239  Store(vb, d, b);
3240  for (size_t i = 0; i < 8; i += 2) {
3241  const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
3242  Store(mul, d128, a + i);
3243  }
3244  return Load(d, a);
3245 #endif
3246 }
3247 
3248 #endif // HWY_DISABLE_PCLMUL_AES
3249 
3250 // ================================================== MISC
3251 
3252 // Returns a vector with lane i=[0, N) set to "first" + i.
3253 template <typename T, typename T2>
3254 Vec512<T> Iota(const Full512<T> d, const T2 first) {
3255  HWY_ALIGN T lanes[64 / sizeof(T)];
3256  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
3257  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
3258  }
3259  return Load(d, lanes);
3260 }
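// [Editorial example, not part of the original header]
//   const Full512<int32_t> d;
//   const Vec512<int32_t> ramp = Iota(d, 10);  // lanes 10, 11, ..., 25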
3261 
3262 // ------------------------------ Mask testing
3263 
3264 // Beware: the suffix indicates the number of mask bits, not lane size!
3265 
3266 namespace detail {
3267 
3268 template <typename T>
3269 HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
3270 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3271  return _kortestz_mask64_u8(mask.raw, mask.raw);
3272 #else
3273  return mask.raw == 0;
3274 #endif
3275 }
3276 template <typename T>
3277 HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
3278 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3279  return _kortestz_mask32_u8(mask.raw, mask.raw);
3280 #else
3281  return mask.raw == 0;
3282 #endif
3283 }
3284 template <typename T>
3285 HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
3286 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3287  return _kortestz_mask16_u8(mask.raw, mask.raw);
3288 #else
3289  return mask.raw == 0;
3290 #endif
3291 }
3292 template <typename T>
3293 HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
3294 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3295  return _kortestz_mask8_u8(mask.raw, mask.raw);
3296 #else
3297  return mask.raw == 0;
3298 #endif
3299 }
3300 
3301 } // namespace detail
3302 
3303 template <typename T>
3304 HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
3305  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
3306 }
3307 
3308 namespace detail {
3309 
3310 template <typename T>
3311 HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
3312 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3313  return _kortestc_mask64_u8(mask.raw, mask.raw);
3314 #else
3315  return mask.raw == 0xFFFFFFFFFFFFFFFFull;
3316 #endif
3317 }
3318 template <typename T>
3319 HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
3320 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3321  return _kortestc_mask32_u8(mask.raw, mask.raw);
3322 #else
3323  return mask.raw == 0xFFFFFFFFull;
3324 #endif
3325 }
3326 template <typename T>
3327 HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
3328 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3329  return _kortestc_mask16_u8(mask.raw, mask.raw);
3330 #else
3331  return mask.raw == 0xFFFFull;
3332 #endif
3333 }
3334 template <typename T>
3335 HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
3336 #if HWY_COMPILER_HAS_MASK_INTRINSICS
3337  return _kortestc_mask8_u8(mask.raw, mask.raw);
3338 #else
3339  return mask.raw == 0xFFull;
3340 #endif
3341 }
3342 
3343 } // namespace detail
3344 
3345 template <typename T>
3346 HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3347  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
3348 }
3349 
3350 // `p` points to at least 8 readable bytes, not all of which need be valid.
3351 template <typename T>
3352 HWY_API Mask512<T> LoadMaskBits(const Full512<T> /* tag */,
3353  const uint8_t* HWY_RESTRICT bits) {
3354  Mask512<T> mask;
3355  CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
3356  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3357  return mask;
3358 }
3359 
3360 // `p` points to at least 8 writable bytes.
3361 template <typename T>
3362 HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
3363  uint8_t* bits) {
3364  const size_t kNumBytes = 8 / sizeof(T);
3365  CopyBytes<kNumBytes>(&mask.raw, bits);
3366  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
3367  return kNumBytes;
3368 }
3369 
3370 template <typename T>
3371 HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
3372  return PopCount(static_cast<uint64_t>(mask.raw));
3373 }
3374 
3375 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
3376 HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3377  const Mask512<T> mask) {
3378  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
3379 }
3380 
3381 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
3382 HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
3383  const Mask512<T> mask) {
3384  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
3385 }
3386 
3387 // ------------------------------ Compress
3388 
3389 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3390 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
3391  return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
3392 }
3393 
3394 HWY_API Vec512<float> Compress(Vec512<float> v, Mask512<float> mask) {
3395  return Vec512<float>{_mm512_maskz_compress_ps(mask.raw, v.raw)};
3396 }
3397 
3398 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3399 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
3400  // See CompressIsPartition. u64 is faster than u32.
3401  alignas(16) constexpr uint64_t packed_array[256] = {
3402  0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
3403  0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
3404  0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
3405  0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
3406  0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
3407  0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
3408  0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
3409  0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
3410  0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
3411  0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
3412  0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
3413  0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
3414  0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
3415  0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
3416  0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
3417  0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
3418  0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
3419  0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
3420  0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
3421  0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
3422  0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
3423  0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
3424  0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
3425  0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
3426  0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
3427  0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
3428  0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
3429  0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
3430  0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
3431  0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
3432  0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
3433  0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
3434  0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
3435  0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
3436  0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
3437  0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
3438  0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
3439  0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
3440  0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
3441  0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
3442  0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
3443  0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
3444  0x10765432, 0x17654320, 0x07654321, 0x76543210};
3445 
3446  // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
3447  // _mm512_permutexvar_epi64 will ignore the upper bits.
3448  const Full512<T> d;
3449  const RebindToUnsigned<decltype(d)> du64;
3450  const auto packed = Set(du64, packed_array[mask.raw]);
3451  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
3452  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
3453  return TableLookupLanes(v, indices);
3454 }
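// [Editorial worked example] For mask.raw == 0b00000101 (lanes 0 and 2 set),
// packed_array[5] == 0x76543120; reading nibbles from the least-significant
// end gives the lane order 0,2,1,3,4,5,6,7: the selected lanes move to the
// front and the remainder stays a permutation of the rest
// (CompressIsPartition).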
3455 
3456 // 16-bit may use the 32-bit Compress and must be defined after it.
3457 //
3458 // Ignore IDE redefinition error - this is not actually defined in x86_256 if
3459 // we are including x86_512-inl.h.
3460 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3461 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
3462  const Full256<T> d;
3463  const Rebind<uint16_t, decltype(d)> du;
3464  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3465 
3466 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3467  const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
3468 #else
3469  // Promote to i32 (512-bit vector!) so we can use the native Compress.
3470  const auto vw = PromoteTo(Rebind<int32_t, decltype(d)>(), vu);
3471  const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
3472  const auto cu = DemoteTo(du, Compress(vw, mask32));
3473 #endif // HWY_TARGET == HWY_AVX3_DL
3474 
3475  return BitCast(d, cu);
3476 }
3477 
3478 // Expands to 32-bit, compresses, then concatenates the demoted halves.
3479 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3480 HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
3481  const Full512<T> d;
3482  const Rebind<uint16_t, decltype(d)> du;
3483  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3484 
3485 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3486  const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};
3487 #else
3488  const Repartition<int32_t, decltype(d)> dw;
3489  const Half<decltype(du)> duh;
3490  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3491  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3492 
3493  const uint32_t mask_bits{mask.raw};
3494  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
3495  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
3496  const auto compressed0 = Compress(promoted0, mask0);
3497  const auto compressed1 = Compress(promoted1, mask1);
3498 
3499  const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
3500  const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
3501 
3502  // Concatenate into single vector by shifting upper with writemask.
3503  const size_t num0 = CountTrue(dw, mask0);
3504  const __mmask32 m_upper = ~((1u << num0) - 1);
3505  alignas(64) uint16_t iota[64] = {
3506  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3507  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3508  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3509  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
3510  const auto idx = LoadU(du, iota + 32 - num0);
3511  const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
3512  demoted0.raw, m_upper, idx.raw, demoted1.raw)};
3513 #endif // HWY_TARGET == HWY_AVX3_DL
3514 
3515  return BitCast(d, cu);
3516 }
3517 
3518 // ------------------------------ CompressBits
3519 template <typename T>
3520 HWY_API Vec512<T> CompressBits(Vec512<T> v, const uint8_t* HWY_RESTRICT bits) {
3521  return Compress(v, LoadMaskBits(Full512<T>(), bits));
3522 }
3523 
3524 // ------------------------------ CompressStore
3525 
3526 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3527 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> d,
3528  T* HWY_RESTRICT unaligned) {
3529  const Rebind<uint16_t, decltype(d)> du;
3530  const auto vu = BitCast(du, v); // (required for float16_t inputs)
3531 
3532  const uint64_t mask_bits{mask.raw};
3533 
3534 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
3535  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
3536 #else
3537  const Repartition<int32_t, decltype(d)> dw;
3538  const Half<decltype(du)> duh;
3539  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
3540  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
3541 
3542  const uint64_t maskL = mask_bits & 0xFFFF;
3543  const uint64_t maskH = mask_bits >> 16;
3544  const Mask512<int32_t> mask0{static_cast<__mmask16>(maskL)};
3545  const Mask512<int32_t> mask1{static_cast<__mmask16>(maskH)};
3546  const auto compressed0 = Compress(promoted0, mask0);
3547  const auto compressed1 = Compress(promoted1, mask1);
3548 
3549  const Half<decltype(d)> dh;
3550  const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0));
3551  const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1));
3552 
3553  // Store 256-bit halves
3554  StoreU(demoted0, dh, unaligned);
3555  StoreU(demoted1, dh, unaligned + PopCount(maskL));
3556 #endif
3557 
3558  return PopCount(mask_bits);
3559 }
3560 
3561 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3562 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3563  T* HWY_RESTRICT unaligned) {
3564  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
3565  const size_t count = PopCount(uint64_t{mask.raw});
3566 // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
3567 #if HWY_IS_MSAN
3568  __msan_unpoison(unaligned, count * sizeof(T));
3569 #endif
3570  return count;
3571 }
3572 
3573 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3574 HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
3575  T* HWY_RESTRICT unaligned) {
3576  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
3577  const size_t count = PopCount(uint64_t{mask.raw});
3578 // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
3579 #if HWY_IS_MSAN
3580  __msan_unpoison(unaligned, count * sizeof(T));
3581 #endif
3582  return count;
3583 }
3584 
3585 HWY_API size_t CompressStore(Vec512<float> v, Mask512<float> mask,
3586  Full512<float> /* tag */,
3587  float* HWY_RESTRICT unaligned) {
3588  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
3589  const size_t count = PopCount(uint64_t{mask.raw});
3590 // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
3591 #if HWY_IS_MSAN
3592  __msan_unpoison(unaligned, count * sizeof(float));
3593 #endif
3594  return count;
3595 }
3596 
3597 HWY_API size_t CompressStore(Vec512<double> v, Mask512<double> mask,
3598  Full512<double> /* tag */,
3599  double* HWY_RESTRICT unaligned) {
3600  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
3601  const size_t count = PopCount(uint64_t{mask.raw});
3602 // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
3603 #if HWY_IS_MSAN
3604  __msan_unpoison(unaligned, count * sizeof(double));
3605 #endif
3606  return count;
3607 }
3608 
3609 // ------------------------------ CompressBlendedStore
3610 template <typename T>
3611 HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d,
3612  T* HWY_RESTRICT unaligned) {
3613  // AVX-512 already does the blending at no extra cost (latency 11,
3614  // reciprocal throughput 2 - same as compress plus store).
3615  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
3616  return CompressStore(v, m, d, unaligned);
3617  } else {
3618  const size_t count = CountTrue(d, m);
3619  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
3620 // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
3621 #if HWY_IS_MSAN
3622  __msan_unpoison(unaligned, count * sizeof(T));
3623 #endif
3624  return count;
3625  }
3626 }
3627 
3628 // ------------------------------ CompressBitsStore
3629 template <typename T>
3630 HWY_API size_t CompressBitsStore(Vec512<T> v, const uint8_t* HWY_RESTRICT bits,
3631  Full512<T> d, T* HWY_RESTRICT unaligned) {
3632  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
3633 }
3634 
3635 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
3636 // TableLookupBytes)
3637 
3638 HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
3639  const Vec512<uint8_t> c, Full512<uint8_t> d,
3640  uint8_t* HWY_RESTRICT unaligned) {
3641  const auto k5 = Set(d, 5);
3642  const auto k6 = Set(d, 6);
3643 
3644  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
3645  // 0x80 so lanes to be filled from other vectors are 0 for blending.
3646  alignas(16) static constexpr uint8_t tbl_r0[16] = {
3647  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3648  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3649  alignas(16) static constexpr uint8_t tbl_g0[16] = {
3650  0x80, 0, 0x80, 0x80, 1, 0x80, //
3651  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3652  const auto shuf_r0 = LoadDup128(d, tbl_r0);
3653  const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
3654  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
3655  const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
3656  const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
3657  const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
3658  const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00
3659 
3660  // Second vector: g10,r10, bgr[9:6], b5,g5
3661  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
3662  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
3663  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
3664  const auto r1 = TableLookupBytes(a, shuf_r1);
3665  const auto g1 = TableLookupBytes(b, shuf_g1);
3666  const auto b1 = TableLookupBytes(c, shuf_b1);
3667  const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05
3668 
3669  // Third vector: bgr[15:11], b10
3670  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
3671  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
3672  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
3673  const auto r2 = TableLookupBytes(a, shuf_r2);
3674  const auto g2 = TableLookupBytes(b, shuf_g2);
3675  const auto b2 = TableLookupBytes(c, shuf_b2);
3676  const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A
3677 
3678  // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
3679  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_PERM_DADA);
3680  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_PERM_BCAB);
3681  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_PERM_CDBC);
3682 
3683  // Alternating order, most-significant 128 bits from the second arg.
3684  const __mmask8 m = 0xCC;
3685  const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
3686  const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
3687  const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);
3688 
3689  StoreU(Vec512<uint8_t>{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00
3690  StoreU(Vec512<uint8_t>{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15
3691  StoreU(Vec512<uint8_t>{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A
3692 }
3693 
3694 // ------------------------------ StoreInterleaved4
3695 
3696 HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
3697  const Vec512<uint8_t> v1,
3698  const Vec512<uint8_t> v2,
3699  const Vec512<uint8_t> v3, Full512<uint8_t> d8,
3700  uint8_t* HWY_RESTRICT unaligned) {
3701  const RepartitionToWide<decltype(d8)> d16;
3702  const RepartitionToWide<decltype(d16)> d32;
3703  // let a,b,c,d denote v0..3.
3704  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
3705  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
3706  const auto ba8 = ZipUpper(d16, v0, v1);
3707  const auto dc8 = ZipUpper(d16, v2, v3);
3708  const auto i = ZipLower(d32, ba0, dc0).raw; // 4x128bit: d..a3 d..a0
3709  const auto j = ZipUpper(d32, ba0, dc0).raw; // 4x128bit: d..a7 d..a4
3710  const auto k = ZipLower(d32, ba8, dc8).raw; // 4x128bit: d..aB d..a8
3711  const auto l = ZipUpper(d32, ba8, dc8).raw; // 4x128bit: d..aF d..aC
3712  // 128-bit blocks were independent until now; transpose 4x4.
3713  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_PERM_BABA);
3714  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_PERM_BABA);
3715  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_PERM_DCDC);
3716  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_PERM_DCDC);
3717  constexpr _MM_PERM_ENUM k20 = _MM_PERM_CACA;
3718  constexpr _MM_PERM_ENUM k31 = _MM_PERM_DBDB;
3719  const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
3720  const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
3721  const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
3722  const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
3723  StoreU(Vec512<uint8_t>{l0_k0_j0_i0}, d8, unaligned + 0 * 64);
3724  StoreU(Vec512<uint8_t>{l1_k1_j1_i1}, d8, unaligned + 1 * 64);
3725  StoreU(Vec512<uint8_t>{l2_k2_j2_i2}, d8, unaligned + 2 * 64);
3726  StoreU(Vec512<uint8_t>{l3_k3_j3_i3}, d8, unaligned + 3 * 64);
3727 }
3728 
3729 // ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
3730 
3731 HWY_API Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
3732  const Vec512<uint64_t> b) {
3733  const DFromV<decltype(a)> du64;
3734  const RepartitionToNarrow<decltype(du64)> du32;
3735  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3736  const auto a32 = BitCast(du32, a);
3737  const auto b32 = BitCast(du32, b);
3738  // Inputs for MulEven: we only need the lower 32 bits
3739  const auto aH = Shuffle2301(a32);
3740  const auto bH = Shuffle2301(b32);
3741 
3742  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
3743  // the even (lower 64 bits of every 128-bit block) results. See
3744  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
3745  const auto aLbL = MulEven(a32, b32);
3746  const auto w3 = aLbL & maskL;
3747 
3748  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3749  const auto w2 = t2 & maskL;
3750  const auto w1 = ShiftRight<32>(t2);
3751 
3752  const auto t = MulEven(a32, bH) + w2;
3753  const auto k = ShiftRight<32>(t);
3754 
3755  const auto mulH = MulEven(aH, bH) + w1 + k;
3756  const auto mulL = ShiftLeft<32>(t) + w3;
3757  return InterleaveLower(mulL, mulH);
3758 }
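The code above is Knuth's double-word multiplication carried out lane-wise on 32-bit halves; a scalar model (illustrative only, not part of the Highway API) may make the w3/w2/w1/t/k bookkeeping easier to follow:

#include <stdint.h>

// Scalar sketch of the same 64x64 -> 128-bit multiply built from 32-bit halves.
static inline void MulWide64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t w3 = aLbL & 0xFFFFFFFFu;
  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;
  *hi = aH * bH + w1 + k;  // upper 64 bits of the product
  *lo = (t << 32) + w3;    // lower 64 bits of the product
}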
3759 
3760 HWY_API Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
3761  const Vec512<uint64_t> b) {
3762  const DFromV<decltype(a)> du64;
3763  const RepartitionToNarrow<decltype(du64)> du32;
3764  const auto maskL = Set(du64, 0xFFFFFFFFULL);
3765  const auto a32 = BitCast(du32, a);
3766  const auto b32 = BitCast(du32, b);
3767  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
3768  const auto aH = Shuffle2301(a32);
3769  const auto bH = Shuffle2301(b32);
3770 
3771  // Same as above, but we're using the odd results (upper 64 bits per block).
3772  const auto aLbL = MulEven(a32, b32);
3773  const auto w3 = aLbL & maskL;
3774 
3775  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
3776  const auto w2 = t2 & maskL;
3777  const auto w1 = ShiftRight<32>(t2);
3778 
3779  const auto t = MulEven(a32, bH) + w2;
3780  const auto k = ShiftRight<32>(t);
3781 
3782  const auto mulH = MulEven(aH, bH) + w1 + k;
3783  const auto mulL = ShiftLeft<32>(t) + w3;
3784  return InterleaveUpper(du64, mulL, mulH);
3785 }
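A usage sketch (the input pointers pa/pb are assumptions) showing how MulEven and MulOdd together yield the full 128-bit product of every u64 lane:

// Sketch: per 128-bit block, `even` holds {lo, hi} of the even lane's product
// and `odd` holds {lo, hi} of the odd lane's product.
const Full512<uint64_t> du64;
const Vec512<uint64_t> a = LoadU(du64, pa);
const Vec512<uint64_t> b = LoadU(du64, pb);
const Vec512<uint64_t> even = MulEven(a, b);
const Vec512<uint64_t> odd = MulOdd(a, b);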
3786 
3787 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3788 
3789 HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
3790  Vec512<bfloat16_t> a,
3791  Vec512<bfloat16_t> b,
3792  const Vec512<float> sum0,
3793  Vec512<float>& sum1) {
3794  // TODO(janwas): _mm512_dpbf16_ps when available
3795  const Repartition<uint16_t, decltype(df32)> du16;
3796  const RebindToUnsigned<decltype(df32)> du32;
3797  const Vec512<uint16_t> zero = Zero(du16);
3798  // Lane order within sum0/1 is undefined, hence we can avoid the
3799  // longer-latency lane-crossing PromoteTo.
3800  const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
3801  const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
3802  const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
3803  const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
3804  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
3805  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
3806 }
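A dot-product sketch built on the accumulator pair above. Because the lane order within sum0/sum1 is unspecified, the two sums are only combined and reduced after the loop; the pointer names and the length being a multiple of the vector size are assumptions.

// Sketch: bf16 dot product (inside HWY_NAMESPACE, hwy/highway.h included).
float DotBF16(const bfloat16_t* pa, const bfloat16_t* pb, size_t num) {
  const Full512<float> df32;
  const Full512<bfloat16_t> dbf16;
  Vec512<float> sum0 = Zero(df32);
  Vec512<float> sum1 = Zero(df32);
  for (size_t i = 0; i < num; i += Lanes(dbf16)) {
    const auto a = LoadU(dbf16, pa + i);
    const auto b = LoadU(dbf16, pb + i);
    sum0 = ReorderWidenMulAccumulate(df32, a, b, sum0, sum1);
  }
  return GetLane(SumOfLanes(df32, sum0 + sum1));
}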
3807 
3808 // ------------------------------ Reductions
3809 
3810 // Returns the sum in each lane.
3811 HWY_API Vec512<int32_t> SumOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
3812  return Set(d, _mm512_reduce_add_epi32(v.raw));
3813 }
3814 HWY_API Vec512<int64_t> SumOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
3815  return Set(d, _mm512_reduce_add_epi64(v.raw));
3816 }
3817 HWY_API Vec512<uint32_t> SumOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
3818  return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
3819 }
3820 HWY_API Vec512<uint64_t> SumOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
3821  return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
3822 }
3823 HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
3824  return Set(d, _mm512_reduce_add_ps(v.raw));
3825 }
3826 HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
3827  return Set(d, _mm512_reduce_add_pd(v.raw));
3828 }
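Note these reductions broadcast the result to every lane rather than returning a scalar; extracting it takes a GetLane, as in this sketch (the `data` array is an assumption):

// Sketch: horizontal sum of 16 floats.
const Full512<float> df;
const Vec512<float> v = LoadU(df, data);
const float total = GetLane(SumOfLanes(df, v));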
3829 
3830 // Returns the minimum in each lane.
3831 HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
3832  return Set(d, _mm512_reduce_min_epi32(v.raw));
3833 }
3834 HWY_API Vec512<int64_t> MinOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
3835  return Set(d, _mm512_reduce_min_epi64(v.raw));
3836 }
3837 HWY_API Vec512<uint32_t> MinOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
3838  return Set(d, _mm512_reduce_min_epu32(v.raw));
3839 }
3840 HWY_API Vec512<uint64_t> MinOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
3841  return Set(d, _mm512_reduce_min_epu64(v.raw));
3842 }
3843 HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
3844  return Set(d, _mm512_reduce_min_ps(v.raw));
3845 }
3846 HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
3847  return Set(d, _mm512_reduce_min_pd(v.raw));
3848 }
3849 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3850 HWY_API Vec512<T> MinOfLanes(Full512<T> d, Vec512<T> v) {
3851  const Repartition<int32_t, decltype(d)> d32;
3852  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
3853  const auto odd = ShiftRight<16>(BitCast(d32, v));
3854  const auto min = MinOfLanes(d32, Min(even, odd));
3855  // Also broadcast into odd lanes.
3856  return BitCast(d, Or(min, ShiftLeft<16>(min)));
3857 }
3858 
3859 // Returns the maximum in each lane.
3860 HWY_API Vec512<int32_t> MaxOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
3861  return Set(d, _mm512_reduce_max_epi32(v.raw));
3862 }
3863 HWY_API Vec512<int64_t> MaxOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
3864  return Set(d, _mm512_reduce_max_epi64(v.raw));
3865 }
3866 HWY_API Vec512<uint32_t> MaxOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
3867  return Set(d, _mm512_reduce_max_epu32(v.raw));
3868 }
3869 HWY_API Vec512<uint64_t> MaxOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
3870  return Set(d, _mm512_reduce_max_epu64(v.raw));
3871 }
3872 HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
3873  return Set(d, _mm512_reduce_max_ps(v.raw));
3874 }
3875 HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
3876  return Set(d, _mm512_reduce_max_pd(v.raw));
3877 }
3878 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
3879 HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
3880  const Repartition<int32_t, decltype(d)> d32;
3881  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
3882  const auto odd = ShiftRight<16>(BitCast(d32, v));
3883  const auto max = MaxOfLanes(d32, Max(even, odd));
3884  // Also broadcast into odd lanes.
3885  return BitCast(d, Or(max, ShiftLeft<16>(max)));
3886 }
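The 16-bit overloads behave the same way from the caller's side; a brief sketch with an assumed input array:

// Sketch: minimum of 32 uint16_t values, broadcast to all lanes and extracted.
const Full512<uint16_t> du16;
const Vec512<uint16_t> v = LoadU(du16, vals);
const uint16_t mn = GetLane(MinOfLanes(du16, v));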
3887 
3888 // NOLINTNEXTLINE(google-readability-namespace-comments)
3889 } // namespace HWY_NAMESPACE
3890 } // namespace hwy