Grok 10.0.1
x86_128-inl.h
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.

#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
#include <tmmintrin.h>  // SSSE3
#else
#include <smmintrin.h>  // SSE4
#include <wmmintrin.h>  // CLMUL
#endif
#include <stddef.h>
#include <stdint.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET <= HWY_AVX2
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif

#if HWY_TARGET <= HWY_AVX3
template <typename T>
using Full512 = Simd<T, 64 / sizeof(T), 0>;
#endif

namespace detail {

template <typename T>
struct Raw128 {
  using type = __m128i;
};
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

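// Editorial usage sketch (not part of the upstream header): the compound
// assignments above forward to the non-member operators defined later in
// this file; the hypothetical helper below is a template so that lookup of
// those operators is deferred until instantiation.
template <typename T, size_t N>
HWY_API Vec128<T, N> ExampleMulAdd(Vec128<T, N> a, Vec128<T, N> x,
                                   Vec128<T, N> y) {
  x *= a;  // forwards to operator*(Vec128, Vec128)
  x += y;  // forwards to operator+(Vec128, Vec128)
  return x;
}
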
#if HWY_TARGET <= HWY_AVX3

// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec512;

namespace detail {

// Template arg: sizeof(lane type)
template <size_t size>
struct RawMask128 {};
template <>
struct RawMask128<1> {
  using type = __mmask16;
};
template <>
struct RawMask128<2> {
  using type = __mmask8;
};
template <>
struct RawMask128<4> {
  using type = __mmask8;
};
template <>
struct RawMask128<8> {
  using type = __mmask8;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = typename detail::RawMask128<sizeof(T)>::type;

  static Mask128<T, N> FromBits(uint64_t mask_bits) {
    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
  }

  Raw raw;
};

#else  // AVX2 or below

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::type raw;
};

#endif  // HWY_TARGET <= HWY_AVX3

#if HWY_TARGET <= HWY_AVX2
// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec256;
#endif

namespace detail {

// Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
// incomplete types at this point; this is simpler than avoiding multiple
// definitions of DFromV via #if)
struct DeduceD {
  template <typename T, size_t N>
  Simd<T, N, 0> operator()(const Vec128<T, N>*) const {
    return Simd<T, N, 0>();
  }
#if HWY_TARGET <= HWY_AVX2
  template <typename T>
  Full256<T> operator()(const Vec256<T>*) const {
    return Full256<T>();
  }
#endif
#if HWY_TARGET <= HWY_AVX3
  template <typename T>
  Full512<T> operator()(const Vec512<T>*) const {
    return Full512<T>();
  }
#endif
};

// Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
template <class V>
struct DFromV_t {
  using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
};

}  // namespace detail

template <class V>
using DFromV = typename detail::DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __m128i operator()(__m128i v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
};

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
                                        Vec128<uint8_t, N * sizeof(T)> v) {
  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}

}  // namespace detail

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

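// Editorial usage sketch (not part of the upstream header): BitCast
// reinterprets the same 128 bits as another lane type, here to flip f32 sign
// bits with an integer Xor. Relies on Set/Xor declared later in this file;
// lookup is deferred because the arguments are template-dependent.
template <size_t N>
HWY_API Vec128<float, N> ExampleNegate(Simd<float, N, 0> d,
                                       Vec128<float, N> v) {
  const RebindToUnsigned<decltype(d)> du;  // Simd<uint32_t, N, 0>
  return BitCast(d, Xor(BitCast(du, v), Set(du, 0x80000000u)));
}
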
// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{_mm_setzero_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{_mm_setzero_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Zero(Simd<double, N, 0> /* tag */) {
  return Vec128<double, N>{_mm_setzero_pd()};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
  return Vec128<int64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{_mm_set1_ps(t)};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Set(Simd<double, N, 0> /* tag */, const double t) {
  return Vec128<double, N>{_mm_set1_pd(t)};
}

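// Editorial usage sketch (not part of the upstream header): a descriptor
// ("tag") selects the lane type and count, and VFromD names the matching
// vector type, so generic code can be written against any D:
template <class D>
HWY_API VFromD<D> ExampleOnes(D d) {
  return Set(d, static_cast<TFromD<D>>(1));  // broadcast 1 to every lane
}
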
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
  // generate an XOR instruction.
  return Vec128<T, N>{_mm_undefined_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Undefined(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{_mm_undefined_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> Undefined(Simd<double, N, 0> /* tag */) {
  return Vec128<double, N>{_mm_undefined_pd()};
}

HWY_DIAGNOSTICS(pop)

// ------------------------------ GetLane

// Gets the single value stored in a vector/part.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return _mm_cvtss_f32(v.raw);
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) uint64_t lanes[2];
  Store(v, Simd<uint64_t, N, 0>(), lanes);
  return lanes[0];
#else
  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
#endif
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) int64_t lanes[2];
  Store(v, Simd<int64_t, N, 0>(), lanes);
  return lanes[0];
#else
  return _mm_cvtsi128_si64(v.raw);
#endif
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  return _mm_cvtsd_f64(v.raw);
}

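// Editorial usage sketch (not part of the upstream header): GetLane extracts
// lane 0, so a Set/GetLane round trip returns the broadcast scalar.
template <typename T, size_t N>
HWY_API T ExampleFirstLane(Simd<T, N, 0> d, T t) {
  return GetLane(Set(d, t));  // == t
}
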
// ================================================== LOGICAL

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> And(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> And(const Vec128<double, N> a,
                              const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
                                const Vec128<float, N> mask) {
  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
                                 const Vec128<double, N> mask) {
  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
                            const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
                             const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
                              const Vec128<double, N> b) {
  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
}

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_TARGET <= HWY_AVX3
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
#else
  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
#endif
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
  return BitCast(d, VU{ret});
#else
  return Or(o1, Or(o2, o3));
#endif
}

// ------------------------------ OrAnd

template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}

// ------------------------------ IfVecThenElse

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
                                   BitCast(du, no).raw, 0xCA)});
#else
  return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}

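// Editorial note (not from the upstream source): the ternary-logic immediates
// used above (0x55 = Not, 0xFE = Or3, 0xF8 = OrAnd, 0xCA = IfVecThenElse)
// all follow one rule: bit ((a << 2) | (b << 1) | c) of the imm8 is the
// desired output for input bits a, b, c. A hypothetical sketch deriving the
// IfVecThenElse constant:
inline unsigned ExampleTernLogImm8() {
  unsigned imm = 0;
  for (unsigned i = 0; i < 8; ++i) {
    const bool mask = (i & 4) != 0, yes = (i & 2) != 0, no = (i & 1) != 0;
    if (mask ? yes : no) imm |= 1u << i;  // set the output bit for this input
  }
  return imm;  // == 0xCA
}
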
// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

#endif  // HWY_TARGET == HWY_AVX3_DL

// ================================================== SIGN

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Zero(DFromV<decltype(v)>()) - v;
}

// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
#if HWY_COMPILER_MSVC
  // Workaround for incorrect codegen? (reaches breakpoint)
  const auto zero = Zero(DFromV<decltype(v)>());
  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
#else
  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
}
// i64 is implemented after BroadcastSignBit.
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
  return v & BitCast(DFromV<decltype(v)>(), mask);
}
template <size_t N>
HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
  const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
  return v & BitCast(DFromV<decltype(v)>(), mask);
}

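// Editorial note (not from the upstream source): as the comment above warns,
// the two's-complement minimum has no positive counterpart, so for int8_t,
// Abs of -128 yields -128 again; all other inputs return the usual absolute
// value. Callers that may see LimitsMin() must handle it separately.
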
// ------------------------------ CopySign

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);

#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned<decltype(d)> du;
  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
  //                  0    0     0  |  0
  //                  0    0     1  |  0
  //                  0    1     0  |  1
  //                  0    1     1  |  1
  //                  1    0     0  |  0
  //                  1    0     1  |  1
  //                  1    1     0  |  0
  //                  1    1     1  |  1
  // The lane size does not matter because we are not using predication.
  const __m128i out = _mm_ternarylogic_epi32(
      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
  return BitCast(d, VFromD<decltype(du)>{out});
#else
  return Or(AndNot(msb, magn), And(msb, sign));
#endif
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
#if HWY_TARGET <= HWY_AVX3
  // AVX3 can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
#else
  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
#endif
}

// ================================================== MASK

#if HWY_TARGET <= HWY_AVX3

// ------------------------------ IfThenElse

// Returns mask ? yes : no.

namespace detail {

// Templates for signed/unsigned integer of a particular size.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
                                   Mask128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
                                    Vec128<float, N> yes, Vec128<float, N> no) {
  return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
                                     Vec128<double, N> yes,
                                     Vec128<double, N> no) {
  return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
}

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> yes) {
  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
                                        Vec128<float, N> yes) {
  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
                                         Vec128<double, N> yes) {
  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
}

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
                                       Mask128<T, N> mask, Vec128<T, N> no) {
  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
}

template <size_t N>
HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
                                        Vec128<float, N> no) {
  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
                                         Vec128<double, N> no) {
  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}

// ------------------------------ Mask logical

// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
    HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
#endif
#endif  // HWY_COMPILER_HAS_MASK_INTRINSICS

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                                const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                            const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
#endif
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
                             const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
#else
  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
#endif
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  // Flip only the valid bits.
  // TODO(janwas): use _knot intrinsics if N >= 8.
  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
}

#else  // AVX2 or below

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

#if HWY_TARGET == HWY_SSSE3

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
  return Or(And(vmask, yes), AndNot(vmask, no));
}

#else  // HWY_TARGET == HWY_SSSE3

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
                                    const Vec128<float, N> yes,
                                    const Vec128<float, N> no) {
  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
                                     const Vec128<double, N> yes,
                                     const Vec128<double, N> no) {
  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
}

#endif  // HWY_TARGET == HWY_SSSE3

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ ShiftLeft

template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
}

template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

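// Editorial worked example (not from the upstream source) for the 8-bit
// ShiftLeft above: x86 has no 8-bit shift, so each 16-bit pair is shifted and
// the bits that crossed in from the lower byte are masked off. For kBits = 3,
// the mask is (0xFF << 3) & 0xFF = 0xF8; e.g. the byte 0xB5 becomes
// (0xB5 << 3) & 0xF8 = 0xA8, matching a true per-byte shift.
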
// ------------------------------ ShiftRight

template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

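// Editorial worked example (not from the upstream source) for the i8
// ShiftRight above: (shifted ^ shifted_sign) - shifted_sign sign-extends the
// result of the unsigned shift. For kBits = 2 and lane value -1 (0xFF), the
// unsigned shift yields 0x3F and shifted_sign is 0x80 >> 2 = 0x20, so
// (0x3F ^ 0x20) - 0x20 = 0x1F - 0x20 = -1, the correct arithmetic shift.
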
// i64 is implemented after BroadcastSignBit.

// ================================================== SWIZZLE (1)

// ------------------------------ TableLookupBytes
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
                                        const Vec128<TI, NI> from) {
  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
}

// ------------------------------ TableLookupBytesOr0
// For all vector widths; x86 anyway zeroes if >= 0x80.
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  return TableLookupBytes(bytes, from);
}

// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
}
template <size_t N>
HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
}

// These are used by generic_ops-inl to implement LoadInterleaved3. As with
// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
// comes from the first argument.
namespace detail {

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const Twice<DFromV<decltype(a)>> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const Twice<DFromV<decltype(a)>> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
                                                    BitCast(df, b).raw, m)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const Twice<DFromV<decltype(a)>> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const Twice<DFromV<decltype(a)>> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
                                                    BitCast(df, b).raw, m)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const Twice<DFromV<decltype(a)>> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const Twice<DFromV<decltype(a)>> d2;
  const auto ba = Combine(d2, b, a);
  alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
                                                    BitCast(df, b).raw, m)});
}

}  // namespace detail

// Swap 64-bit halves
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
}
HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
}

// Rotate right 32 bits
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
}
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
}
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
}
// Rotate left 32 bits
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
}
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
}
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
}

// Reverse
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
}
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
}
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
}

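// Editorial note (not from the upstream source): the shuffle immediates above
// pack four 2-bit source-lane indices from most- to least-significant lane,
// so _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 is the identity. Hence 0x4E selects
// lanes 1,0,3,2 (swap 64-bit halves), 0x39 selects 0,3,2,1 (rotate right),
// 0x93 selects 2,1,0,3 (rotate left) and 0x1B selects 0,1,2,3 (reverse).
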
// ================================================== COMPARE

#if HWY_TARGET <= HWY_AVX3

// Comparisons set a mask bit to 1 if the condition is true, else 0.

template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo, 0> /*tag*/,
                                     Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask128<TTo, NTo>{m.raw};
}

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
                                 const Vec128<T, N> bit) {
  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
}

// ------------------------------ Equality

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

template <size_t N>
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

// ------------------------------ Inequality

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

// ------------------------------ Strict inequality

// Signed/float >
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
                                      Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
                                      Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
                                       Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
                                       Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
}

// ------------------------------ Weak inequality

template <size_t N>
HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
}

// ------------------------------ Mask

namespace detail {

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
                                     const Vec128<T, N> v) {
  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
}
// There do not seem to be native floating-point versions of these instructions.
template <size_t N>
HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
}
template <size_t N>
HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
}

template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
}

template <size_t N>
HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> v) {
  return VecFromMask(v);
}

#else  // AVX2 or below

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
                                   Mask128<TFrom, N> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  const Simd<TFrom, N, 0> d;
  return MaskFromVec(BitCast(Simd<TTo, N, 0>(), VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
#else
  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
#endif
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  // Same as unsigned ==; avoid duplicating the SSSE3 version.
  const DFromV<decltype(a)> d;
  RebindToUnsigned<decltype(d)> du;
  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
}

// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
                                        Vec128<uint64_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
                                       Vec128<int64_t, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

// Signed/float >
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
                                      Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
                                      Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}

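// Editorial worked example (not from the upstream source) for the unsigned
// operator> above: XOR-ing both sides with the sign-bit constant maps
// unsigned order onto signed order. For uint8_t, msb = 0x80, and comparing
// 200 with 100 becomes int8_t(200 ^ 0x80) > int8_t(100 ^ 0x80), i.e.
// 72 > -28, which is true as required.
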
1764template <size_t N>
1765HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1766 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1767}
1768template <size_t N>
1769HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1770 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1771}
1772
1773template <size_t N>
1774HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
1775 const Vec128<int64_t, N> b) {
1776#if HWY_TARGET == HWY_SSSE3
1777 // See https://stackoverflow.com/questions/65166174/:
1778 const Simd<int64_t, N, 0> d;
1779 const RepartitionToNarrow<decltype(d)> d32;
1780 const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
1781 const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
1782 // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
1783 // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
1784 const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
1785 // Duplicate upper to lower half.
1786 return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
1787#else
1788 return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
1789#endif
1790}
1791
1792// ------------------------------ Weak inequality
1793
1794template <size_t N>
1795HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1796 const Vec128<float, N> b) {
1797 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1798}
1799template <size_t N>
1800HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
1801 const Vec128<double, N> b) {
1802 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1803}
1804
1805#endif // HWY_TARGET <= HWY_AVX3
1806
1807// ------------------------------ Reversed comparisons
1808
1809template <typename T, size_t N>
1810HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1811 return b > a;
1812}
1813
1814template <typename T, size_t N>
1815HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
1816 return b >= a;
1817}
1818
1819// ------------------------------ FirstN (Iota, Lt)
1820
1821template <typename T, size_t N, HWY_IF_LE128(T, N)>
1822HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, const size_t num) {
1823#if HWY_TARGET <= HWY_AVX3
1824 (void)d;
1825 const uint64_t all = (1ull << N) - 1;
1826 // BZHI only looks at the lower 8 bits of num!
1827 const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
1828 return Mask128<T, N>::FromBits(bits);
1829#else
1830 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1831 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1832#endif
1833}
1834
1835template <class D>
1836using MFromD = decltype(FirstN(D(), 0));
1837
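// Usage sketch (illustrative only, not part of the original header): FirstN
// is the usual way to mask a loop tail; IfThenElseZero and Iota are defined
// in this file.
//   const Simd<int32_t, 4, 0> d;
//   const auto m = FirstN(d, 3);                   // lanes 0..2 active
//   const auto v = IfThenElseZero(m, Iota(d, 1));  // lanes: 1, 2, 3, 0
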
1838// ================================================== MEMORY (1)
1839
1840// Clang static analysis claims the memory immediately after a partial vector
1841// store is uninitialized, and also flags the input to partial loads (at least
1842// for loadl_pd) as "garbage". This is a false alarm because msan does not
1843// raise errors. We work around this by using CopyBytes instead of intrinsics,
1844// but only for the analyzer to avoid potentially bad code generation.
1845// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1846#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1847#if defined(__clang_analyzer__) || \
1848 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1849#define HWY_SAFE_PARTIAL_LOAD_STORE 1
1850#else
1851#define HWY_SAFE_PARTIAL_LOAD_STORE 0
1852#endif
1853#endif // HWY_SAFE_PARTIAL_LOAD_STORE
1854
1855// ------------------------------ Load
1856
1857template <typename T>
1858HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1859 return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1860}
1861HWY_API Vec128<float> Load(Full128<float> /* tag */,
1862 const float* HWY_RESTRICT aligned) {
1863 return Vec128<float>{_mm_load_ps(aligned)};
1864}
1865HWY_API Vec128<double> Load(Full128<double> /* tag */,
1866 const double* HWY_RESTRICT aligned) {
1867 return Vec128<double>{_mm_load_pd(aligned)};
1868}
1869
1870template <typename T>
1871HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
1872 return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1873}
1874HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1875 const float* HWY_RESTRICT p) {
1876 return Vec128<float>{_mm_loadu_ps(p)};
1877}
1878HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
1879 const double* HWY_RESTRICT p) {
1880 return Vec128<double>{_mm_loadu_pd(p)};
1881}
1882
1883template <typename T>
1884HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
1885#if HWY_SAFE_PARTIAL_LOAD_STORE
1886 __m128i v = _mm_setzero_si128();
1887 CopyBytes<8>(p, &v);
1888 return Vec64<T>{v};
1889#else
1890 return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
1891#endif
1892}
1893
1894HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
1895 const float* HWY_RESTRICT p) {
1896#if HWY_SAFE_PARTIAL_LOAD_STORE
1897 __m128 v = _mm_setzero_ps();
1898 CopyBytes<8>(p, &v);
1899 return Vec128<float, 2>{v};
1900#else
1901 const __m128 hi = _mm_setzero_ps();
1902 return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
1903#endif
1904}
1905
1906HWY_API Vec64<double> Load(Full64<double> /* tag */,
1907 const double* HWY_RESTRICT p) {
1908#if HWY_SAFE_PARTIAL_LOAD_STORE
1909 __m128d v = _mm_setzero_pd();
1910 CopyBytes<8>(p, &v);
1911 return Vec64<double>{v};
1912#else
1913 return Vec64<double>{_mm_load_sd(p)};
1914#endif
1915}
1916
1917HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
1918 const float* HWY_RESTRICT p) {
1919#if HWY_SAFE_PARTIAL_LOAD_STORE
1920 __m128 v = _mm_setzero_ps();
1921 CopyBytes<4>(p, &v);
1922 return Vec128<float, 1>{v};
1923#else
1924 return Vec128<float, 1>{_mm_load_ss(p)};
1925#endif
1926}
1927
1928// Any <= 32 bit except <float, 1>
1929template <typename T, size_t N, HWY_IF_LE32(T, N)>
1930HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
1931 constexpr size_t kSize = sizeof(T) * N;
1932#if HWY_SAFE_PARTIAL_LOAD_STORE
1933 __m128 v = _mm_setzero_ps();
1934 CopyBytes<kSize>(p, &v);
1935 return Vec128<T, N>{v};
1936#else
1937 int32_t bits = 0;
1938 CopyBytes<kSize>(p, &bits);
1939 return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
1940#endif
1941}
1942
1943// For < 128 bit, LoadU == Load.
1944template <typename T, size_t N, HWY_IF_LE64(T, N)>
1945HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1946 return Load(d, p);
1947}
1948
1949// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1950template <typename T, size_t N, HWY_IF_LE128(T, N)>
1951HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1952 return LoadU(d, p);
1953}
1954
1955// Returns a vector whose lane i (for i in [0, N)) is set to "first" + i.
1956template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
1957HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
1958 HWY_ALIGN T lanes[16 / sizeof(T)];
1959 for (size_t i = 0; i < 16 / sizeof(T); ++i) {
1960 lanes[i] = static_cast<T>(first + static_cast<T2>(i));
1961 }
1962 return Load(d, lanes);
1963}
1964
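// Usage sketch (illustrative only): Iota fills lanes with consecutive values.
//   const Simd<int32_t, 4, 0> d;
//   const auto v = Iota(d, 10);  // lanes: 10, 11, 12, 13
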
1965// ------------------------------ MaskedLoad
1966
1967#if HWY_TARGET <= HWY_AVX3
1968
1969template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1970HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1971 const T* HWY_RESTRICT p) {
1972 return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, p)};
1973}
1974
1975template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1976HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1977 const T* HWY_RESTRICT p) {
1978 return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
1979}
1980
1981template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1982HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1983 const T* HWY_RESTRICT p) {
1984 return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
1985}
1986
1987template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1988HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1989 const T* HWY_RESTRICT p) {
1990 return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
1991}
1992
1993template <size_t N>
1994HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m,
1995 Simd<float, N, 0> /* tag */,
1996 const float* HWY_RESTRICT p) {
1997 return Vec128<float, N>{_mm_maskz_loadu_ps(m.raw, p)};
1998}
1999
2000template <size_t N>
2001HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
2002 Simd<double, N, 0> /* tag */,
2003 const double* HWY_RESTRICT p) {
2004 return Vec128<double, N>{_mm_maskz_loadu_pd(m.raw, p)};
2005}
2006
2007#elif HWY_TARGET == HWY_AVX2
2008
2009template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2010HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2011 const T* HWY_RESTRICT p) {
2012 auto p_p = reinterpret_cast<const int*>(p); // NOLINT
2013 return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
2014}
2015
2016template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2017HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2018 const T* HWY_RESTRICT p) {
2019 auto p_p = reinterpret_cast<const long long*>(p); // NOLINT
2020 return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
2021}
2022
2023template <size_t N>
2024HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N, 0> d,
2025 const float* HWY_RESTRICT p) {
2026 const Vec128<int32_t, N> mi =
2027 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2028 return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
2029}
2030
2031template <size_t N>
2032HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N, 0> d,
2033 const double* HWY_RESTRICT p) {
2034 const Vec128<int64_t, N> mi =
2035 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2036 return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
2037}
2038
2039// There is no maskload_epi8/16, so blend instead.
2040template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2041HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2042 const T* HWY_RESTRICT p) {
2043 return IfThenElseZero(m, Load(d, p));
2044}
2045
2046#else // <= SSE4
2047
2048// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2049template <typename T, size_t N>
2050HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2051 const T* HWY_RESTRICT p) {
2052 return IfThenElseZero(m, Load(d, p));
2053}
2054
2055#endif
2056
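// Usage sketch (illustrative only): on every target above, lanes whose mask
// bit is false are zeroed.
//   const Simd<int32_t, 4, 0> d;
//   alignas(16) const int32_t in[4] = {10, 11, 12, 13};
//   const auto v = MaskedLoad(FirstN(d, 2), d, in);  // lanes: 10, 11, 0, 0
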
2057// ------------------------------ Store
2058
2059template <typename T>
2060HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
2061 _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
2062}
2063HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
2064 float* HWY_RESTRICT aligned) {
2065 _mm_store_ps(aligned, v.raw);
2066}
2067HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
2068 double* HWY_RESTRICT aligned) {
2069 _mm_store_pd(aligned, v.raw);
2070}
2071
2072template <typename T>
2073HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
2074 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
2075}
2076HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
2077 float* HWY_RESTRICT p) {
2078 _mm_storeu_ps(p, v.raw);
2079}
2080HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2081 double* HWY_RESTRICT p) {
2082 _mm_storeu_pd(p, v.raw);
2083}
2084
2085template <typename T>
2086HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
2087#if HWY_SAFE_PARTIAL_LOAD_STORE
2088 CopyBytes<8>(&v, p);
2089#else
2090 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
2091#endif
2092}
2093HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
2094 float* HWY_RESTRICT p) {
2095#if HWY_SAFE_PARTIAL_LOAD_STORE
2096 CopyBytes<8>(&v, p);
2097#else
2098 _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
2099#endif
2100}
2101HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
2102 double* HWY_RESTRICT p) {
2103#if HWY_SAFE_PARTIAL_LOAD_STORE
2104 CopyBytes<8>(&v, p);
2105#else
2106 _mm_storel_pd(p, v.raw);
2107#endif
2108}
2109
2110// Any <= 32 bit except <float, 1>
2111template <typename T, size_t N, HWY_IF_LE32(T, N)>
2112HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2113 CopyBytes<sizeof(T) * N>(&v, p);
2114}
2115HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
2116 float* HWY_RESTRICT p) {
2117#if HWY_SAFE_PARTIAL_LOAD_STORE
2118 CopyBytes<4>(&v, p);
2119#else
2120 _mm_store_ss(p, v.raw);
2121#endif
2122}
2123
2124// For < 128 bit, StoreU == Store.
2125template <typename T, size_t N, HWY_IF_LE64(T, N)>
2126HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
2127 Store(v, d, p);
2128}
2129
2130// ------------------------------ BlendedStore
2131
2132namespace detail {
2133
2134// There is no maskload_epi8/16 with which we could safely implement
2135// BlendedStore. Manual blending is also unsafe because loading a full vector
2136// that crosses the array end causes asan faults. Resort to scalar code; the
2137// caller should instead use memcpy, assuming m is FirstN(d, n).
2138template <typename T, size_t N>
2139HWY_INLINE void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2140 T* HWY_RESTRICT p) {
2141 const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
2142 using TI = TFromD<decltype(di)>;
2143 alignas(16) TI buf[N];
2144 alignas(16) TI mask[N];
2145 Store(BitCast(di, v), di, buf);
2146 Store(BitCast(di, VecFromMask(d, m)), di, mask);
2147 for (size_t i = 0; i < N; ++i) {
2148 if (mask[i]) {
2149 CopyBytes<sizeof(T)>(buf + i, p + i);
2150 }
2151 }
2152}
2153} // namespace detail
2154
2155#if HWY_TARGET <= HWY_AVX3
2156
2157template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2158HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2159 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2160 _mm_mask_storeu_epi8(p, m.raw, v.raw);
2161}
2162template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2163HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2164 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2165 _mm_mask_storeu_epi16(p, m.raw, v.raw);
2166}
2167
2168template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2169HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2170 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2171 auto pi = reinterpret_cast<int*>(p); // NOLINT
2172 _mm_mask_storeu_epi32(pi, m.raw, v.raw);
2173}
2174
2175template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2176HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2177 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2178 auto pi = reinterpret_cast<long long*>(p); // NOLINT
2179 _mm_mask_storeu_epi64(pi, m.raw, v.raw);
2180}
2181
2182template <size_t N>
2183HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2184 Simd<float, N, 0>, float* HWY_RESTRICT p) {
2185 _mm_mask_storeu_ps(p, m.raw, v.raw);
2186}
2187
2188template <size_t N>
2189HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2190 Simd<double, N, 0>, double* HWY_RESTRICT p) {
2191 _mm_mask_storeu_pd(p, m.raw, v.raw);
2192}
2193
2194#elif HWY_TARGET == HWY_AVX2
2195
2196template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
2197HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2198 T* HWY_RESTRICT p) {
2199 detail::ScalarMaskedStore(v, m, d, p);
2200}
2201
2202template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2203HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2204 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2205 // For partial vectors, avoid writing other lanes by zeroing their mask.
2206 if (N < 4) {
2207 const Full128<T> df;
2208 const Mask128<T> mf{m.raw};
2209 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2210 }
2211
2212 auto pi = reinterpret_cast<int*>(p); // NOLINT
2213 _mm_maskstore_epi32(pi, m.raw, v.raw);
2214}
2215
2216template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2217HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2218 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2219 // For partial vectors, avoid writing other lanes by zeroing their mask.
2220 if (N < 2) {
2221 const Full128<T> df;
2222 const Mask128<T> mf{m.raw};
2223 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2224 }
2225
2226 auto pi = reinterpret_cast<long long*>(p); // NOLINT
2227 _mm_maskstore_epi64(pi, m.raw, v.raw);
2228}
2229
2230template <size_t N>
2231HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2232 Simd<float, N, 0> d, float* HWY_RESTRICT p) {
2233 using T = float;
2234 // For partial vectors, avoid writing other lanes by zeroing their mask.
2235 if (N < 4) {
2236 const Full128<T> df;
2237 const Mask128<T> mf{m.raw};
2238 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2239 }
2240
2241 const Vec128<MakeSigned<T>, N> mi =
2242 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2243 _mm_maskstore_ps(p, mi.raw, v.raw);
2244}
2245
2246template <size_t N>
2247HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2248 Simd<double, N, 0> d, double* HWY_RESTRICT p) {
2249 using T = double;
2250 // For partial vectors, avoid writing other lanes by zeroing their mask.
2251 if (N < 2) {
2252 const Full128<T> df;
2253 const Mask128<T> mf{m.raw};
2254 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2255 }
2256
2257 const Vec128<MakeSigned<T>, N> mi =
2258 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2259 _mm_maskstore_pd(p, mi.raw, v.raw);
2260}
2261
2262#else // <= SSE4
2263
2264template <typename T, size_t N>
2265HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2266 T* HWY_RESTRICT p) {
2267 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2268 detail::ScalarMaskedStore(v, m, d, p);
2269}
2270
2271#endif // SSE4
2272
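// Usage sketch (illustrative only): only lanes whose mask bit is true are
// written; the other elements of `out` are left untouched.
//   const Simd<int32_t, 4, 0> d;
//   alignas(16) int32_t out[4] = {-1, -1, -1, -1};
//   BlendedStore(Iota(d, 1), FirstN(d, 2), d, out);  // out: 1, 2, -1, -1
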
2273// ================================================== ARITHMETIC
2274
2275// ------------------------------ Addition
2276
2277// Unsigned
2278template <size_t N>
2279HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
2280 const Vec128<uint8_t, N> b) {
2281 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2282}
2283template <size_t N>
2284HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
2285 const Vec128<uint16_t, N> b) {
2286 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2287}
2288template <size_t N>
2289HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
2290 const Vec128<uint32_t, N> b) {
2291 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2292}
2293template <size_t N>
2294HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
2295 const Vec128<uint64_t, N> b) {
2296 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2297}
2298
2299// Signed
2300template <size_t N>
2301HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
2302 const Vec128<int8_t, N> b) {
2303 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2304}
2305template <size_t N>
2306HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
2307 const Vec128<int16_t, N> b) {
2308 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2309}
2310template <size_t N>
2311HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
2312 const Vec128<int32_t, N> b) {
2313 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2314}
2315template <size_t N>
2316HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
2317 const Vec128<int64_t, N> b) {
2318 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2319}
2320
2321// Float
2322template <size_t N>
2323HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
2324 const Vec128<float, N> b) {
2325 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
2326}
2327template <size_t N>
2328HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
2329 const Vec128<double, N> b) {
2330 return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
2331}
2332
2333// ------------------------------ Subtraction
2334
2335// Unsigned
2336template <size_t N>
2337HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
2338 const Vec128<uint8_t, N> b) {
2339 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2340}
2341template <size_t N>
2342HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
2343 Vec128<uint16_t, N> b) {
2344 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2345}
2346template <size_t N>
2347HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
2348 const Vec128<uint32_t, N> b) {
2349 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2350}
2351template <size_t N>
2352HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
2353 const Vec128<uint64_t, N> b) {
2354 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2355}
2356
2357// Signed
2358template <size_t N>
2359HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
2360 const Vec128<int8_t, N> b) {
2361 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2362}
2363template <size_t N>
2364HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
2365 const Vec128<int16_t, N> b) {
2366 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2367}
2368template <size_t N>
2369HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
2370 const Vec128<int32_t, N> b) {
2371 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2372}
2373template <size_t N>
2374HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
2375 const Vec128<int64_t, N> b) {
2376 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2377}
2378
2379// Float
2380template <size_t N>
2381HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
2382 const Vec128<float, N> b) {
2383 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
2384}
2385template <size_t N>
2386HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
2387 const Vec128<double, N> b) {
2388 return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
2389}
2390
2391// ------------------------------ SumsOf8
2392template <size_t N>
2393HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
2394 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
2395}
2396
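// Worked example (illustrative only): each u64 lane receives the sum of its
// eight u8 lanes, courtesy of _mm_sad_epu8 against zero.
//   const Full128<uint8_t> d8;
//   const auto sums = SumsOf8(Set(d8, 1));  // two u64 lanes, each equal to 8
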
2397// ------------------------------ SaturatedAdd
2398
2399// Returns a + b clamped to the destination range.
2400
2401// Unsigned
2402template <size_t N>
2403HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
2404 const Vec128<uint8_t, N> b) {
2405 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
2406}
2407template <size_t N>
2408HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
2409 const Vec128<uint16_t, N> b) {
2410 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
2411}
2412
2413// Signed
2414template <size_t N>
2415HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
2416 const Vec128<int8_t, N> b) {
2417 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
2418}
2419template <size_t N>
2420HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
2421 const Vec128<int16_t, N> b) {
2422 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
2423}
2424
2425// ------------------------------ SaturatedSub
2426
2427// Returns a - b clamped to the destination range.
2428
2429// Unsigned
2430template <size_t N>
2431HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
2432 const Vec128<uint8_t, N> b) {
2433 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2434}
2435template <size_t N>
2436HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
2437 const Vec128<uint16_t, N> b) {
2438 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2439}
2440
2441// Signed
2442template <size_t N>
2443HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
2444 const Vec128<int8_t, N> b) {
2445 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2446}
2447template <size_t N>
2448HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
2449 const Vec128<int16_t, N> b) {
2450 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2451}
2452
2453// ------------------------------ AverageRound
2454
2455// Returns (a + b + 1) / 2
2456
2457// Unsigned
2458template <size_t N>
2459HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
2460 const Vec128<uint8_t, N> b) {
2461 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2462}
2463template <size_t N>
2464HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
2465 const Vec128<uint16_t, N> b) {
2466 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2467}
2468
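// Worked example (illustrative only): because of the +1, ties round up, e.g.
// AverageRound(Set(d, 3), Set(d, 4)) yields 4 in every lane.
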
2469// ------------------------------ Integer multiplication
2470
2471template <size_t N>
2472HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
2473 const Vec128<uint16_t, N> b) {
2474 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2475}
2476template <size_t N>
2477HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
2478 const Vec128<int16_t, N> b) {
2479 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2480}
2481
2482// Returns the upper 16 bits of a * b in each lane.
2483template <size_t N>
2484HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
2485 const Vec128<uint16_t, N> b) {
2486 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2487}
2488template <size_t N>
2489HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
2490 const Vec128<int16_t, N> b) {
2491 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2492}
2493
2494template <size_t N>
2495HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
2496 const Vec128<int16_t, N> b) {
2497 return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
2498}
2499
2500// Multiplies even lanes (0, 2, ...); the lower half of each double-wide
2501// result is placed in the even lane and the upper half in its odd neighbor.
2502template <size_t N>
2503HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
2504 const Vec128<uint32_t, N> b) {
2505 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2506}
2507
2508#if HWY_TARGET == HWY_SSSE3
2509
2510template <size_t N, HWY_IF_LE64(int32_t, N)> // N=1 or 2
2511HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2512 const Vec128<int32_t, N> b) {
2513 return Set(Simd<int64_t, (N + 1) / 2, 0>(),
2514 static_cast<int64_t>(GetLane(a)) * GetLane(b));
2515}
2516HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
2517 const Vec128<int32_t> b) {
2518 alignas(16) int32_t a_lanes[4];
2519 alignas(16) int32_t b_lanes[4];
2520 const Full128<int32_t> di32;
2521 Store(a, di32, a_lanes);
2522 Store(b, di32, b_lanes);
2523 alignas(16) int64_t mul[2];
2524 mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
2525 mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
2526 return Load(Full128<int64_t>(), mul);
2527}
2528
2529#else // HWY_TARGET == HWY_SSSE3
2530
2531template <size_t N>
2532HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2533 const Vec128<int32_t, N> b) {
2534 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2535}
2536
2537#endif // HWY_TARGET == HWY_SSSE3
2538
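// Worked example (illustrative only): with a = {1, 2, 3, 4} and
// b = {5, 6, 7, 8}, MulEven(a, b) yields the u64 lanes {1*5, 3*7} = {5, 21}.
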
2539template <size_t N>
2540HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
2541 const Vec128<uint32_t, N> b) {
2542#if HWY_TARGET == HWY_SSSE3
2543 // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
2544 // 64-bit right shift would also work but also needs port 5, so no benefit.
2545 // Notation: x=don't care, z=0.
2546 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2547 const auto mullo_x2x0 = MulEven(a, b);
2548 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2549 const auto mullo_x3x1 =
2550 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2551 // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
2552 // the latter requires one more instruction or a constant.
2553 const __m128i mul_20 =
2554 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2555 const __m128i mul_31 =
2556 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2557 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2558#else
2559 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2560#endif
2561}
2562
2563template <size_t N>
2564HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
2565 const Vec128<int32_t, N> b) {
2566 // Same as unsigned; avoid duplicating the SSSE3 code.
2567 const DFromV<decltype(a)> d;
2568 const RebindToUnsigned<decltype(d)> du;
2569 return BitCast(d, BitCast(du, a) * BitCast(du, b));
2570}
2571
2572// ------------------------------ RotateRight (ShiftRight, Or)
2573
2574template <int kBits, size_t N>
2575HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
2576 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
2577#if HWY_TARGET <= HWY_AVX3
2578 return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
2579#else
2580 if (kBits == 0) return v;
2581 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
2582#endif
2583}
2584
2585template <int kBits, size_t N>
2586HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
2587 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
2588#if HWY_TARGET <= HWY_AVX3
2589 return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
2590#else
2591 if (kBits == 0) return v;
2592 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
2593#endif
2594}
2595
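// Worked example (illustrative only): for u32 lanes holding 0x12345678,
// RotateRight<8> yields 0x78123456 (on pre-AVX3 targets, via the OR of the
// two shifts above).
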
2596// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2597
2598template <size_t N>
2599HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
2600 const DFromV<decltype(v)> d;
2601 return VecFromMask(v < Zero(d));
2602}
2603
2604template <size_t N>
2605HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
2606 return ShiftRight<15>(v);
2607}
2608
2609template <size_t N>
2610HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
2611 return ShiftRight<31>(v);
2612}
2613
2614template <size_t N>
2615HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
2616 const DFromV<decltype(v)> d;
2617#if HWY_TARGET <= HWY_AVX3
2618 (void)d;
2619 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
2620#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2621 return VecFromMask(v < Zero(d));
2622#else
2623 // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
2624 // avoids generating a zero.
2625 const RepartitionToNarrow<decltype(d)> d32;
2626 const auto sign = ShiftRight<31>(BitCast(d32, v));
2627 return Vec128<int64_t, N>{
2628 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2629#endif
2630}
2631
2632template <size_t N>
2633HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
2634#if HWY_TARGET <= HWY_AVX3
2635 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2636#else
2637 const auto zero = Zero(DFromV<decltype(v)>());
2638 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2639#endif
2640}
2641
2642template <int kBits, size_t N>
2643HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
2644#if HWY_TARGET <= HWY_AVX3
2645 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
2646#else
2647 const DFromV<decltype(v)> di;
2648 const RebindToUnsigned<decltype(di)> du;
2649 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2650 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2651 return right | sign;
2652#endif
2653}
2654
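// Worked example (illustrative only) of the emulation above: for v = -2 and
// kBits = 1, the unsigned shift yields 0x7FFF...F and the shifted sign mask
// is 0x8000...0, so their OR is 0xFFFF...F = -1, the arithmetic result.
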
2655// ------------------------------ ZeroIfNegative (BroadcastSignBit)
2656template <typename T, size_t N, HWY_IF_FLOAT(T)>
2657HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2658 const DFromV<decltype(v)> d;
2659#if HWY_TARGET == HWY_SSSE3
2660 const RebindToSigned<decltype(d)> di;
2661 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2662#else
2663 const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
2664#endif
2665 return IfThenElse(mask, Zero(d), v);
2666}
2667
2668// ------------------------------ IfNegativeThenElse
2669template <size_t N>
2670HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
2671 const Vec128<int8_t, N> yes,
2672 const Vec128<int8_t, N> no) {
2673 // int8: IfThenElse only looks at the MSB.
2674 return IfThenElse(MaskFromVec(v), yes, no);
2675}
2676
2677template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2678HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2679 Vec128<T, N> no) {
2680 static_assert(IsSigned<T>(), "Only works for signed/float");
2681 const DFromV<decltype(v)> d;
2682 const RebindToSigned<decltype(d)> di;
2683
2684 // 16-bit: no native blendv, so copy sign to lower byte's MSB.
2685 v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
2686 return IfThenElse(MaskFromVec(v), yes, no);
2687}
2688
2689template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2690HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2691 Vec128<T, N> no) {
2692 static_assert(IsSigned<T>(), "Only works for signed/float");
2693 const DFromV<decltype(v)> d;
2694 const RebindToFloat<decltype(d)> df;
2695
2696 // 32/64-bit: use float IfThenElse, which only looks at the MSB.
2697 return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes),
2698 BitCast(df, no)));
2699}
2700
2701// ------------------------------ ShiftLeftSame
2702
2703template <size_t N>
2704HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
2705 const int bits) {
2706 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2707}
2708template <size_t N>
2709HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
2710 const int bits) {
2711 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2712}
2713template <size_t N>
2714HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
2715 const int bits) {
2716 return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2717}
2718
2719template <size_t N>
2720HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
2721 const int bits) {
2722 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2723}
2724
2725template <size_t N>
2726HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
2727 const int bits) {
2728 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2729}
2730
2731template <size_t N>
2732HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
2733 const int bits) {
2734 return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2735}
2736
2737template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2738HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
2739 const DFromV<decltype(v)> d8;
2740 // Use raw instead of BitCast to support N=1.
2741 const Vec128<T, N> shifted{
2742 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
2743 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2744}
2745
2746// ------------------------------ ShiftRightSame (BroadcastSignBit)
2747
2748template <size_t N>
2749HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
2750 const int bits) {
2751 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2752}
2753template <size_t N>
2754HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
2755 const int bits) {
2756 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2757}
2758template <size_t N>
2759HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
2760 const int bits) {
2761 return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2762}
2763
2764template <size_t N>
2765HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
2766 const int bits) {
2767 const DFromV<decltype(v)> d8;
2768 // Use raw instead of BitCast to support N=1.
2769 const Vec128<uint8_t, N> shifted{
2770 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
2771 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2772}
2773
2774template <size_t N>
2775HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
2776 const int bits) {
2777 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2778}
2779
2780template <size_t N>
2781HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
2782 const int bits) {
2783 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2784}
2785template <size_t N>
2786HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
2787 const int bits) {
2788#if HWY_TARGET <= HWY_AVX3
2789 return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2790#else
2791 const DFromV<decltype(v)> di;
2792 const RebindToUnsigned<decltype(di)> du;
2793 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2794 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2795 return right | sign;
2796#endif
2797}
2798
2799template <size_t N>
2800HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
2801 const DFromV<decltype(v)> di;
2802 const RebindToUnsigned<decltype(di)> du;
2803 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2804 const auto shifted_sign =
2805 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2806 return (shifted ^ shifted_sign) - shifted_sign;
2807}
2808
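// Worked example (illustrative only) of the XOR/subtract sign extension used
// above: for v = int8_t(-128) and bits = 1, the unsigned shift gives 0x40 and
// shifted_sign is 0x40, so (0x40 ^ 0x40) - 0x40 = -64, the arithmetic result.
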
2809// ------------------------------ Floating-point mul / div
2810
2811template <size_t N>
2812HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
2813 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2814}
2815HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
2816 const Vec128<float, 1> b) {
2817 return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
2818}
2819template <size_t N>
2820HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
2821 const Vec128<double, N> b) {
2822 return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
2823}
2824HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
2825 return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
2826}
2827
2828template <size_t N>
2829HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
2830 const Vec128<float, N> b) {
2831 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2832}
2833HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
2834 const Vec128<float, 1> b) {
2835 return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
2836}
2837template <size_t N>
2838HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
2839 const Vec128<double, N> b) {
2840 return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
2841}
2842HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
2843 return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
2844}
2845
2846// Approximate reciprocal
2847template <size_t N>
2848HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
2849 return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2850}
2851HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
2852 return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
2853}
2854
2855// Absolute value of difference.
2856template <size_t N>
2857HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
2858 const Vec128<float, N> b) {
2859 return Abs(a - b);
2860}
2861
2862// ------------------------------ Floating-point multiply-add variants
2863
2864// Returns mul * x + add
2865template <size_t N>
2866HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
2867 const Vec128<float, N> x,
2868 const Vec128<float, N> add) {
2869#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2870 return mul * x + add;
2871#else
2872 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2873#endif
2874}
2875template <size_t N>
2876HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul,
2877 const Vec128<double, N> x,
2878 const Vec128<double, N> add) {
2879#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2880 return mul * x + add;
2881#else
2882 return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
2883#endif
2884}
2885
2886// Returns add - mul * x
2887template <size_t N>
2888HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
2889 const Vec128<float, N> x,
2890 const Vec128<float, N> add) {
2891#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2892 return add - mul * x;
2893#else
2894 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2895#endif
2896}
2897template <size_t N>
2898HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul,
2899 const Vec128<double, N> x,
2900 const Vec128<double, N> add) {
2901#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2902 return add - mul * x;
2903#else
2904 return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
2905#endif
2906}
2907
2908// Returns mul * x - sub
2909template <size_t N>
2910HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
2911 const Vec128<float, N> x,
2912 const Vec128<float, N> sub) {
2913#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2914 return mul * x - sub;
2915#else
2916 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2917#endif
2918}
2919template <size_t N>
2920HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
2921 const Vec128<double, N> x,
2922 const Vec128<double, N> sub) {
2923#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2924 return mul * x - sub;
2925#else
2926 return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
2927#endif
2928}
2929
2930// Returns -mul * x - sub
2931template <size_t N>
2932HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
2933 const Vec128<float, N> x,
2934 const Vec128<float, N> sub) {
2935#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2936 return Neg(mul) * x - sub;
2937#else
2938 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2939#endif
2940}
2941template <size_t N>
2942HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
2943 const Vec128<double, N> x,
2944 const Vec128<double, N> sub) {
2945#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2946 return Neg(mul) * x - sub;
2947#else
2948 return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2949#endif
2950}
2951
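// Usage sketch (illustrative only): the FMA ops compose into Horner's rule
// for polynomial evaluation, e.g. 2x^2 + 3x + 4 = (2x + 3)x + 4:
//   const Simd<float, 4, 0> d;
//   const auto x = Set(d, 2.0f);
//   const auto y = MulAdd(MulAdd(Set(d, 2.0f), x, Set(d, 3.0f)), x,
//                         Set(d, 4.0f));  // 18 in every lane
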
2952// ------------------------------ Floating-point square root
2953
2954// Full precision square root
2955template <size_t N>
2956HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
2957 return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
2958}
2959HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) {
2960 return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
2961}
2962template <size_t N>
2963HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
2964 return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
2965}
2966HWY_API Vec64<double> Sqrt(const Vec64<double> v) {
2967 return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
2968}
2969
2970// Approximate reciprocal square root
2971template <size_t N>
2972HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
2973 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
2974}
2975HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) {
2976 return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
2977}
2978
2979// ------------------------------ Min (Gt, IfThenElse)
2980
2981namespace detail {
2982
2983template <typename T, size_t N>
2984HWY_INLINE Vec128<T, N> MinU(const Vec128<T, N> a,
2985 const Vec128<T, N> b) {
2986 const DFromV<decltype(a)> d;
2987 const RebindToUnsigned<decltype(d)> du;
2988 const RebindToSigned<decltype(d)> di;
2989 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2990 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2991 return IfThenElse(gt, b, a);
2992}
2993
2994} // namespace detail
2995
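// Worked example (illustrative only) of the MSB-flip trick in detail::MinU:
// XOR with 0x80 maps u8 operands 200 and 100 to signed 72 and -28, and the
// signed comparison 72 > -28 agrees with the unsigned 200 > 100.
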
2996// Unsigned
2997template <size_t N>
2998HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
2999 const Vec128<uint8_t, N> b) {
3000 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
3001}
3002template <size_t N>
3003HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
3004 const Vec128<uint16_t, N> b) {
3005#if HWY_TARGET == HWY_SSSE3
3006 return detail::MinU(a, b);
3007#else
3008 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
3009#endif
3010}
3011template <size_t N>
3012HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
3013 const Vec128<uint32_t, N> b) {
3014#if HWY_TARGET == HWY_SSSE3
3015 return detail::MinU(a, b);
3016#else
3017 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
3018#endif
3019}
3020template <size_t N>
3021HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
3022 const Vec128<uint64_t, N> b) {
3023#if HWY_TARGET <= HWY_AVX3
3024 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
3025#else
3026 return detail::MinU(a, b);
3027#endif
3028}
3029
3030// Signed
3031template <size_t N>
3032HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
3033 const Vec128<int8_t, N> b) {
3034#if HWY_TARGET == HWY_SSSE3
3035 return IfThenElse(a < b, a, b);
3036#else
3037 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
3038#endif
3039}
3040template <size_t N>
3041HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
3042 const Vec128<int16_t, N> b) {
3043 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
3044}
3045template <size_t N>
3046HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
3047 const Vec128<int32_t, N> b) {
3048#if HWY_TARGET == HWY_SSSE3
3049 return IfThenElse(a < b, a, b);
3050#else
3051 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
3052#endif
3053}
3054template <size_t N>
3055HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
3056 const Vec128<int64_t, N> b) {
3057#if HWY_TARGET <= HWY_AVX3
3058 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
3059#else
3060 return IfThenElse(a < b, a, b);
3061#endif
3062}
3063
3064// Float
3065template <size_t N>
3066HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
3067 const Vec128<float, N> b) {
3068 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
3069}
3070template <size_t N>
3071HWY_API Vec128<double, N> Min(const Vec128<double, N> a,
3072 const Vec128<double, N> b) {
3073 return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
3074}
3075
3076// ------------------------------ Max (Gt, IfThenElse)
3077
3078namespace detail {
3079template <typename T, size_t N>
3080HWY_INLINE Vec128<T, N> MaxU(const Vec128<T, N> a,
3081 const Vec128<T, N> b) {
3082 const DFromV<decltype(a)> d;
3083 const RebindToUnsigned<decltype(d)> du;
3084 const RebindToSigned<decltype(d)> di;
3085 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
3086 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
3087 return IfThenElse(gt, a, b);
3088}
3089
3090} // namespace detail
3091
3092// Unsigned
3093template <size_t N>
3094HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
3095 const Vec128<uint8_t, N> b) {
3096 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
3097}
3098template <size_t N>
3099HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
3100 const Vec128<uint16_t, N> b) {
3101#if HWY_TARGET == HWY_SSSE3
3102 return detail::MaxU(a, b);
3103#else
3104 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
3105#endif
3106}
3107template <size_t N>
3108HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
3109 const Vec128<uint32_t, N> b) {
3110#if HWY_TARGET == HWY_SSSE3
3111 return detail::MaxU(a, b);
3112#else
3113 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
3114#endif
3115}
3116template <size_t N>
3117HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
3118 const Vec128<uint64_t, N> b) {
3119#if HWY_TARGET <= HWY_AVX3
3120 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
3121#else
3122 return detail::MaxU(a, b);
3123#endif
3124}
3125
3126// Signed
3127template <size_t N>
3128HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
3129 const Vec128<int8_t, N> b) {
3130#if HWY_TARGET == HWY_SSSE3
3131 return IfThenElse(a < b, b, a);
3132#else
3133 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
3134#endif
3135}
3136template <size_t N>
3137HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
3138 const Vec128<int16_t, N> b) {
3139 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
3140}
3141template <size_t N>
3142HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
3143 const Vec128<int32_t, N> b) {
3144#if HWY_TARGET == HWY_SSSE3
3145 return IfThenElse(a < b, b, a);
3146#else
3147 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
3148#endif
3149}
3150template <size_t N>
3151HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
3152 const Vec128<int64_t, N> b) {
3153#if HWY_TARGET <= HWY_AVX3
3154 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
3155#else
3156 return IfThenElse(a < b, b, a);
3157#endif
3158}
3159
3160// Float
3161template <size_t N>
3162HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
3163 const Vec128<float, N> b) {
3164 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
3165}
3166template <size_t N>
3167HWY_API Vec128<double, N> Max(const Vec128<double, N> a,
3168 const Vec128<double, N> b) {
3169 return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
3170}
3171
3172// ================================================== MEMORY (2)
3173
3174// ------------------------------ Non-temporal stores
3175
3176// clang 6 generates incorrect code for _mm_stream_pi, so we round even
3177// partial vectors up to 16 bytes.
3178template <typename T, size_t N>
3179HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
3180 T* HWY_RESTRICT aligned) {
3181 _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
3182}
3183template <size_t N>
3184HWY_API void Stream(const Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3185 float* HWY_RESTRICT aligned) {
3186 _mm_stream_ps(aligned, v.raw);
3187}
3188template <size_t N>
3189HWY_API void Stream(const Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3190 double* HWY_RESTRICT aligned) {
3191 _mm_stream_pd(aligned, v.raw);
3192}
3193
3194// ------------------------------ Scatter
3195
3196// Work around warnings in the intrinsic definitions (passing -1 as a mask).
3197HWY_DIAGNOSTICS(push)
3198HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3199
3200// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
3201using GatherIndex64 = long long int; // NOLINT(runtime/int)
3202static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
3203
3204#if HWY_TARGET <= HWY_AVX3
3205namespace detail {
3206
3207template <typename T, size_t N>
3208HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
3209 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3210 const Vec128<int32_t, N> offset) {
3211 if (N == 4) {
3212 _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
3213 } else {
3214 const __mmask8 mask = (1u << N) - 1;
3215 _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
3216 }
3217}
3218template <typename T, size_t N>
3219HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
3220 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3221 const Vec128<int32_t, N> index) {
3222 if (N == 4) {
3223 _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
3224 } else {
3225 const __mmask8 mask = (1u << N) - 1;
3226 _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
3227 }
3228}
3229
3230template <typename T, size_t N>
3231HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
3232 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3233 const Vec128<int64_t, N> offset) {
3234 if (N == 2) {
3235 _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
3236 } else {
3237 const __mmask8 mask = (1u << N) - 1;
3238 _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
3239 }
3240}
3241template <typename T, size_t N>
3242HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
3243 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3244 const Vec128<int64_t, N> index) {
3245 if (N == 2) {
3246 _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
3247 } else {
3248 const __mmask8 mask = (1u << N) - 1;
3249 _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
3250 }
3251}
3252
3253} // namespace detail
3254
3255template <typename T, size_t N, typename Offset>
3256HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3257 T* HWY_RESTRICT base,
3258 const Vec128<Offset, N> offset) {
3259 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3260 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
3261}
3262template <typename T, size_t N, typename Index>
3263HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3264 const Vec128<Index, N> index) {
3265 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3266 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
3267}
3268
3269template <size_t N>
3270HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3271 float* HWY_RESTRICT base,
3272 const Vec128<int32_t, N> offset) {
3273 if (N == 4) {
3274 _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
3275 } else {
3276 const __mmask8 mask = (1u << N) - 1;
3277 _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
3278 }
3279}
3280template <size_t N>
3281HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3282 float* HWY_RESTRICT base,
3283 const Vec128<int32_t, N> index) {
3284 if (N == 4) {
3285 _mm_i32scatter_ps(base, index.raw, v.raw, 4);
3286 } else {
3287 const __mmask8 mask = (1u << N) - 1;
3288 _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
3289 }
3290}
3291
3292template <size_t N>
3293HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3294 double* HWY_RESTRICT base,
3295 const Vec128<int64_t, N> offset) {
3296 if (N == 2) {
3297 _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
3298 } else {
3299 const __mmask8 mask = (1u << N) - 1;
3300 _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
3301 }
3302}
3303template <size_t N>
3304HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3305 double* HWY_RESTRICT base,
3306 const Vec128<int64_t, N> index) {
3307 if (N == 2) {
3308 _mm_i64scatter_pd(base, index.raw, v.raw, 8);
3309 } else {
3310 const __mmask8 mask = (1u << N) - 1;
3311 _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
3312 }
3313}
3314#else // HWY_TARGET <= HWY_AVX3
3315
3316template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
3317HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3318 T* HWY_RESTRICT base,
3319 const Vec128<Offset, N> offset) {
3320 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3321
3322 alignas(16) T lanes[N];
3323 Store(v, d, lanes);
3324
3325 alignas(16) Offset offset_lanes[N];
3326 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3327
3328 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
3329 for (size_t i = 0; i < N; ++i) {
3330 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
3331 }
3332}
3333
3334template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
3335HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3336 const Vec128<Index, N> index) {
3337 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3338
3339 alignas(16) T lanes[N];
3340 Store(v, d, lanes);
3341
3342 alignas(16) Index index_lanes[N];
3343 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3344
3345 for (size_t i = 0; i < N; ++i) {
3346 base[index_lanes[i]] = lanes[i];
3347 }
3348}
3349
3350#endif
3351
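// Usage sketch (illustrative only): ScatterIndex treats indices as element
// offsets into `base` (ScatterOffset instead takes byte offsets).
//   const Simd<int32_t, 4, 0> d;
//   int32_t base[8] = {0};
//   ScatterIndex(Iota(d, 1), d, base, Iota(d, 0) * Set(d, 2));
//   // base: 1, 0, 2, 0, 3, 0, 4, 0
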
3352// ------------------------------ Gather (Load/Store)
3353
3354#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3355
3356template <typename T, size_t N, typename Offset>
3357HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
3358 const T* HWY_RESTRICT base,
3359 const Vec128<Offset, N> offset) {
3360 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3361
3362 alignas(16) Offset offset_lanes[N];
3363 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3364
3365 alignas(16) T lanes[N];
3366 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
3367 for (size_t i = 0; i < N; ++i) {
3368 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
3369 }
3370 return Load(d, lanes);
3371}
3372
3373template <typename T, size_t N, typename Index>
3374HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
3375 const T* HWY_RESTRICT base,
3376 const Vec128<Index, N> index) {
3377 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3378
3379 alignas(16) Index index_lanes[N];
3380 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3381
3382 alignas(16) T lanes[N];
3383 for (size_t i = 0; i < N; ++i) {
3384 lanes[i] = base[index_lanes[i]];
3385 }
3386 return Load(d, lanes);
3387}
3388
3389#else
3390
3391namespace detail {
3392
3393template <typename T, size_t N>
3394HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
3395 Simd<T, N, 0> /* d */,
3396 const T* HWY_RESTRICT base,
3397 const Vec128<int32_t, N> offset) {
3398 return Vec128<T, N>{_mm_i32gather_epi32(
3399 reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
3400}
3401template <typename T, size_t N>
3402HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
3403 Simd<T, N, 0> /* d */,
3404 const T* HWY_RESTRICT base,
3405 const Vec128<int32_t, N> index) {
3406 return Vec128<T, N>{_mm_i32gather_epi32(
3407 reinterpret_cast<const int32_t*>(base), index.raw, 4)};
3408}
3409
3410template <typename T, size_t N>
3411HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
3412 Simd<T, N, 0> /* d */,
3413 const T* HWY_RESTRICT base,
3414 const Vec128<int64_t, N> offset) {
3415 return Vec128<T, N>{_mm_i64gather_epi64(
3416 reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
3417}
3418template <typename T, size_t N>
3419HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
3420 Simd<T, N, 0> /* d */,
3421 const T* HWY_RESTRICT base,
3422 const Vec128<int64_t, N> index) {
3423 return Vec128<T, N>{_mm_i64gather_epi64(
3424 reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
3425}
3426
3427} // namespace detail
3428
3429template <typename T, size_t N, typename Offset>
3430HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3431 const Vec128<Offset, N> offset) {
3432 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
3433}
3434template <typename T, size_t N, typename Index>
3435HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3436 const Vec128<Index, N> index) {
3437 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
3438}
3439
3440template <size_t N>
3441HWY_API Vec128<float, N> GatherOffset(Simd<float, N, 0> /* tag */,
3442 const float* HWY_RESTRICT base,
3443 const Vec128<int32_t, N> offset) {
3444 return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
3445}
3446template <size_t N>
3447HWY_API Vec128<float, N> GatherIndex(Simd<float, N, 0> /* tag */,
3448 const float* HWY_RESTRICT base,
3449 const Vec128<int32_t, N> index) {
3450 return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3451}
3452
3453template <size_t N>
3454HWY_API Vec128<double, N> GatherOffset(Simd<double, N, 0> /* tag */,
3455 const double* HWY_RESTRICT base,
3456 const Vec128<int64_t, N> offset) {
3457 return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3458}
3459template <size_t N>
3460HWY_API Vec128<double, N> GatherIndex(Simd<double, N, 0> /* tag */,
3461 const double* HWY_RESTRICT base,
3462 const Vec128<int64_t, N> index) {
3463 return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3464}
3465
3466#endif // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3467
3468HWY_DIAGNOSTICS(pop)
3469
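// Usage sketch (illustrative only): GatherIndex is the load-side counterpart
// of ScatterIndex; indices are again element offsets.
//   const Simd<float, 4, 0> d;
//   const Simd<int32_t, 4, 0> di;
//   alignas(16) const float base[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   const auto v = GatherIndex(d, base, Iota(di, 0) + Iota(di, 0));  // 0,2,4,6
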
3470// ================================================== SWIZZLE (2)
3471
3472// ------------------------------ LowerHalf
3473
3474// Returns upper/lower half of a vector.
3475template <typename T, size_t N>
3476HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
3477 Vec128<T, N> v) {
3478 return Vec128<T, N / 2>{v.raw};
3479}
3480
3481template <typename T, size_t N>
3482HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
3483 return LowerHalf(Simd<T, N / 2, 0>(), v);
3484}
3485
3486// ------------------------------ ShiftLeftBytes
3487
3488template <int kBytes, typename T, size_t N>
3489HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3490 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3491 return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3492}
3493
3494template <int kBytes, typename T, size_t N>
3495HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3496 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
3497}
3498
3499// ------------------------------ ShiftLeftLanes
3500
3501template <int kLanes, typename T, size_t N>
3502HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3503 const Repartition<uint8_t, decltype(d)> d8;
3504 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3505}
3506
3507template <int kLanes, typename T, size_t N>
3508HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3509 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
3510}
3511
3512// ------------------------------ ShiftRightBytes
3513template <int kBytes, typename T, size_t N>
3514HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3515 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3516 // For partial vectors, clear upper lanes so we shift in zeros.
3517 if (N != 16 / sizeof(T)) {
3518 const Vec128<T> vfull{v.raw};
3519 v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3520 }
3521 return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3522}
3523
3524// ------------------------------ ShiftRightLanes
3525template <int kLanes, typename T, size_t N>
3526HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3527 const Repartition<uint8_t, decltype(d)> d8;
3528 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3529}
3530
3531// ------------------------------ UpperHalf (ShiftRightBytes)
3532
3533// Full input: copy hi into lo (smaller instruction encoding than shifts).
3534template <typename T>
3535HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, Vec128<T> v) {
3536 return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
3537}
3538HWY_API Vec128<float, 2> UpperHalf(Full64<float> /* tag */, Vec128<float> v) {
3539 return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3540}
3541HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */, Vec128<double> v) {
3542 return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
3543}
3544
3545// Partial
3546template <typename T, size_t N, HWY_IF_LE64(T, N)>
3547HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3548 Vec128<T, N> v) {
3549 const DFromV<decltype(v)> d;
3550 const RebindToUnsigned<decltype(d)> du;
3551 const auto vu = BitCast(du, v);
3552 const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3553 return Vec128<T, (N + 1) / 2>{upper.raw};
3554}
3555
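// Usage sketch (illustrative only): UpperHalf narrows the vector type, e.g.
// Vec128 -> Vec64.
//   const Full128<uint32_t> d;
//   const Half<decltype(d)> dh;
//   const auto v = Iota(d, 0);                    // {0, 1, 2, 3}
//   const Vec64<uint32_t> hi = UpperHalf(dh, v);  // {2, 3}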
3556// ------------------------------ ExtractLane (UpperHalf)
3557
3558namespace detail {
3559
3560template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3561HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3562 static_assert(kLane < N, "Lane index out of bounds");
3563#if HWY_TARGET == HWY_SSSE3
3564 const int pair = _mm_extract_epi16(v.raw, kLane / 2);
3565 constexpr int kShift = (kLane & 1) ? 8 : 0;
3566 return static_cast<T>((pair >> kShift) & 0xFF);
3567#else
3568 return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
3569#endif
3570}
3571
3572template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3573HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3574 static_assert(kLane < N, "Lane index out of bounds");
3575 return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF);
3576}
3577
3578template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3579HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3580 static_assert(kLane < N, "Lane index out of bounds");
3581#if HWY_TARGET == HWY_SSSE3
3582 alignas(16) T lanes[4];
3583 Store(v, DFromV<decltype(v)>(), lanes);
3584 return lanes[kLane];
3585#else
3586 return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
3587#endif
3588}
3589
3590template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3591HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3592 static_assert(kLane < N, "Lane index out of bounds");
3593#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3594 alignas(16) T lanes[2];
3595 Store(v, DFromV<decltype(v)>(), lanes);
3596 return lanes[kLane];
3597#else
3598 return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
3599#endif
3600}
3601
3602template <size_t kLane, size_t N>
3603HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
3604 static_assert(kLane < N, "Lane index out of bounds");
3605#if HWY_TARGET == HWY_SSSE3
3606 alignas(16) float lanes[4];
3607 Store(v, DFromV<decltype(v)>(), lanes);
3608 return lanes[kLane];
3609#else
3610 // Bug in the intrinsic, returns int but should be float.
3611 const int bits = _mm_extract_ps(v.raw, kLane);
3612 float ret;
3613 CopyBytes<4>(&bits, &ret);
3614 return ret;
3615#endif
3616}
3617
3618// There is no extract_pd; two overloads because there is no UpperHalf for N=1.
3619template <size_t kLane>
3620HWY_INLINE double ExtractLane(const Vec128<double, 1> v) {
3621 static_assert(kLane == 0, "Lane index out of bounds");
3622 return GetLane(v);
3623}
3624
3625template <size_t kLane>
3626HWY_INLINE double ExtractLane(const Vec128<double> v) {
3627 static_assert(kLane < 2, "Lane index out of bounds");
3628 const Half<DFromV<decltype(v)>> dh;
3629 return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
3630}
3631
3632} // namespace detail
3633
3634// Requires one overload per vector length because ExtractLane<3> may be a
3635// compile error if it calls _mm_extract_epi64.
3636template <typename T>
3637HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
3638 HWY_DASSERT(i == 0);
3639 (void)i;
3640 return GetLane(v);
3641}
3642
3643template <typename T>
3644HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
3645#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3646 if (__builtin_constant_p(i)) {
3647 switch (i) {
3648 case 0:
3649 return detail::ExtractLane<0>(v);
3650 case 1:
3651 return detail::ExtractLane<1>(v);
3652 }
3653 }
3654#endif
3655 alignas(16) T lanes[2];
3656 Store(v, DFromV<decltype(v)>(), lanes);
3657 return lanes[i];
3658}
3659
3660template <typename T>
3661HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
3662#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3663 if (__builtin_constant_p(i)) {
3664 switch (i) {
3665 case 0:
3666 return detail::ExtractLane<0>(v);
3667 case 1:
3668 return detail::ExtractLane<1>(v);
3669 case 2:
3670 return detail::ExtractLane<2>(v);
3671 case 3:
3672 return detail::ExtractLane<3>(v);
3673 }
3674 }
3675#endif
3676 alignas(16) T lanes[4];
3677 Store(v, DFromV<decltype(v)>(), lanes);
3678 return lanes[i];
3679}
3680
3681template <typename T>
3682HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
3683#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3684 if (__builtin_constant_p(i)) {
3685 switch (i) {
3686 case 0:
3687 return detail::ExtractLane<0>(v);
3688 case 1:
3689 return detail::ExtractLane<1>(v);
3690 case 2:
3691 return detail::ExtractLane<2>(v);
3692 case 3:
3693 return detail::ExtractLane<3>(v);
3694 case 4:
3695 return detail::ExtractLane<4>(v);
3696 case 5:
3697 return detail::ExtractLane<5>(v);
3698 case 6:
3699 return detail::ExtractLane<6>(v);
3700 case 7:
3701 return detail::ExtractLane<7>(v);
3702 }
3703 }
3704#endif
3705 alignas(16) T lanes[8];
3706 Store(v, DFromV<decltype(v)>(), lanes);
3707 return lanes[i];
3708}
3709
3710template <typename T>
3711HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
3712#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3713 if (__builtin_constant_p(i)) {
3714 switch (i) {
3715 case 0:
3716 return detail::ExtractLane<0>(v);
3717 case 1:
3718 return detail::ExtractLane<1>(v);
3719 case 2:
3720 return detail::ExtractLane<2>(v);
3721 case 3:
3722 return detail::ExtractLane<3>(v);
3723 case 4:
3724 return detail::ExtractLane<4>(v);
3725 case 5:
3726 return detail::ExtractLane<5>(v);
3727 case 6:
3728 return detail::ExtractLane<6>(v);
3729 case 7:
3730 return detail::ExtractLane<7>(v);
3731 case 8:
3732 return detail::ExtractLane<8>(v);
3733 case 9:
3734 return detail::ExtractLane<9>(v);
3735 case 10:
3736 return detail::ExtractLane<10>(v);
3737 case 11:
3738 return detail::ExtractLane<11>(v);
3739 case 12:
3740 return detail::ExtractLane<12>(v);
3741 case 13:
3742 return detail::ExtractLane<13>(v);
3743 case 14:
3744 return detail::ExtractLane<14>(v);
3745 case 15:
3746 return detail::ExtractLane<15>(v);
3747 }
3748 }
3749#endif
3750 alignas(16) T lanes[16];
3751 Store(v, DFromV<decltype(v)>(), lanes);
3752 return lanes[i];
3753}
3754
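// Usage sketch (illustrative only): when the index is a compile-time
// constant, the switches above dispatch to a single extract instruction;
// otherwise the vector takes the store-to-stack path.
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 10);           // {10, 11, 12, 13}
//   const int32_t x = ExtractLane(v, 2);  // 12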
3755// ------------------------------ InsertLane (UpperHalf)
3756
3757namespace detail {
3758
3759template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3760HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3761 static_assert(kLane < N, "Lane index out of bounds");
3762#if HWY_TARGET == HWY_SSSE3
3763 const DFromV<decltype(v)> d;
3764 alignas(16) T lanes[16];
3765 Store(v, d, lanes);
3766 lanes[kLane] = t;
3767 return Load(d, lanes);
3768#else
3769 return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
3770#endif
3771}
3772
3773template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3774HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3775 static_assert(kLane < N, "Lane index out of bounds");
3776 return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)};
3777}
3778
3779template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3780HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3781 static_assert(kLane < N, "Lane index out of bounds");
3782#if HWY_TARGET == HWY_SSSE3
3783 alignas(16) T lanes[4];
3784 const DFromV<decltype(v)> d;
3785 Store(v, d, lanes);
3786 lanes[kLane] = t;
3787 return Load(d, lanes);
3788#else
3789 MakeSigned<T> ti;
3790 CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
3791 return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
3792#endif
3793}
3794
3795template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3796HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3797 static_assert(kLane < N, "Lane index out of bounds");
3798#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3799 const DFromV<decltype(v)> d;
3800 alignas(16) T lanes[2];
3801 Store(v, d, lanes);
3802 lanes[kLane] = t;
3803 return Load(d, lanes);
3804#else
3805 MakeSigned<T> ti;
3806 CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
3807 return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
3808#endif
3809}
3810
3811template <size_t kLane, size_t N>
3812HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
3813 static_assert(kLane < N, "Lane index out of bounds");
3814#if HWY_TARGET == HWY_SSSE3
3815 const DFromV<decltype(v)> d;
3816 alignas(16) float lanes[4];
3817 Store(v, d, lanes);
3818 lanes[kLane] = t;
3819 return Load(d, lanes);
3820#else
3821 return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
3822#endif
3823}
3824
3825// There is no insert_pd; two overloads because there is no UpperHalf for N=1.
3826template <size_t kLane>
3827HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) {
3828 static_assert(kLane == 0, "Lane index out of bounds");
3829 return Set(DFromV<decltype(v)>(), t);
3830}
3831
3832template <size_t kLane>
3833HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
3834 static_assert(kLane < 2, "Lane index out of bounds");
3835 const DFromV<decltype(v)> d;
3836 const Vec128<double> vt = Set(d, t);
3837 if (kLane == 0) {
3838 return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
3839 }
3840 return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
3841}
3842
3843} // namespace detail
3844
3845// Requires one overload per vector length because InsertLane<3> may be a
3846// compile error if it calls _mm_insert_epi64.
3847
3848template <typename T>
3849HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
3850 HWY_DASSERT(i == 0);
3851 (void)i;
3852 return Set(DFromV<decltype(v)>(), t);
3853}
3854
3855template <typename T>
3856HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
3857#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3858 if (__builtin_constant_p(i)) {
3859 switch (i) {
3860 case 0:
3861 return detail::InsertLane<0>(v, t);
3862 case 1:
3863 return detail::InsertLane<1>(v, t);
3864 }
3865 }
3866#endif
3867 const DFromV<decltype(v)> d;
3868 alignas(16) T lanes[2];
3869 Store(v, d, lanes);
3870 lanes[i] = t;
3871 return Load(d, lanes);
3872}
3873
3874template <typename T>
3875HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
3876#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3877 if (__builtin_constant_p(i)) {
3878 switch (i) {
3879 case 0:
3880 return detail::InsertLane<0>(v, t);
3881 case 1:
3882 return detail::InsertLane<1>(v, t);
3883 case 2:
3884 return detail::InsertLane<2>(v, t);
3885 case 3:
3886 return detail::InsertLane<3>(v, t);
3887 }
3888 }
3889#endif
3890 const DFromV<decltype(v)> d;
3891 alignas(16) T lanes[4];
3892 Store(v, d, lanes);
3893 lanes[i] = t;
3894 return Load(d, lanes);
3895}
3896
3897template <typename T>
3898HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
3899#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3900 if (__builtin_constant_p(i)) {
3901 switch (i) {
3902 case 0:
3903 return detail::InsertLane<0>(v, t);
3904 case 1:
3905 return detail::InsertLane<1>(v, t);
3906 case 2:
3907 return detail::InsertLane<2>(v, t);
3908 case 3:
3909 return detail::InsertLane<3>(v, t);
3910 case 4:
3911 return detail::InsertLane<4>(v, t);
3912 case 5:
3913 return detail::InsertLane<5>(v, t);
3914 case 6:
3915 return detail::InsertLane<6>(v, t);
3916 case 7:
3917 return detail::InsertLane<7>(v, t);
3918 }
3919 }
3920#endif
3921 const DFromV<decltype(v)> d;
3922 alignas(16) T lanes[8];
3923 Store(v, d, lanes);
3924 lanes[i] = t;
3925 return Load(d, lanes);
3926}
3927
3928template <typename T>
3929HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
3930#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3931 if (__builtin_constant_p(i)) {
3932 switch (i) {
3933 case 0:
3934 return detail::InsertLane<0>(v, t);
3935 case 1:
3936 return detail::InsertLane<1>(v, t);
3937 case 2:
3938 return detail::InsertLane<2>(v, t);
3939 case 3:
3940 return detail::InsertLane<3>(v, t);
3941 case 4:
3942 return detail::InsertLane<4>(v, t);
3943 case 5:
3944 return detail::InsertLane<5>(v, t);
3945 case 6:
3946 return detail::InsertLane<6>(v, t);
3947 case 7:
3948 return detail::InsertLane<7>(v, t);
3949 case 8:
3950 return detail::InsertLane<8>(v, t);
3951 case 9:
3952 return detail::InsertLane<9>(v, t);
3953 case 10:
3954 return detail::InsertLane<10>(v, t);
3955 case 11:
3956 return detail::InsertLane<11>(v, t);
3957 case 12:
3958 return detail::InsertLane<12>(v, t);
3959 case 13:
3960 return detail::InsertLane<13>(v, t);
3961 case 14:
3962 return detail::InsertLane<14>(v, t);
3963 case 15:
3964 return detail::InsertLane<15>(v, t);
3965 }
3966 }
3967#endif
3968 const DFromV<decltype(v)> d;
3969 alignas(16) T lanes[16];
3970 Store(v, d, lanes);
3971 lanes[i] = t;
3972 return Load(d, lanes);
3973}
3974
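// Usage sketch (illustrative only): InsertLane returns a copy with one lane
// replaced; the input vector is unchanged.
//   const Full128<float> d;
//   const auto v = Zero(d);                 // {0, 0, 0, 0}
//   const auto w = InsertLane(v, 1, 5.0f);  // {0, 5, 0, 0}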
3975// ------------------------------ CombineShiftRightBytes
3976
3977template <int kBytes, typename T, class V = Vec128<T>>
3978HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
3979 const Repartition<uint8_t, decltype(d)> d8;
3980 return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3981 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3982}
3983
3984template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
3985 class V = Vec128<T, N>>
3986HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
3987 constexpr size_t kSize = N * sizeof(T);
3988 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3989 const Repartition<uint8_t, decltype(d)> d8;
3990 const Full128<uint8_t> d_full8;
3991 using V8 = VFromD<decltype(d_full8)>;
3992 const V8 hi8{BitCast(d8, hi).raw};
3993 // Move into most-significant bytes
3994 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
3995 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
3996 return V{BitCast(Full128<T>(), r).raw};
3997}
3998
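// Usage sketch (illustrative only): CombineShiftRightBytes<kBytes> treats
// hi:lo as one 32-byte value and returns its bytes [kBytes, kBytes + 16).
//   const Full128<uint32_t> d;
//   const auto hi = Iota(d, 4);                           // {4, 5, 6, 7}
//   const auto lo = Iota(d, 0);                           // {0, 1, 2, 3}
//   const auto r = CombineShiftRightBytes<4>(d, hi, lo);  // {1, 2, 3, 4}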
3999// ------------------------------ Broadcast/splat any lane
4000
4001// Unsigned
4002template <int kLane, size_t N>
4003HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
4004 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4005 if (kLane < 4) {
4006 const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
4007 return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
4008 } else {
4009 const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
4010 return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
4011 }
4012}
4013template <int kLane, size_t N>
4014HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
4015 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4016 return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
4017}
4018template <int kLane, size_t N>
4019HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
4020 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4021 return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4022}
4023
4024// Signed
4025template <int kLane, size_t N>
4026HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
4027 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4028 if (kLane < 4) {
4029 const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
4030 return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
4031 } else {
4032 const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
4033 return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
4034 }
4035}
4036template <int kLane, size_t N>
4037HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
4038 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4039 return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
4040}
4041template <int kLane, size_t N>
4042HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
4043 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4044 return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4045}
4046
4047// Float
4048template <int kLane, size_t N>
4049HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
4050 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4051 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
4052}
4053template <int kLane, size_t N>
4054HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
4055 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4056 return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
4057}
4058
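// Usage sketch (illustrative only): Broadcast splats the chosen lane.
//   const Full128<uint32_t> d;
//   const auto v = Iota(d, 0);       // {0, 1, 2, 3}
//   const auto b = Broadcast<1>(v);  // {1, 1, 1, 1}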
4059// ------------------------------ TableLookupLanes (Shuffle01)
4060
4061// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
4062template <typename T, size_t N = 16 / sizeof(T)>
4063struct Indices128 {
4064 __m128i raw;
4065};
4066
4067template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
4068 HWY_IF_LANE_SIZE(T, 4)>
4069HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
4070 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4071#if HWY_IS_DEBUG_BUILD
4072 const Rebind<TI, decltype(d)> di;
4073 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4074 AllTrue(di, Lt(vec, Set(di, N))));
4075#endif
4076
4077#if HWY_TARGET <= HWY_AVX2
4078 (void)d;
4079 return Indices128<T, N>{vec.raw};
4080#else
4081 const Repartition<uint8_t, decltype(d)> d8;
4082 using V8 = VFromD<decltype(d8)>;
4083 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
4084 0, 1, 2, 3, 0, 1, 2, 3};
4085
4086 // Broadcast each lane index to all 4 bytes of T
4087 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
4088 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
4089 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
4090
4091 // Shift to bytes
4092 const Repartition<uint16_t, decltype(d)> d16;
4093 const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
4094
4095 return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
4096#endif
4097}
4098
4099template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
4100 HWY_IF_LANE_SIZE(T, 8)>
4101HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
4102 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4103#if HWY_IS_DEBUG_BUILD
4104 const Rebind<TI, decltype(d)> di;
4105 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4106 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
4107#else
4108 (void)d;
4109#endif
4110
4111 // No change - even without AVX3, we can shuffle+blend.
4112 return Indices128<T, N>{vec.raw};
4113}
4114
4115template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
4116HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
4117 const Rebind<TI, decltype(d)> di;
4118 return IndicesFromVec(d, LoadU(di, idx));
4119}
4120
4121template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4122HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
4123#if HWY_TARGET <= HWY_AVX2
4124 const DFromV<decltype(v)> d;
4125 const RebindToFloat<decltype(d)> df;
4126 const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
4127 return BitCast(d, perm);
4128#else
4129 return TableLookupBytes(v, Vec128<T, N>{idx.raw});
4130#endif
4131}
4132
4133template <size_t N, HWY_IF_GE64(float, N)>
4134HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
4135 const Indices128<float, N> idx) {
4136#if HWY_TARGET <= HWY_AVX2
4137 return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
4138#else
4139 const DFromV<decltype(v)> df;
4140 const RebindToSigned<decltype(df)> di;
4141 return BitCast(df,
4142 TableLookupLanes(BitCast(di, v), Indices128<int32_t, N>{idx.raw}));
4143#endif
4144}
4145
4146// Single lane: no change
4147template <typename T>
4148HWY_API Vec128<T, 1> TableLookupLanes(const Vec128<T, 1> v,
4149 Indices128<T, 1> /* idx */) {
4150 return v;
4151}
4152
4153template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4154HWY_API Vec128<T> TableLookupLanes(const Vec128<T> v, Indices128<T> idx) {
4155 const Full128<T> d;
4156 Vec128<int64_t> vidx{idx.raw};
4157#if HWY_TARGET <= HWY_AVX2
4158 // There is no _mm_permute[x]var_epi64.
4159 vidx += vidx; // bit1 is the decider (unusual)
4160 const Full128<double> df;
4161 return BitCast(
4162 d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
4163#else
4164 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
4165 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
4166 // to obtain an all-zero or all-one mask.
4167 const Full128<int64_t> di;
4168 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
4169 const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
4170 return IfThenElse(mask_same, v, Shuffle01(v));
4171#endif
4172}
4173
4174HWY_API Vec128<double> TableLookupLanes(const Vec128<double> v,
4175 Indices128<double> idx) {
4176 Vec128<int64_t> vidx{idx.raw};
4177#if HWY_TARGET <= HWY_AVX2
4178 vidx += vidx; // bit1 is the decider (unusual)
4179 return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
4180#else
4181 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
4182 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
4183 // to obtain an all-zero or all-one mask.
4184 const Full128<double> d;
4185 const Full128<int64_t> di;
4186 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
4187 const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
4188 return IfThenElse(mask_same, v, Shuffle01(v));
4189#endif
4190}
4191
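// Usage sketch (illustrative only): a runtime permutation, here reversing
// 32-bit lanes. All indices must be in [0, N).
//   const Full128<uint32_t> d;
//   alignas(16) constexpr int32_t kRev[4] = {3, 2, 1, 0};
//   const auto idx = SetTableIndices(d, kRev);
//   const auto r = TableLookupLanes(Iota(d, 0), idx);  // {3, 2, 1, 0}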
4192// ------------------------------ ReverseBlocks
4193
4194// Single block: no change
4195template <typename T>
4196HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4197 return v;
4198}
4199
4200// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
4201
4202// Single lane: no change
4203template <typename T>
4204HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
4205 return v;
4206}
4207
4208// Two lanes: shuffle
4209template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4210HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
4211 return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
4212}
4213
4214template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4215HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4216 return Shuffle01(v);
4217}
4218
4219// Four lanes: shuffle
4220template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4221HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4222 return Shuffle0123(v);
4223}
4224
4225// 16-bit
4226template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4227HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
4228#if HWY_TARGET <= HWY_AVX3
4229 if (N == 1) return v;
4230 if (N == 2) {
4231 const Repartition<uint32_t, decltype(d)> du32;
4232 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
4233 }
4234 const RebindToSigned<decltype(d)> di;
4235 alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4236 const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
4237 return BitCast(d, Vec128<int16_t, N>{
4238 _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4239#else
4240 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
4241 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
4242#endif
4243}
4244
4245// ------------------------------ Reverse2
4246
4247template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4248HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
4249 const Repartition<uint32_t, decltype(d)> du32;
4250 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
4251}
4252
4253template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4254HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4255 return Shuffle2301(v);
4256}
4257
4258template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4259HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4260 return Shuffle01(v);
4261}
4262
4263// ------------------------------ Reverse4
4264
4265template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4266HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
4267 const RebindToSigned<decltype(d)> di;
4268 // 4x 16-bit: a single shufflelo suffices.
4269 if (N == 4) {
4270 return BitCast(d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
4271 BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
4272 }
4273
4274#if HWY_TARGET <= HWY_AVX3
4275 alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
4276 const Vec128<int16_t, N> idx = Load(di, kReverse4);
4277 return BitCast(d, Vec128<int16_t, N>{
4278 _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4279#else
4280 const RepartitionToWide<decltype(di)> dw;
4281 return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
4282#endif
4283}
4284
4285// 4x 32-bit: use Shuffle0123
4286template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4287HWY_API Vec128<T> Reverse4(Full128<T> /* tag */, const Vec128<T> v) {
4288 return Shuffle0123(v);
4289}
4290
4291template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4292HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
4293 HWY_ASSERT(0); // don't have 4 u64 lanes
4294}
4295
4296// ------------------------------ Reverse8
4297
4298template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4299HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
4300#if HWY_TARGET <= HWY_AVX3
4301 const RebindToSigned<decltype(d)> di;
4302 alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
4303 15, 14, 13, 12, 11, 10, 9, 8};
4304 const Vec128<int16_t, N> idx = Load(di, kReverse8);
4305 return BitCast(d, Vec128<int16_t, N>{
4306 _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4307#else
4308 const RepartitionToWide<decltype(d)> dw;
4309 return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
4310#endif
4311}
4312
4313template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4314HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
4315 HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
4316}
4317
4318// ------------------------------ InterleaveLower
4319
4320// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4321// the least-significant lane) and "b". To concatenate two half-width integers
4322// into one, use ZipLower/Upper instead (also works with scalar).
4323
4324template <size_t N, HWY_IF_LE128(uint8_t, N)>
4325HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
4326 const Vec128<uint8_t, N> b) {
4327 return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
4328}
4329template <size_t N, HWY_IF_LE128(uint16_t, N)>
4330HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
4331 const Vec128<uint16_t, N> b) {
4332 return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
4333}
4334template <size_t N, HWY_IF_LE128(uint32_t, N)>
4335HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
4336 const Vec128<uint32_t, N> b) {
4337 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
4338}
4339template <size_t N, HWY_IF_LE128(uint64_t, N)>
4340HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
4341 const Vec128<uint64_t, N> b) {
4342 return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
4343}
4344
4345template <size_t N, HWY_IF_LE128(int8_t, N)>
4346HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
4347 const Vec128<int8_t, N> b) {
4348 return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
4349}
4350template <size_t N, HWY_IF_LE128(int16_t, N)>
4351HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
4352 const Vec128<int16_t, N> b) {
4353 return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
4354}
4355template <size_t N, HWY_IF_LE128(int32_t, N)>
4356HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
4357 const Vec128<int32_t, N> b) {
4358 return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
4359}
4360template <size_t N, HWY_IF_LE128(int64_t, N)>
4361HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
4362 const Vec128<int64_t, N> b) {
4363 return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
4364}
4365
4366template <size_t N, HWY_IF_LE128(float, N)>
4367HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4368 const Vec128<float, N> b) {
4369 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
4370}
4371template <size_t N, HWY_IF_LE128(double, N)>
4372HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
4373 const Vec128<double, N> b) {
4374 return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
4375}
4376
4377// Additional overload for the optional tag (also for 256/512).
4378template <class V>
4379HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
4380 return InterleaveLower(a, b);
4381}
4382
4383// ------------------------------ InterleaveUpper (UpperHalf)
4384
4385// All functions inside detail lack the required D parameter.
4386namespace detail {
4387
4388HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
4389 const Vec128<uint8_t> b) {
4390 return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
4391}
4392HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
4393 const Vec128<uint16_t> b) {
4394 return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
4395}
4396HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
4397 const Vec128<uint32_t> b) {
4398 return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
4399}
4400HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4401 const Vec128<uint64_t> b) {
4402 return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
4403}
4404
4405HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
4406 const Vec128<int8_t> b) {
4407 return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
4408}
4409HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
4410 const Vec128<int16_t> b) {
4411 return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
4412}
4413HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
4414 const Vec128<int32_t> b) {
4415 return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
4416}
4417HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
4418 const Vec128<int64_t> b) {
4419 return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
4420}
4421
4422HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
4423 const Vec128<float> b) {
4424 return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
4425}
4426HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
4427 const Vec128<double> b) {
4428 return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
4429}
4430
4431} // namespace detail
4432
4433// Full
4434template <typename T, class V = Vec128<T>>
4435HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
4436 return detail::InterleaveUpper(a, b);
4437}
4438
4439// Partial
4440template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
4441HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4442 const Half<decltype(d)> d2;
4443 return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
4444}
4445
4446// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
4447
4448// Same as Interleave*, except that the return lanes are double-width integers;
4449// this is necessary because the single-lane scalar cannot return two values.
4450template <class V, class DW = RepartitionToWide<DFromV<V>>>
4451HWY_API VFromD<DW> ZipLower(V a, V b) {
4452 return BitCast(DW(), InterleaveLower(a, b));
4453}
4454template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4455HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4456 return BitCast(dw, InterleaveLower(D(), a, b));
4457}
4458
4459template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4460HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4461 return BitCast(dw, InterleaveUpper(D(), a, b));
4462}
4463
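// Usage sketch (illustrative only): zipping two u8 vectors yields u16 lanes
// whose low byte comes from a and high byte from b (little-endian).
//   const Full128<uint8_t> d8;
//   const RepartitionToWide<decltype(d8)> d16;
//   const auto w = ZipLower(d16, Set(d8, 1), Set(d8, 2));  // each u16 is 0x0201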
4464// ================================================== COMBINE
4465
4466// ------------------------------ Combine (InterleaveLower)
4467
4468// N = N/2 + N/2 (upper half undefined)
4469template <typename T, size_t N, HWY_IF_LE128(T, N)>
4470HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
4471 Vec128<T, N / 2> lo_half) {
4472 const Half<decltype(d)> d2;
4473 const RebindToUnsigned<decltype(d2)> du2;
4474 // Treat half-width input as one lane, and expand to two lanes.
4475 using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
4476 const VU lo{BitCast(du2, lo_half).raw};
4477 const VU hi{BitCast(du2, hi_half).raw};
4478 return BitCast(d, InterleaveLower(lo, hi));
4479}
4480
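// Usage sketch (illustrative only): Combine inverts LowerHalf/UpperHalf.
//   const Full128<uint32_t> d;
//   const Half<decltype(d)> dh;
//   const auto v = Iota(d, 0);
//   const auto r = Combine(d, UpperHalf(dh, v), LowerHalf(dh, v));  // == v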
4481// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
4482
4483template <typename T, HWY_IF_NOT_FLOAT(T)>
4484HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */, Vec64<T> lo) {
4485 return Vec128<T>{_mm_move_epi64(lo.raw)};
4486}
4487
4488template <typename T, HWY_IF_FLOAT(T)>
4489HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
4490 const RebindToUnsigned<decltype(d)> du;
4491 return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
4492}
4493
4494template <typename T, size_t N, HWY_IF_LE64(T, N)>
4495HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
4496 return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
4497}
4498
4499// ------------------------------ Concat full (InterleaveLower)
4500
4501// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
4502template <typename T>
4503HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4504 const Repartition<uint64_t, decltype(d)> d64;
4505 return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
4506}
4507
4508// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
4509template <typename T>
4510HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4511 const Repartition<uint64_t, decltype(d)> d64;
4512 return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
4513}
4514
4515// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
4516template <typename T>
4517HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
4518 const Vec128<T> lo) {
4519 return CombineShiftRightBytes<8>(d, hi, lo);
4520}
4521
4522// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4523template <typename T>
4524HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4525 const Repartition<double, decltype(d)> dd;
4526#if HWY_TARGET == HWY_SSSE3
4527 return BitCast(
4528 d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
4529 _MM_SHUFFLE2(1, 0))});
4530#else
4531 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
4532 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
4533 BitCast(dd, lo).raw, 1)});
4534#endif
4535}
4536HWY_API Vec128<float> ConcatUpperLower(Full128<float> d, Vec128<float> hi,
4537 Vec128<float> lo) {
4538#if HWY_TARGET == HWY_SSSE3
4539 (void)d;
4540 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
4541#else
4542 // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4543 const RepartitionToWide<decltype(d)> dd;
4544 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
4545 BitCast(dd, lo).raw, 1)});
4546#endif
4547}
4548HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
4549 Vec128<double> hi, Vec128<double> lo) {
4550#if HWY_TARGET == HWY_SSSE3
4551 return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
4552#else
4553 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4554 return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
4555#endif
4556}
4557
4558// ------------------------------ Concat partial (Combine, LowerHalf)
4559
4560template <typename T, size_t N, HWY_IF_LE64(T, N)>
4561HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4562 Vec128<T, N> lo) {
4563 const Half<decltype(d)> d2;
4564 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
4565}
4566
4567template <typename T, size_t N, HWY_IF_LE64(T, N)>
4568HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, Vec128<T, N> hi,
4569 Vec128<T, N> lo) {
4570 const Half<decltype(d)> d2;
4571 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
4572}
4573
4574template <typename T, size_t N, HWY_IF_LE64(T, N)>
4575HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
4576 const Vec128<T, N> lo) {
4577 const Half<decltype(d)> d2;
4578 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
4579}
4580
4581template <typename T, size_t N, HWY_IF_LE64(T, N)>
4582HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4583 Vec128<T, N> lo) {
4584 const Half<decltype(d)> d2;
4585 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
4586}
4587
4588// ------------------------------ ConcatOdd
4589
4590// 8-bit full
4591template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4592HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4593 const Repartition<uint16_t, decltype(d)> dw;
4594 // Right-shift 8 bits per u16 so we can pack.
4595 const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
4596 const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
4597 return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4598}
4599
4600// 8-bit x8
4601template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4602HWY_API Vec64<T> ConcatOdd(Simd<T, 8, 0> d, Vec64<T> hi, Vec64<T> lo) {
4603 const Repartition<uint32_t, decltype(d)> du32;
4604 // Don't care about upper half, no need to zero.
4605 alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
4606 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
4607 const Vec64<T> L = TableLookupBytes(lo, shuf);
4608 const Vec64<T> H = TableLookupBytes(hi, shuf);
4609 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4610}
4611
4612// 8-bit x4
4613template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4614HWY_API Vec32<T> ConcatOdd(Simd<T, 4, 0> d, Vec32<T> hi, Vec32<T> lo) {
4615 const Repartition<uint16_t, decltype(d)> du16;
4616 // Don't care about upper half, no need to zero.
4617 alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
4618 const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
4619 const Vec32<T> L = TableLookupBytes(lo, shuf);
4620 const Vec32<T> H = TableLookupBytes(hi, shuf);
4621 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
4622}
4623
4624// 16-bit full
4625template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4626HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4627 const Repartition<uint32_t, decltype(d)> dw;
4628 // Right-shift 16 bits per u32 so we can pack.
4629 const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
4630 const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
4631 return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4632}
4633
4634// 16-bit x4
4635template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4636HWY_API Vec64<T> ConcatOdd(Simd<T, 4, 0> d, Vec64<T> hi, Vec64<T> lo) {
4637 const Repartition<uint32_t, decltype(d)> du32;
4638 // Don't care about upper half, no need to zero.
4639 alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
4640 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
4641 const Vec64<T> L = TableLookupBytes(lo, shuf);
4642 const Vec64<T> H = TableLookupBytes(hi, shuf);
4643 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4644}
4645
4646// 32-bit full
4647template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4648HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4649 const RebindToFloat<decltype(d)> df;
4650 return BitCast(
4651 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4652 _MM_SHUFFLE(3, 1, 3, 1))});
4653}
4655HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
4656 Vec128<float> lo) {
4657 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
4658}
4659
4660// Any type x2
4661template <typename T>
4662HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4663 Vec128<T, 2> lo) {
4664 return InterleaveUpper(d, lo, hi);
4665}
4666
4667// ------------------------------ ConcatEven (InterleaveLower)
4668
4669// 8-bit full
4670template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4671HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4672 const Repartition<uint16_t, decltype(d)> dw;
4673 // Isolate lower 8 bits per u16 so we can pack.
4674 const Vec128<uint16_t> mask = Set(dw, 0x00FF);
4675 const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
4676 const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
4677 return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4678}
4679
4680// 8-bit x8
4681template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4682HWY_API Vec64<T> ConcatEven(Simd<T, 8, 0> d, Vec64<T> hi, Vec64<T> lo) {
4683 const Repartition<uint32_t, decltype(d)> du32;
4684 // Don't care about upper half, no need to zero.
4685 alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
4686 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
4687 const Vec64<T> L = TableLookupBytes(lo, shuf);
4688 const Vec64<T> H = TableLookupBytes(hi, shuf);
4689 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4690}
4691
4692// 8-bit x4
4693template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4694HWY_API Vec32<T> ConcatEven(Simd<T, 4, 0> d, Vec32<T> hi, Vec32<T> lo) {
4695 const Repartition<uint16_t, decltype(d)> du16;
4696 // Don't care about upper half, no need to zero.
4697 alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
4698 const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
4699 const Vec32<T> L = TableLookupBytes(lo, shuf);
4700 const Vec32<T> H = TableLookupBytes(hi, shuf);
4701 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
4702}
4703
4704// 16-bit full
4705template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4706HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4707 const Repartition<uint32_t, decltype(d)> dw;
4708 // Isolate lower 16 bits per u32 so we can pack.
4709 const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
4710 const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
4711 const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
4712 return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4713}
4714
4715// 16-bit x4
4716template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4717HWY_API Vec64<T> ConcatEven(Simd<T, 4, 0> d, Vec64<T> hi, Vec64<T> lo) {
4718 const Repartition<uint32_t, decltype(d)> du32;
4719 // Don't care about upper half, no need to zero.
4720 alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
4721 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
4722 const Vec64<T> L = TableLookupBytes(lo, shuf);
4723 const Vec64<T> H = TableLookupBytes(hi, shuf);
4724 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4725}
4726
4727// 32-bit full
4728template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4729HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4730 const RebindToFloat<decltype(d)> df;
4731 return BitCast(
4732 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4733 _MM_SHUFFLE(2, 0, 2, 0))});
4734}
4735HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
4736 Vec128<float> lo) {
4737 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
4738}
4739
4740// Any T x2
4741template <typename T>
4742HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4743 Vec128<T, 2> lo) {
4744 return InterleaveLower(d, lo, hi);
4745}
4746
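// Usage sketch (illustrative only): ConcatEven/ConcatOdd deinterleave, e.g.
// splitting packed {re, im} pairs into separate planes. lo supplies the
// lower half of the result and hi the upper half.
//   const Full128<uint16_t> d;
//   const auto lo = Iota(d, 0);            // {0, 1, ..., 7}
//   const auto hi = Iota(d, 8);            // {8, 9, ..., 15}
//   const auto e = ConcatEven(d, hi, lo);  // {0, 2, 4, 6, 8, 10, 12, 14}
//   const auto o = ConcatOdd(d, hi, lo);   // {1, 3, 5, 7, 9, 11, 13, 15}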
4747// ------------------------------ DupEven (InterleaveLower)
4748
4749template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4750HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
4751 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4752}
4753template <size_t N>
4754HWY_API Vec128<float, N> DupEven(Vec128<float, N> v) {
4755 return Vec128<float, N>{
4756 _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4757}
4758
4759template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4760HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4761 return InterleaveLower(DFromV<decltype(v)>(), v, v);
4762}
4763
4764// ------------------------------ DupOdd (InterleaveUpper)
4765
4766template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4767HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
4768 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4769}
4770template <size_t N>
4771HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
4772 return Vec128<float, N>{
4773 _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4774}
4775
4776template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4777HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4778 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
4779}
4780
4781// ------------------------------ OddEven (IfThenElse)
4782
4783template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4784HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4785 const DFromV<decltype(a)> d;
4786 const Repartition<uint8_t, decltype(d)> d8;
4787 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4788 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4789 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4790}
4791
4792template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4793HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4794#if HWY_TARGET == HWY_SSSE3
4795 const DFromV<decltype(a)> d;
4796 const Repartition<uint8_t, decltype(d)> d8;
4797 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4798 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4799 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4800#else
4801 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4802#endif
4803}
4804
4805template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4806HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4807#if HWY_TARGET == HWY_SSSE3
4808 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4809 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4810 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4811#else
4812 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
4813 const DFromV<decltype(a)> d;
4814 const RebindToFloat<decltype(d)> df;
4815 return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
4816 BitCast(df, b).raw, 5)});
4817#endif
4818}
4819
4820template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4821HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4822 // Same as ConcatUpperLower for full vectors; do not call that because this
4823 // is more efficient for 64x1 vectors.
4824 const DFromV<decltype(a)> d;
4825 const RebindToFloat<decltype(d)> dd;
4826#if HWY_TARGET == HWY_SSSE3
4827 return BitCast(
4828 d, Vec128<double, N>{_mm_shuffle_pd(
4829 BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
4830#else
4831 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4832 return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
4833 BitCast(dd, b).raw, 1)});
4834#endif
4835}
4836
4837template <size_t N>
4838HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
4839#if HWY_TARGET == HWY_SSSE3
4840 // SHUFPS must fill the lower half of the output from one input, so we
4841 // need another shuffle. Unpack avoids another immediate byte.
4842 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4843 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4844 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4845#else
4846 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
4847#endif
4848}
4849
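// Usage sketch (illustrative only): OddEven(a, b) takes a in odd-indexed
// lanes and b in even-indexed lanes (lane 0 is even).
//   const Full128<uint32_t> d;
//   const auto r = OddEven(Set(d, 1), Set(d, 0));  // {0, 1, 0, 1}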
4850// ------------------------------ OddEvenBlocks
4851template <typename T, size_t N>
4852HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4853 return even;
4854}
4855
4856// ------------------------------ SwapAdjacentBlocks
4857
4858template <typename T, size_t N>
4859HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4860 return v;
4861}
4862
4863// ------------------------------ Shl (ZipLower, Mul)
4864
4865// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
4866// two from loading float exponents, which is considerably faster (according
4867// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
4868
4869#if HWY_TARGET > HWY_AVX3 // AVX2 or older
4870namespace detail {
4871
4872// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
4873template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4874HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4875 const DFromV<decltype(v)> d;
4876 const RepartitionToWide<decltype(d)> dw;
4877 const Rebind<float, decltype(dw)> df;
4878 const auto zero = Zero(d);
4879 // Move into exponent (this u16 will become the upper half of an f32)
4880 const auto exp = ShiftLeft<23 - 16>(v);
4881 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
4882 // Insert 0 into lower halves for reinterpreting as binary32.
4883 const auto f0 = ZipLower(dw, zero, upper);
4884 const auto f1 = ZipUpper(dw, zero, upper);
4885 // See comment below.
4886 const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
4887 const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
4888 return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
4889}
4890
4891// Same, for 32-bit shifts.
4892template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4893HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4894 const DFromV<decltype(v)> d;
4895 const auto exp = ShiftLeft<23>(v);
4896 const auto f = exp + Set(d, 0x3F800000); // 1.0f
4897 // Do not use ConvertTo because we rely on the native 0x80..00 overflow
4898 // behavior. cvt instead of cvtt should be equivalent, but avoids test
4899 // failure under GCC 10.2.1.
4900 return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
4901}
4902
4903} // namespace detail
4904#endif // HWY_TARGET > HWY_AVX3
4905
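// Worked example of the Pow2 trick above (values illustrative): for a 32-bit
// shift count of 5, ShiftLeft<23> lands 5 in the exponent field; adding the
// bits of 1.0f (0x3F800000) yields the float 2^5, and converting back to
// integer produces the multiplier 32. Hence, on targets without variable
// shifts, Set(d, 3) << Set(d, 5) is computed as 3 * 32 = 96 per lane.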
4906template <size_t N>
4907HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
4908 const Vec128<uint16_t, N> bits) {
4909#if HWY_TARGET <= HWY_AVX3
4910 return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
4911#else
4912 return v * detail::Pow2(bits);
4913#endif
4914}
4915HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
4916 const Vec128<uint16_t, 1> bits) {
4917 return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
4918}
4919
4920template <size_t N>
4921HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
4922 const Vec128<uint32_t, N> bits) {
4923#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4924 return v * detail::Pow2(bits);
4925#else
4926 return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
4927#endif
4928}
4929HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
4930 const Vec128<uint32_t, 1> bits) {
4931 return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
4932}
4933
4934HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
4935 const Vec128<uint64_t> bits) {
4936#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4937 // Individual shifts and combine
4938 const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
4939 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4940 const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
4941 return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
4942#else
4943 return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
4944#endif
4945}
4946HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
4947 const Vec64<uint64_t> bits) {
4948 return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
4949}
4950
4951// Signed left shift is the same as unsigned.
4952template <typename T, size_t N, HWY_IF_SIGNED(T)>
4953HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
4954 const DFromV<decltype(v)> di;
4955 const RebindToUnsigned<decltype(di)> du;
4956 return BitCast(di, BitCast(du, v) << BitCast(du, bits));
4957}
4958
4959// ------------------------------ Shr (mul, mask, BroadcastSignBit)
4960
4961// Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
4962// widening multiplication by powers of two obtained by loading float exponents,
4963// followed by a constant right-shift. This is still faster than a scalar or
4964// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
4965
4966template <size_t N>
4967HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
4968 const Vec128<uint16_t, N> bits) {
4969#if HWY_TARGET <= HWY_AVX3
4970 return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
4971#else
4972 const Simd<uint16_t, N, 0> d;
4973 // For bits=0, we cannot mul by 2^16, so fix the result later.
4974 const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
4975 // Replace output with input where bits == 0.
4976 return IfThenElse(bits == Zero(d), in, out);
4977#endif
4978}
4979HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in,
4980 const Vec128<uint16_t, 1> bits) {
4981 return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
4982}
4983
4984template <size_t N>
4985HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
4986 const Vec128<uint32_t, N> bits) {
4987#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4988 // 32x32 -> 64 bit mul, then shift right by 32.
4989 const Simd<uint32_t, N, 0> d32;
4990 // Move odd lanes into position for the second mul. Shuffle more gracefully
4991 // handles N=1 than repartitioning to u64 and shifting 32 bits right.
4992 const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
4993 // For bits=0, we cannot mul by 2^32, so fix the result later.
4994 const auto mul = detail::Pow2(Set(d32, 32) - bits);
4995 const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
4996 const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
4997 // No need to shift right, already in the correct position.
4998 const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
4999 const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
5000 // Replace output with input where bits == 0.
5001 return IfThenElse(bits == Zero(d32), in, out);
5002#else
5003 return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
5004#endif
5005}
5006HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
5007 const Vec128<uint32_t, 1> bits) {
5008 return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
5009}
5010
5011HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
5012 const Vec128<uint64_t> bits) {
5013#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5014 // Individual shifts and combine
5015 const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
5016 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
5017 const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
5018 return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
5019#else
5020 return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
5021#endif
5022}
5023HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
5024 const Vec64<uint64_t> bits) {
5025 return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
5026}
5027
5028#if HWY_TARGET > HWY_AVX3 // AVX2 or older
5029namespace detail {
5030
5031// Also used in x86_256-inl.h.
5032template <class DI, class V>
5033HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
5034 const RebindToUnsigned<DI> du;
5035 const auto count = BitCast(du, count_i); // same type as value to shift
5036 // Clear sign and restore afterwards. This is preferable to shifting the MSB
5037 // downwards because Shr is somewhat more expensive than Shl.
5038 const auto sign = BroadcastSignBit(v);
5039 const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
5040 return BitCast(di, abs >> count) ^ sign;
5041}
5042
5043} // namespace detail
5044#endif // HWY_TARGET > HWY_AVX3
5045
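// Worked example of the sign trick above, for v = -16 and count = 2:
// sign = all-ones, v ^ sign = 15, 15 >> 2 = 3 (unsigned), and 3 ^ sign = -4,
// matching the arithmetic shift -16 >> 2. This works because, for negative v,
// ~(~v >> count) equals the sign-extending shift.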
5046template <size_t N>
5047HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
5048 const Vec128<int16_t, N> bits) {
5049#if HWY_TARGET <= HWY_AVX3
5050 return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
5051#else
5052 return detail::SignedShr(Simd<int16_t, N, 0>(), v, bits);
5053#endif
5054}
5055HWY_API Vec128<int16_t, 1> operator>>(const Vec128<int16_t, 1> v,
5056 const Vec128<int16_t, 1> bits) {
5057 return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
5058}
5059
5060template <size_t N>
5061HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
5062 const Vec128<int32_t, N> bits) {
5063#if HWY_TARGET <= HWY_AVX3
5064 return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
5065#else
5066 return detail::SignedShr(Simd<int32_t, N, 0>(), v, bits);
5067#endif
5068}
5069HWY_API Vec128<int32_t, 1> operator>>(const Vec128<int32_t, 1> v,
5070 const Vec128<int32_t, 1> bits) {
5071 return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
5072}
5073
5074template <size_t N>
5075HWY_API Vec128<int64_t, N> operator>>(const Vec128<int64_t, N> v,
5076 const Vec128<int64_t, N> bits) {
5077#if HWY_TARGET <= HWY_AVX3
5078 return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
5079#else
5080 return detail::SignedShr(Simd<int64_t, N, 0>(), v, bits);
5081#endif
5082}
5083
5084// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
5085
5086HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
5087 const Vec128<uint64_t> b) {
5088 alignas(16) uint64_t mul[2];
5089 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
5090 return Load(Full128<uint64_t>(), mul);
5091}
5092
5093HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
5094 const Vec128<uint64_t> b) {
5095 alignas(16) uint64_t mul[2];
5096 const Half<Full128<uint64_t>> d2;
5097 mul[0] =
5098 Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
5099 return Load(Full128<uint64_t>(), mul);
5100}
5101
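// Usage sketch (illustrative only): MulEven multiplies the lower u64 lanes
// and returns the full 128-bit product, low 64 bits in lane 0 and high 64
// bits in lane 1; MulOdd does the same for the upper lanes.
//   const Full128<uint64_t> d;
//   const auto p = MulEven(Set(d, 1ULL << 32), Set(d, 1ULL << 32));
//   // GetLane(p) == 0, upper lane == 1, i.e. the product 2^64.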
5102// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5103
5104template <size_t N>
5105HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
5106 Vec128<bfloat16_t, 2 * N> a,
5107 Vec128<bfloat16_t, 2 * N> b,
5108 const Vec128<float, N> sum0,
5109 Vec128<float, N>& sum1) {
5110 // TODO(janwas): _mm_dpbf16_ps when available
5111 const Repartition<uint16_t, decltype(df32)> du16;
5112 const RebindToUnsigned<decltype(df32)> du32;
5113 const Vec128<uint16_t, 2 * N> zero = Zero(du16);
5114 // Lane order within sum0/1 is undefined, hence we can avoid the
5115 // longer-latency lane-crossing PromoteTo.
5116 const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
5117 const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
5118 const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
5119 const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
5120 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
5121 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
5122}
5123
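// Usage sketch (illustrative only; a and b are given bf16 vectors): the lane
// order within sum0/sum1 is unspecified, but their sum is well-defined, so a
// dot product accumulates into both and adds them at the end.
//   const Full128<float> df32;
//   Vec128<float> sum1 = Zero(df32);
//   const auto sum0 = ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
//   const auto partial = Add(sum0, sum1);  // each lane holds a partial sum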
5124// ================================================== CONVERT
5125
5126// ------------------------------ Promotions (part w/ narrow lanes -> full)
5127
5128// Unsigned: zero-extend.
5129template <size_t N>
5130HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
5131 const Vec128<uint8_t, N> v) {
5132#if HWY_TARGET == HWY_SSSE3
5133 const __m128i zero = _mm_setzero_si128();
5134 return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
5135#else
5136 return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
5137#endif
5138}
5139template <size_t N>
5140HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
5141 const Vec128<uint16_t, N> v) {
5142#if HWY_TARGET == HWY_SSSE3
5143 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
5144#else
5145 return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
5146#endif
5147}
5148template <size_t N>
5149HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
5150 const Vec128<uint32_t, N> v) {
5151#if HWY_TARGET == HWY_SSSE3
5152 return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
5153#else
5154 return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
5155#endif
5156}
5157template <size_t N>
5158HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
5159 const Vec128<uint8_t, N> v) {
5160#if HWY_TARGET == HWY_SSSE3
5161 const __m128i zero = _mm_setzero_si128();
5162 const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
5163 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
5164#else
5165 return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
5166#endif
5167}
5168
5169// Unsigned to signed: same plus cast.
5170template <size_t N>
5171HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> di,
5172 const Vec128<uint8_t, N> v) {
5173 return BitCast(di, PromoteTo(Simd<uint16_t, N, 0>(), v));
5174}
5175template <size_t N>
5176HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
5177 const Vec128<uint16_t, N> v) {
5178 return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
5179}
5180template <size_t N>
5181HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
5182 const Vec128<uint8_t, N> v) {
5183 return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
5184}
5185
5186// Signed: replicate sign bit.
5187template <size_t N>
5188HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
5189 const Vec128<int8_t, N> v) {
5190#if HWY_TARGET == HWY_SSSE3
5191 return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
5192#else
5193 return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
5194#endif
5195}
5196template <size_t N>
5197HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
5198 const Vec128<int16_t, N> v) {
5199#if HWY_TARGET == HWY_SSSE3
5200 return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
5201#else
5202 return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
5203#endif
5204}
5205template <size_t N>
5206HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
5207 const Vec128<int32_t, N> v) {
5208#if HWY_TARGET == HWY_SSSE3
5209 return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
5210#else
5211 return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
5212#endif
5213}
5214template <size_t N>
5215HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
5216 const Vec128<int8_t, N> v) {
5217#if HWY_TARGET == HWY_SSSE3
5218 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
5219 const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
5220 return ShiftRight<24>(Vec128<int32_t, N>{x4});
5221#else
5222 return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
5223#endif
5224}
5225
5226// Workaround for origin tracking bug in Clang msan prior to 11.0
5227// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
5228#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
5229#define HWY_INLINE_F16 HWY_NOINLINE
5230#else
5231#define HWY_INLINE_F16 HWY_INLINE
5232#endif
5233template <size_t N>
5234HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
5235 const Vec128<float16_t, N> v) {
5236#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5237 const RebindToSigned<decltype(df32)> di32;
5238 const RebindToUnsigned<decltype(df32)> du32;
5239 // Expand to u32 so we can shift.
5240 const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
5241 const auto sign = ShiftRight<15>(bits16);
5242 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
5243 const auto mantissa = bits16 & Set(du32, 0x3FF);
5244 const auto subnormal =
5245 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
5246 Set(df32, 1.0f / 16384 / 1024));
5247
5248 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
5249 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
5250 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
5251 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
5252 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
5253#else
5254 (void)df32;
5255 return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
5256#endif
5257}
5258
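// Scalar model of the normal-number path above (illustrative sketch;
// ExampleF16BitsToF32Bits is hypothetical, not part of the Highway API).
// For bits16 = 0x3555 (~0.3333): sign=0, biased_exp=0x0D, mantissa=0x155,
// so the f32 bits are (0x0D + 127 - 15) << 23 | 0x155 << 13.
HWY_MAYBE_UNUSED inline uint32_t ExampleF16BitsToF32Bits(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  // Zero/subnormal inputs (biased_exp == 0) instead take the scaled integer
  // conversion path above.
  const uint32_t biased_exp32 = biased_exp + 127 - 15;
  return (sign << 31) | (biased_exp32 << 23) | (mantissa << (23 - 10));
}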
5259template <size_t N>
5260HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
5261 const Vec128<bfloat16_t, N> v) {
5262 const Rebind<uint16_t, decltype(df32)> du16;
5263 const RebindToSigned<decltype(df32)> di32;
5264 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
5265}
5266
5267template <size_t N>
5268HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
5269 const Vec128<float, N> v) {
5270 return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
5271}
5272
5273template <size_t N>
5274HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
5275 const Vec128<int32_t, N> v) {
5276 return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
5277}
5278
5279// ------------------------------ Demotions (full -> part w/ narrow lanes)
5280
5281template <size_t N>
5282HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
5283 const Vec128<int32_t, N> v) {
5284#if HWY_TARGET == HWY_SSSE3
5285 const Simd<int32_t, N, 0> di32;
5286 const Simd<uint16_t, N * 2, 0> du16;
5287 const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
5288 const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
5289 const auto clamped = Or(zero_if_neg, too_big);
5290 // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
5291 alignas(16) constexpr uint16_t kLower2Bytes[16] = {
5292 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
5293 const auto lo2 = Load(du16, kLower2Bytes);
5294 return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
5295#else
5296 return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
5297#endif
5298}
5299
5300template <size_t N>
5301HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
5302 const Vec128<int32_t, N> v) {
5303 return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
5304}
5305
5306template <size_t N>
5307HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
5308 const Vec128<int32_t, N> v) {
5309 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5310 return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
5311}
5312
5313template <size_t N>
5314HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
5315 const Vec128<int16_t, N> v) {
5316 return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
5317}
5318
5319template <size_t N>
5320HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
5321 const Vec128<int32_t, N> v) {
5322 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5323 return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
5324}
5325
5326template <size_t N>
5327HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
5328 const Vec128<int16_t, N> v) {
5329 return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
5330}
5331
5332// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
5333// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
5334HWY_DIAGNOSTICS(push)
5335HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
5336
5337template <size_t N>
5338HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
5339 const Vec128<float, N> v) {
5340#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5341 const RebindToUnsigned<decltype(df16)> du16;
5342 const Rebind<uint32_t, decltype(df16)> du;
5343 const RebindToSigned<decltype(du)> di;
5344 const auto bits32 = BitCast(du, v);
5345 const auto sign = ShiftRight<31>(bits32);
5346 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
5347 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
5348
5349 const auto k15 = Set(di, 15);
5350 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
5351 const auto is_tiny = exp < Set(di, -24);
5352
5353 const auto is_subnormal = exp < Set(di, -14);
5354 const auto biased_exp16 =
5355 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
5356 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
5357 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
5358 (mantissa32 >> (Set(du, 13) + sub_exp));
5359 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
5360 ShiftRight<13>(mantissa32)); // <1024
5361
5362 const auto sign16 = ShiftLeft<15>(sign);
5363 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
5364 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
5365 return BitCast(df16, DemoteTo(du16, bits16));
5366#else
5367 (void)df16;
5368 return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
5369#endif
5370}
5371
5372HWY_DIAGNOSTICS(pop)
5373
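// Example: an illustrative round-trip sketch (ExampleF16RoundTrip is a
// hypothetical helper, not part of the Highway API): values exactly
// representable in binary16 survive DemoteTo followed by PromoteTo.
HWY_MAYBE_UNUSED inline void ExampleF16RoundTrip() {
  const Full128<float> df32;
  const Rebind<float16_t, decltype(df32)> df16;  // four f16 lanes
  const Vec128<float> v = Set(df32, 0.5f);
  const Vec128<float> back = PromoteTo(df32, DemoteTo(df16, v));
  HWY_DASSERT(GetLane(back) == 0.5f);
}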
5374template <size_t N>
5375HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
5376 const Vec128<float, N> v) {
5377 // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
5378 const Rebind<int32_t, decltype(dbf16)> di32;
5379 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
5380 const Rebind<uint16_t, decltype(dbf16)> du16;
5381 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
5382 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
5383}
5384
5385template <size_t N>
5386HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
5387 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
5388 // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
5389 const RebindToUnsigned<decltype(dbf16)> du16;
5390 const Repartition<uint32_t, decltype(dbf16)> du32;
5391 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
5392 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
5393}
5394
5395template <size_t N>
5396HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
5397 const Vec128<double, N> v) {
5398 return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
5399}
5400
5401namespace detail {
5402
5403// For well-defined float->int demotion in all x86_*-inl.h.
5404
5405template <size_t N>
5406HWY_INLINE auto ClampF64ToI32Max(Simd<double, N, 0> d, decltype(Zero(d)) v)
5407 -> decltype(Zero(d)) {
5408 // The max can be exactly represented in binary64, so clamping beforehand
5409 // prevents x86 conversion from raising an exception and returning 80..00.
5410 return Min(v, Set(d, 2147483647.0));
5411}
5412
5413// For ConvertTo float->int of same size, clamping before conversion would
5414// change the result because the max integer value is not exactly representable.
5415// Instead detect the overflow result after conversion and fix it.
5416template <class DI, class DF = RebindToFloat<DI>>
5417HWY_INLINE auto FixConversionOverflow(DI di, VFromD<DF> original,
5418 decltype(Zero(di).raw) converted_raw)
5419 -> VFromD<DI> {
5420 // Combinations of original and output sign:
5421 // --: normal <0 or -huge_val to 80..00: OK
5422 // -+: -0 to 0 : OK
5423 // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
5424 // ++: normal >0 : OK
5425 const auto converted = VFromD<DI>{converted_raw};
5426 const auto sign_wrong = AndNot(BitCast(di, original), converted);
5427#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
5428 // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
5429 // Add() if using that instead. Work around with one more instruction.
5430 const RebindToUnsigned<DI> du;
5431 const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
5432 const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
5433 return IfVecThenElse(mask, max, converted);
5434#else
5435 return Xor(converted, BroadcastSignBit(sign_wrong));
5436#endif
5437}
5438
5439} // namespace detail
5440
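// Scalar model of detail::FixConversionOverflow (illustrative sketch;
// ExampleFixOverflowF32 is hypothetical, not part of the Highway API).
// cvtt returns 0x80000000 on overflow; if the input was non-negative, the
// sign of (~original & converted) is set, and XOR-ing with its broadcast
// flips the result to 0x7FFFFFFF.
HWY_MAYBE_UNUSED inline int32_t ExampleFixOverflowF32(float original,
                                                      int32_t converted) {
  int32_t orig_bits;
  CopyBytes<4>(&original, &orig_bits);
  const int32_t sign_wrong = ~orig_bits & converted;  // MSB: +input overflowed
  // >> 31 broadcasts the MSB (arithmetic shift on mainstream compilers).
  return converted ^ (sign_wrong >> 31);
}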
5441template <size_t N>
5442HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
5443 const Vec128<double, N> v) {
5444 const auto clamped = detail::ClampF64ToI32Max(Simd<double, N, 0>(), v);
5445 return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
5446}
5447
5448// For already range-limited input [0, 255].
5449template <size_t N>
5450HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
5451 const Simd<uint32_t, N, 0> d32;
5452 const Simd<uint8_t, N * 4, 0> d8;
5453 alignas(16) static constexpr uint32_t k8From32[4] = {
5454 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
5455 // Also replicate bytes into all 32-bit lanes for safety.
5456 const auto quad = TableLookupBytes(v, Load(d32, k8From32));
5457 return LowerHalf(LowerHalf(BitCast(d8, quad)));
5458}
5459
5460// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
5461
5462template <size_t N>
5463HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
5464 const Vec128<int32_t, N> v) {
5465 return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
5466}
5467
5468template <size_t N>
5469HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
5470 const Vec128<int64_t, N> v) {
5471#if HWY_TARGET <= HWY_AVX3
5472 (void)dd;
5473 return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
5474#else
5475 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
5476 const Repartition<uint32_t, decltype(dd)> d32;
5477 const Repartition<uint64_t, decltype(dd)> d64;
5478
5479 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
5480 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
5481 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
5482
5483 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
5484 const auto k52 = Set(d32, 0x43300000);
5485 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
5486
5487 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
5488 return (v_upper - k84_63_52) + v_lower; // order matters!
5489#endif
5490}
5491
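// Scalar model of the magic-number path above (illustrative sketch;
// ExampleI64ToF64 is hypothetical, not part of the Highway API): the upper
// 32 bits are embedded in a double with scale 2^32 (exponent 2^84, plus 2^63
// to recenter the signed range), the lower 32 bits in one with scale 2^0
// (exponent 2^52); subtracting the combined bias leaves exactly the input.
HWY_MAYBE_UNUSED inline double ExampleI64ToF64(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  const uint64_t hi_bits = 0x4530000080000000ull ^ (u >> 32);
  const uint64_t lo_bits = 0x4330000000000000ull | (u & 0xFFFFFFFFull);
  const uint64_t bias_bits = 0x4530000080100000ull;  // 2^84 + 2^63 + 2^52
  double hi, lo, bias;
  CopyBytes<8>(&hi_bits, &hi);
  CopyBytes<8>(&lo_bits, &lo);
  CopyBytes<8>(&bias_bits, &bias);
  return (hi - bias) + lo;  // order matters, as above
}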
5492// Truncates (rounds toward zero).
5493template <size_t N>
5494HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
5495 const Vec128<float, N> v) {
5496 return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
5497}
5498
5499// Full (partial handled below)
5500HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
5501#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
5502 return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
5503#elif HWY_ARCH_X86_64
5504 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
5505 const Half<Full128<double>> dd2;
5506 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
5507 return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
5508#else
5509 using VI = VFromD<decltype(di)>;
5510 const VI k0 = Zero(di);
5511 const VI k1 = Set(di, 1);
5512 const VI k51 = Set(di, 51);
5513
5514 // Exponent indicates whether the number can be represented as int64_t.
5515 const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
5516 const VI exp = biased_exp - Set(di, 0x3FF);
5517 const auto in_range = exp < Set(di, 63);
5518
5519 // If we were to cap the exponent at 51 and add 2^52, the number would be in
5520 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
5521 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
5522 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
5523 // manually shift the mantissa into place (we already have many of the
5524 // inputs anyway).
5525 const VI shift_mnt = Max(k51 - exp, k0);
5526 const VI shift_int = Max(exp - k51, k0);
5527 const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
5528 // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
5529 const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
5530 // For inputs larger than 2^52, insert zeros at the bottom.
5531 const VI shifted = int52 << shift_int;
5532 // Restore the one bit lost when shifting in the implicit 1-bit.
5533 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
5534
5535 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
5536 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
5537 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
5538 const VI magnitude = IfThenElse(in_range, restored, limit);
5539
5540 // If the input was negative, negate the integer (two's complement).
5541 return (magnitude ^ sign_mask) - sign_mask;
5542#endif
5543}
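// Scalar model of the shift-based truncation above (illustrative sketch;
// ExampleF64ToI64 is hypothetical, not part of the Highway API; assumes a
// finite input).
HWY_MAYBE_UNUSED inline int64_t ExampleF64ToI64(double x) {
  uint64_t bits;
  CopyBytes<8>(&x, &bits);
  const int64_t exp = static_cast<int64_t>((bits >> 52) & 0x7FF) - 0x3FF;
  if (exp >= 63) {  // saturate, as with the in_range mask above
    return (bits >> 63) ? LimitsMin<int64_t>() : LimitsMax<int64_t>();
  }
  if (exp < 0) return 0;  // |x| < 1 truncates to zero
  const int64_t sign = (bits >> 63) ? -1 : 0;
  const int64_t mantissa =
      static_cast<int64_t>(bits & ((1ull << 52) - 1)) | (1ll << 52);
  const int64_t mag = (exp <= 52) ? (mantissa >> (52 - exp))
                                  : (mantissa << (exp - 52));
  return (mag ^ sign) - sign;  // conditional two's complement negation
}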
5544HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> di, const Vec64<double> v) {
5545 // Only need to specialize for non-AVX3, 64-bit (single scalar op)
5546#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
5547 const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
5548 return detail::FixConversionOverflow(di, v, i0.raw);
5549#else
5550 (void)di;
5551 const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
5552 return Vec64<int64_t>{full.raw};
5553#endif
5554}
5555
5556template <size_t N>
5557HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5558 const Simd<int32_t, N, 0> di;
5559 return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
5560}
5561
5562// ------------------------------ Floating-point rounding (ConvertTo)
5563
5564#if HWY_TARGET == HWY_SSSE3
5565
5566// Toward nearest integer, ties to even
5567template <typename T, size_t N, HWY_IF_FLOAT(T)>
5568HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
5569 // Rely on rounding after addition with a large value such that no mantissa
5570 // bits remain (assuming the current mode is nearest-even). We may need a
5571 // compiler flag for precise floating-point to prevent "optimizing" this out.
5572 const Simd<T, N, 0> df;
5573 const auto max = Set(df, MantissaEnd<T>());
5574 const auto large = CopySignToAbs(max, v);
5575 const auto added = large + v;
5576 const auto rounded = added - large;
5577 // Keep original if NaN or the magnitude is large (already an int).
5578 return IfThenElse(Abs(v) < max, rounded, v);
5579}
5580
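// Scalar model of the add-subtract rounding above (illustrative sketch;
// ExampleRoundToEven is hypothetical, not part of the Highway API): adding
// 2^52 (MantissaEnd for f64) forces the FPU to discard all fraction bits
// under the current round-to-nearest-even mode; subtracting it recovers the
// rounded value.
HWY_MAYBE_UNUSED inline double ExampleRoundToEven(double x) {
  const double kMax = 4503599627370496.0;  // 2^52
  const double abs = (x < 0.0) ? -x : x;
  if (!(abs < kMax)) return x;  // NaN, or magnitude already an integer
  const double large = (x < 0.0) ? -kMax : kMax;
  volatile double sum = large + x;  // volatile deters constant folding
  return sum - large;
}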
5581namespace detail {
5582
5583// Truncating to integer and converting back to float is correct except when the
5584// input magnitude is large, in which case the input was already an integer
5585// (because mantissa >> exponent is zero).
5586template <typename T, size_t N, HWY_IF_FLOAT(T)>
5587HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
5588 return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
5589}
5590
5591} // namespace detail
5592
5593// Toward zero, aka truncate
5594template <typename T, size_t N, HWY_IF_FLOAT(T)>
5595HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
5596 const Simd<T, N, 0> df;
5597 const RebindToSigned<decltype(df)> di;
5598
5599 const auto integer = ConvertTo(di, v); // round toward 0
5600 const auto int_f = ConvertTo(df, integer);
5601
5602 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
5603}
5604
5605// Toward +infinity, aka ceiling
5606template <typename T, size_t N, HWY_IF_FLOAT(T)>
5607HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
5608 const Simd<T, N, 0> df;
5609 const RebindToSigned<decltype(df)> di;
5610
5611 const auto integer = ConvertTo(di, v); // round toward 0
5612 const auto int_f = ConvertTo(df, integer);
5613
5614 // Truncating a positive non-integer ends up smaller; if so, add 1.
5615 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
5616
5617 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
5618}
5619
5620// Toward -infinity, aka floor
5621template <typename T, size_t N, HWY_IF_FLOAT(T)>
5622HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
5623 const Simd<T, N, 0> df;
5624 const RebindToSigned<decltype(df)> di;
5625
5626 const auto integer = ConvertTo(di, v); // round toward 0
5627 const auto int_f = ConvertTo(df, integer);
5628
5629 // Truncating a negative non-integer ends up larger; if so, subtract 1.
5630 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
5631
5632 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
5633}
5634
5635#else
5636
5637// Toward nearest integer, ties to even
5638template <size_t N>
5639HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
5640 return Vec128<float, N>{
5641 _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5642}
5643template <size_t N>
5644HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
5645 return Vec128<double, N>{
5646 _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5647}
5648
5649// Toward zero, aka truncate
5650template <size_t N>
5651HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
5652 return Vec128<float, N>{
5653 _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5654}
5655template <size_t N>
5656HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
5657 return Vec128<double, N>{
5658 _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5659}
5660
5661// Toward +infinity, aka ceiling
5662template <size_t N>
5663HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5664 return Vec128<float, N>{
5665 _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5666}
5667template <size_t N>
5668HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
5669 return Vec128<double, N>{
5670 _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5671}
5672
5673// Toward -infinity, aka floor
5674template <size_t N>
5675HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
5676 return Vec128<float, N>{
5677 _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5678}
5679template <size_t N>
5680HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
5681 return Vec128<double, N>{
5682 _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5683}
5684
5685#endif // !HWY_SSSE3
5686
5687// ------------------------------ Floating-point classification
5688
5689template <size_t N>
5690HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
5691#if HWY_TARGET <= HWY_AVX3
5692 return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x81)};
5693#else
5694 return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
5695#endif
5696}
5697template <size_t N>
5698HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
5699#if HWY_TARGET <= HWY_AVX3
5700 return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x81)};
5701#else
5702 return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
5703#endif
5704}
5705
5706#if HWY_TARGET <= HWY_AVX3
5707
5708template <size_t N>
5709HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
5710 return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x18)};
5711}
5712template <size_t N>
5713HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
5714 return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x18)};
5715}
5716
5717// Returns whether normal/subnormal/zero.
5718template <size_t N>
5719HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
5720 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
5721 // and negate the mask.
5722 return Not(Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x99)});
5723}
5724template <size_t N>
5725HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
5726 return Not(Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x99)});
5727}
5728
5729#else
5730
5731template <typename T, size_t N, HWY_IF_FLOAT(T)>
5732HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
5733 const Simd<T, N, 0> d;
5734 const RebindToSigned<decltype(d)> di;
5735 const VFromD<decltype(di)> vi = BitCast(di, v);
5736 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
5737 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
5738}
5739
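// Scalar model of the test above (illustrative sketch; ExampleIsInfF32 is
// hypothetical, not part of the Highway API): doubling the bit pattern
// discards the sign, so +/-inf compare equal to MaxExponentTimes2 (all-ones
// exponent, zero mantissa, shifted left by one).
HWY_MAYBE_UNUSED inline bool ExampleIsInfF32(float x) {
  uint32_t bits;
  CopyBytes<4>(&x, &bits);
  return (bits + bits) == 0xFF000000u;  // hwy::MaxExponentTimes2<float>()
}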
5740// Returns whether normal/subnormal/zero.
5741template <typename T, size_t N, HWY_IF_FLOAT(T)>
5742HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
5743 const Simd<T, N, 0> d;
5744 const RebindToUnsigned<decltype(d)> du;
5745 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
5746 const VFromD<decltype(du)> vu = BitCast(du, v);
5747 // Shift left to clear the sign bit, then right so we can compare with the
5748 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
5749 // negative and non-negative floats would be greater). MSVC seems to generate
5750 // incorrect code if we instead add vu + vu.
5751 const VFromD<decltype(di)> exp =
5752 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
5753 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
5754}
5755
5756#endif // HWY_TARGET <= HWY_AVX3
5757
5758// ================================================== CRYPTO
5759
5760#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5761
5762// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
5763#ifdef HWY_NATIVE_AES
5764#undef HWY_NATIVE_AES
5765#else
5766#define HWY_NATIVE_AES
5767#endif
5768
5769HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
5770 Vec128<uint8_t> round_key) {
5771 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
5772}
5773
5774HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
5775 Vec128<uint8_t> round_key) {
5776 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
5777}
5778
5779template <size_t N, HWY_IF_LE128(uint64_t, N)>
5780HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
5781 Vec128<uint64_t, N> b) {
5782 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
5783}
5784
5785template <size_t N, HWY_IF_LE128(uint64_t, N)>
5786HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
5787 Vec128<uint64_t, N> b) {
5788 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
5789}
5790
5791#endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5792
5793// ================================================== MISC
5794
5795template <typename T>
5796struct CompressIsPartition {
5797#if HWY_TARGET <= HWY_AVX3
5798 // AVX3 supports native compress, but a table-based approach allows
5799 // 'partitioning' (also moving mask=false lanes to the top), which helps
5800 // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8
5801 // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
5802 // u32x8 etc.).
5803 enum { value = (sizeof(T) == 8) };
5804#else
5805 enum { value = 1 };
5806#endif
5807};
5808
5809#if HWY_TARGET <= HWY_AVX3
5810
5811// ------------------------------ LoadMaskBits
5812
5813// `p` points to at least 8 readable bytes, not all of which need be valid.
5814template <typename T, size_t N, HWY_IF_LE128(T, N)>
5815HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
5816 const uint8_t* HWY_RESTRICT bits) {
5817 uint64_t mask_bits = 0;
5818 constexpr size_t kNumBytes = (N + 7) / 8;
5819 CopyBytes<kNumBytes>(bits, &mask_bits);
5820 if (N < 8) {
5821 mask_bits &= (1ull << N) - 1;
5822 }
5823
5824 return Mask128<T, N>::FromBits(mask_bits);
5825}
5826
5827// ------------------------------ StoreMaskBits
5828
5829// `p` points to at least 8 writable bytes.
5830template <typename T, size_t N>
5831HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
5832 const Mask128<T, N> mask, uint8_t* bits) {
5833 constexpr size_t kNumBytes = (N + 7) / 8;
5834 CopyBytes<kNumBytes>(&mask.raw, bits);
5835
5836 // Non-full byte, need to clear the undefined upper bits.
5837 if (N < 8) {
5838 const int mask = (1 << N) - 1;
5839 bits[0] = static_cast<uint8_t>(bits[0] & mask);
5840 }
5841
5842 return kNumBytes;
5843}
5844
5845// ------------------------------ Mask testing
5846
5847// Beware: the suffix indicates the number of mask bits, not lane size!
5848
5849template <typename T, size_t N>
5850HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
5851 const Mask128<T, N> mask) {
5852 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5853 return PopCount(mask_bits);
5854}
5855
5856template <typename T, size_t N>
5857HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
5858 const Mask128<T, N> mask) {
5859 const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
5860 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
5861}
5862
5863template <typename T, size_t N>
5864HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5865 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5866 return mask_bits == 0;
5867}
5868
5869template <typename T, size_t N>
5870HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5871 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5872 // Cannot use _kortestc because we may have fewer than 8 mask bits.
5873 return mask_bits == (1u << N) - 1;
5874}
5875
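// Example: an illustrative round-trip sketch (ExampleMaskBitsRoundTrip is a
// hypothetical helper, not part of the Highway API). Bit i of bits[0]
// controls lane i; unused upper bits are cleared.
HWY_MAYBE_UNUSED inline void ExampleMaskBitsRoundTrip() {
  const Full128<uint32_t> d;  // four lanes -> only the low 4 bits matter
  const uint8_t bits_in[8] = {0x05};  // lanes 0 and 2 active
  const Mask128<uint32_t> m = LoadMaskBits(d, bits_in);
  HWY_DASSERT(CountTrue(d, m) == 2);
  uint8_t bits_out[8] = {0};
  (void)StoreMaskBits(d, m, bits_out);  // writes 1 byte; bits_out[0] == 0x05
}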
5876// ------------------------------ Compress
5877
5878#if HWY_TARGET != HWY_AVX3_DL
5879namespace detail {
5880
5881// Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256.
5882HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
5883 Full128<uint16_t> du16;
5884 // Table of u16 indices packed into bytes to reduce L1 usage. Will be unpacked
5885 // to u16. Ideally we would broadcast 8*3 (half of the 8 bytes currently used)
5886 // bits into each lane and then varshift, but that does not fit in 16 bits.
5887 Rebind<uint8_t, decltype(du16)> du8;
5888 alignas(16) constexpr uint8_t tbl[2048] = {
5889 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
5890 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
5891 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
5892 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
5893 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
5894 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
5895 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
5896 0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
5897 0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
5898 3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
5899 2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
5900 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
5901 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
5902 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
5903 0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
5904 0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
5905 1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
5906 2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
5907 5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
5908 4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
5909 5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
5910 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
5911 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
5912 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
5913 0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
5914 2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
5915 6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
5916 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
5917 6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
5918 0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
5919 0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
5920 0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
5921 2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
5922 1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
5923 5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
5924 5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
5925 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
5926 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
5927 0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
5928 0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
5929 0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
5930 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
5931 7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
5932 0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
5933 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
5934 0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
5935 0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
5936 0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
5937 1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
5938 3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
5939 4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
5940 3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
5941 0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
5942 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
5943 0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
5944 0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
5945 0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
5946 4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
5947 4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
5948 7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
5949 5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
5950 7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
5951 0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
5952 0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
5953 3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
5954 1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
5955 3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
5956 7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
5957 0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
5958 7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
5959 0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
5960 0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
5961 0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
5962 5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
5963 2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
5964 6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
5965 6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
5966 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
5967 0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
5968 0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
5969 1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
5970 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
5971 return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
5972}
5973
5974} // namespace detail
5975#endif // HWY_TARGET != HWY_AVX3_DL
5976
5977// Single lane: no-op
5978template <typename T>
5979HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
5980 return v;
5981}
5982
5983template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5984HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5985 const Simd<T, N, 0> d;
5986 const Rebind<uint16_t, decltype(d)> du;
5987 const auto vu = BitCast(du, v); // (required for float16_t inputs)
5988
5989#if HWY_TARGET == HWY_AVX3_DL // VBMI2
5990 const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
5991#else
5992 const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
5993 const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
5994#endif // HWY_TARGET != HWY_AVX3_DL
5995 return BitCast(d, cu);
5996}
5997
5998template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5999HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6000 return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
6001}
6002
6003template <size_t N, HWY_IF_GE64(float, N)>
6004HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
6005 return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
6006}
6007
6008template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6009HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6010 HWY_DASSERT(mask.raw < 4);
6011
6012 // There are only 2 lanes, so we can afford to load the index vector directly.
6013 alignas(16) constexpr uint8_t u8_indices[64] = {
6014 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6015 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6016 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6017 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6018
6019 const Full128<T> d;
6020 const Repartition<uint8_t, decltype(d)> d8;
6021 const auto index = Load(d8, u8_indices + 16 * mask.raw);
6022 return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
6023}
6024
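// Example: an illustrative sketch (ExampleCompress64 is a hypothetical
// helper, not part of the Highway API): the single active lane moves to the
// front of the result.
HWY_MAYBE_UNUSED inline void ExampleCompress64() {
  const Full128<uint64_t> d;
  const Vec128<uint64_t> v = Iota(d, 1);  // {1, 2}
  const uint8_t bits[8] = {0x02};         // only lane 1 is active
  const Vec128<uint64_t> c = Compress(v, LoadMaskBits(d, bits));
  HWY_DASSERT(GetLane(c) == 2);
}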
6025// ------------------------------ CompressNot (Compress)
6026
6027// Single lane: no-op
6028template <typename T>
6029HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6030 return v;
6031}
6032
6033template <typename T, size_t N>
6034HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6035 return Compress(v, Not(mask));
6036}
6037
6038// ------------------------------ CompressBlocksNot
6039HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6040 Mask128<uint64_t> /* m */) {
6041 return v;
6042}
6043
6044// ------------------------------ CompressBits (LoadMaskBits)
6045
6046template <typename T, size_t N>
6047HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6048 const uint8_t* HWY_RESTRICT bits) {
6049 return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
6050}
6051
6052// ------------------------------ CompressStore
6053
6054template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6055HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6056 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6057 const Rebind<uint16_t, decltype(d)> du;
6058 const auto vu = BitCast(du, v); // (required for float16_t inputs)
6059
6060 const uint64_t mask_bits{mask.raw};
6061
6062#if HWY_TARGET == HWY_AVX3_DL // VBMI2
6063 _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
6064#else
6065 const auto idx = detail::IndicesForCompress16(mask_bits);
6066 const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
6067 StoreU(BitCast(d, cu), d, unaligned);
6068#endif // HWY_TARGET == HWY_AVX3_DL
6069
6070 const size_t count = PopCount(mask_bits & ((1ull << N) - 1));
6071 // Workaround for MSAN not marking output as initialized (b/233326619)
6072#if HWY_IS_MSAN
6073 __msan_unpoison(unaligned, count * sizeof(T));
6074#endif
6075 return count;
6076}
6077
6078template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6079HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6080 Simd<T, N, 0> /* tag */,
6081 T* HWY_RESTRICT unaligned) {
6082 _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6083 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6084 // Workaround for MSAN not marking output as initialized (b/233326619)
6085#if HWY_IS_MSAN
6086 __msan_unpoison(unaligned, count * sizeof(T));
6087#endif
6088 return count;
6089}
6090
6091template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6092HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6093 Simd<T, N, 0> /* tag */,
6094 T* HWY_RESTRICT unaligned) {
6095 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6096 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6097 // Workaround for MSAN not marking output as initialized (b/233326619)
6098#if HWY_IS_MSAN
6099 __msan_unpoison(unaligned, count * sizeof(T));
6100#endif
6101 return count;
6102}
6103
6104template <size_t N, HWY_IF_LE128(float, N)>
6105HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
6106 Simd<float, N, 0> /* tag */,
6107 float* HWY_RESTRICT unaligned) {
6108 _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6109 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6110 // Workaround for MSAN not marking output as initialized (b/233326619)
6111#if HWY_IS_MSAN
6112 __msan_unpoison(unaligned, count * sizeof(float));
6113#endif
6114 return count;
6115}
6116
6117template <size_t N, HWY_IF_LE128(double, N)>
6118HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
6119 Simd<double, N, 0> /* tag */,
6120 double* HWY_RESTRICT unaligned) {
6121 _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6122 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6123 // Workaround for MSAN not marking output as initialized (b/233326619)
6124#if HWY_IS_MSAN
6125 __msan_unpoison(unaligned, count * sizeof(double));
6126#endif
6127 return count;
6128}
6129
6130// ------------------------------ CompressBlendedStore (CompressStore)
6131template <typename T, size_t N>
6132HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6133 Simd<T, N, 0> d,
6134 T* HWY_RESTRICT unaligned) {
6135 // AVX-512 already does the blending at no extra cost (latency 11,
6136 // reciprocal throughput 2 - same as compress plus store).
6137 if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
6138 // We're relying on the mask to blend. Clear the undefined upper bits.
6139 if (N != 16 / sizeof(T)) {
6140 m = And(m, FirstN(d, N));
6141 }
6142 return CompressStore(v, m, d, unaligned);
6143 } else {
6144 const size_t count = CountTrue(d, m);
6145 const Vec128<T, N> compressed = Compress(v, m);
6146#if HWY_MEM_OPS_MIGHT_FAULT
6147 // BlendedStore tests mask for each lane, but we know that the mask is
6148 // FirstN, so we can just copy.
6149 alignas(16) T buf[N];
6150 Store(compressed, d, buf);
6151 memcpy(unaligned, buf, count * sizeof(T));
6152#else
6153 BlendedStore(compressed, FirstN(d, count), d, unaligned);
6154#endif
6155 // Workaround: as of 2022-02-23 MSAN does not mark the output as
6156 // initialized.
6157#if HWY_IS_MSAN
6158 __msan_unpoison(unaligned, count * sizeof(T));
6159#endif
6160 return count;
6161 }
6162}
6163
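// Example: an illustrative sketch (ExampleCompressPositive is a hypothetical
// helper, not part of the Highway API): packs the positive lanes to the
// front of `out` and returns how many were written; elements of `out` beyond
// that count are left untouched.
HWY_MAYBE_UNUSED inline size_t ExampleCompressPositive(
    const int32_t* HWY_RESTRICT in, int32_t* HWY_RESTRICT out) {
  const Full128<int32_t> d;
  const Vec128<int32_t> v = LoadU(d, in);
  const Mask128<int32_t> m = Gt(v, Zero(d));  // keep lanes > 0
  return CompressBlendedStore(v, m, d, out);
}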
6164// ------------------------------ CompressBitsStore (LoadMaskBits)
6165
6166template <typename T, size_t N>
6167HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6168 const uint8_t* HWY_RESTRICT bits,
6169 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6170 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
6171}
6172
6173#else // AVX2 or below
6174
6175// ------------------------------ LoadMaskBits (TestBit)
6176
6177namespace detail {
6178
6179template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
6180HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6181 const RebindToUnsigned<decltype(d)> du;
6182 // Easier than Set(), which would require an >8-bit type, which would not
6183 // compile for T=uint8_t, N=1.
6184 const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
6185
6186 // Replicate bytes 8x such that each byte contains the bit that governs it.
6187 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
6188 1, 1, 1, 1, 1, 1, 1, 1};
6189 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
6190
6191 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
6192 1, 2, 4, 8, 16, 32, 64, 128};
6193 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
6194}
6195
6196template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6197HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6198 const RebindToUnsigned<decltype(d)> du;
6199 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
6200 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
6201 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
6202}
6203
6204template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6205HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6206 const RebindToUnsigned<decltype(d)> du;
6207 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
6208 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
6209 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
6210}
6211
6212template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6213HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6214 const RebindToUnsigned<decltype(d)> du;
6215 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
6216 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
6217}
6218
6219} // namespace detail
6220
6221// `p` points to at least 8 readable bytes, not all of which need be valid.
6222template <typename T, size_t N, HWY_IF_LE128(T, N)>
6223HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
6224 const uint8_t* HWY_RESTRICT bits) {
6225 uint64_t mask_bits = 0;
6226 constexpr size_t kNumBytes = (N + 7) / 8;
6227 CopyBytes<kNumBytes>(bits, &mask_bits);
6228 if (N < 8) {
6229 mask_bits &= (1ull << N) - 1;
6230 }
6231
6232 return detail::LoadMaskBits(d, mask_bits);
6233}
6234
6235// ------------------------------ StoreMaskBits
6236
6237namespace detail {
6238
6239constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
6240 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
6241}
6242
6243template <typename T, size_t N>
6244HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
6245 const Mask128<T, N> mask) {
6246 const Simd<T, N, 0> d;
6247 const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
6248 return U64FromInt(_mm_movemask_epi8(sign_bits));
6249}
6250
6251template <typename T, size_t N>
6252HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
6253 const Mask128<T, N> mask) {
6254 // Remove useless lower half of each u16 while preserving the sign bit.
6255 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
6256 return U64FromInt(_mm_movemask_epi8(sign_bits));
6257}
6258
6259template <typename T, size_t N>
6260HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
6261 const Mask128<T, N> mask) {
6262 const Simd<T, N, 0> d;
6263 const Simd<float, N, 0> df;
6264 const auto sign_bits = BitCast(df, VecFromMask(d, mask));
6265 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
6266}
6267
6268template <typename T, size_t N>
6269HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
6270 const Mask128<T, N> mask) {
6271 const Simd<T, N, 0> d;
6272 const Simd<double, N, 0> df;
6273 const auto sign_bits = BitCast(df, VecFromMask(d, mask));
6274 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
6275}
6276
6277// Returns the lowest N of the _mm_movemask* bits.
6278template <typename T, size_t N>
6279constexpr uint64_t OnlyActive(uint64_t mask_bits) {
6280 return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
6281}
6282
6283template <typename T, size_t N>
6284HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
6285 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
6286}
6287
6288} // namespace detail
6289
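// Scalar model of BitsFromMask for 4-byte lanes (illustrative sketch;
// ExampleBitsFromSignBits is hypothetical, not part of the Highway API):
// movemask packs the sign (MSB) of each lane into bit i of the result.
HWY_MAYBE_UNUSED inline uint64_t ExampleBitsFromSignBits(
    const uint32_t lanes[4]) {
  uint64_t bits = 0;
  for (int i = 0; i < 4; ++i) {
    bits |= uint64_t{lanes[i] >> 31} << i;  // MSB of lane i -> bit i
  }
  return bits;
}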
6290// `p` points to at least 8 writable bytes.
6291template <typename T, size_t N>
6292HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
6293 const Mask128<T, N> mask, uint8_t* bits) {
6294 constexpr size_t kNumBytes = (N + 7) / 8;
6295 const uint64_t mask_bits = detail::BitsFromMask(mask);
6296 CopyBytes<kNumBytes>(&mask_bits, bits);
6297 return kNumBytes;
6298}
6299
6300// ------------------------------ Mask testing
6301
6302template <typename T, size_t N>
6303HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6304 // Cheaper than PTEST, which is 2 uop / 3L.
6305 return detail::BitsFromMask(mask) == 0;
6306}
6307
6308template <typename T, size_t N>
6309HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6310 constexpr uint64_t kAllBits =
6311 detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
6312 return detail::BitsFromMask(mask) == kAllBits;
6313}
6314
6315template <typename T, size_t N>
6316HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
6317 const Mask128<T, N> mask) {
6318 return PopCount(detail::BitsFromMask(mask));
6319}
6320
6321template <typename T, size_t N>
6322HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
6323 const Mask128<T, N> mask) {
6324 const uint64_t mask_bits = detail::BitsFromMask(mask);
6325 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
6326}
6327
6328// ------------------------------ Compress, CompressBits
6329
6330namespace detail {
6331
6332// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
6333template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6334HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6335 HWY_DASSERT(mask_bits < 256);
6336 const Rebind<uint8_t, decltype(d)> d8;
6337 const Simd<uint16_t, N, 0> du;
6338
6339 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
6340 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
6341 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
6342 // store lane indices and convert to byte indices (2*lane + 0..1), with the
6343 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
6344 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
6345 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
6346 // is likely more costly than the higher cache footprint from storing bytes.
6347 alignas(16) constexpr uint8_t table[2048] = {
6348 // PrintCompress16x8Tables
6349 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6350 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6351 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
6352 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6353 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
6354 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
6355 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
6356 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6357 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
6358 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
6359 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
6360 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
6361 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
6362 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
6363 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
6364 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6365 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
6366 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
6367 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
6368 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
6369 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
6370 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
6371 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
6372 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
6373 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
6374 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
6375 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
6376 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
6377 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
6378 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
6379 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
6380 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6381 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
6382 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
6383 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
6384 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
6385 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
6386 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
6387 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
6388 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
6389 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
6390 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
6391 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
6392 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
6393 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
6394 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
6395 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
6396 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
6397 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
6398 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
6399 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
6400 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
6401 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
6402 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
6403 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
6404 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
6405 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
6406 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
6407 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
6408 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
6409 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
6410 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
6411 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
6412 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6413 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
6414 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
6415 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
6416 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
6417 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
6418 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
6419 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
6420 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
6421 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
6422 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
6423 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
6424 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
6425 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
6426 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
6427 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
6428 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
6429 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
6430 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
6431 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
6432 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
6433 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
6434 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
6435 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
6436 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
6437 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
6438 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
6439 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
6440 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
6441 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
6442 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
6443 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
6444 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
6445 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
6446 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
6447 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
6448 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
6449 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
6450 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
6451 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
6452 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
6453 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
6454 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
6455 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
6456 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
6457 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
6458 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
6459 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
6460 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
6461 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
6462 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
6463 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
6464 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
6465 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
6466 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
6467 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
6468 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
6469 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
6470 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
6471 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
6472 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
6473 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
6474 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
6475 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
6476 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
6477
6478 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6479 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6480 return BitCast(d, pairs + Set(du, 0x0100));
6481}
6482
6483template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6484HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6485 uint64_t mask_bits) {
6486 HWY_DASSERT(mask_bits < 256);
6487 const Rebind<uint8_t, decltype(d)> d8;
6488 const Simd<uint16_t, N, 0> du;
6489
6490 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
6491 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
6492 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
6493 // store lane indices and convert to byte indices (2*lane + 0..1), with the
6494 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
6495 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
6496 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
6497 // is likely more costly than the higher cache footprint from storing bytes.
6498 alignas(16) constexpr uint8_t table[2048] = {
6499 // PrintCompressNot16x8Tables
6500 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
6501 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
6502 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
6503 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
6504 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
6505 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
6506 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
6507 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
6508 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
6509 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
6510 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
6511 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
6512 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
6513 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
6514 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
6515 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
6516 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
6517 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
6518 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
6519 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
6520 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
6521 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
6522 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
6523 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
6524 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
6525 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
6526 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
6527 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
6528 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
6529 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
6530 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
6531 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
6532 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
6533 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
6534 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
6535 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
6536 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
6537 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
6538 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
6539 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
6540 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
6541 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
6542 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
6543 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
6544 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
6545 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
6546 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
6547 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
6548 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
6549 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
6550 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
6551 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
6552 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
6553 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
6554 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
6555 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
6556 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
6557 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
6558 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
6559 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
6560 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
6561 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
6562 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
6563 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
6564 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
6565 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
6566 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
6567 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
6568 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
6569 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
6570 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
6571 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
6572 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
6573 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
6574 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
6575 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
6576 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
6577 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
6578 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
6579 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
6580 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
6581 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
6582 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
6583 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
6584 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
6585 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
6586 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
6587 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
6588 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
6589 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
6590 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
6591 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
6592 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
6593 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
6594 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
6595 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
6596 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
6597 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
6598 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
6599 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
6600 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
6601 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
6602 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
6603 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
6604 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
6605 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
6606 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
6607 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
6608 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
6609 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
6610 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
6611 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
6612 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
6613 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
6614 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
6615 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
6616 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
6617 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
6618 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
6619 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
6620 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
6621 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
6622 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
6623 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
6624 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
6625 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
6626 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
6627 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6628
6629 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6630 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6631 return BitCast(d, pairs + Set(du, 0x0100));
6632}
6633
6634template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6635HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6636 HWY_DASSERT(mask_bits < 16);
6637
6638 // There are only 4 lanes, so we can afford to load the index vector directly.
6639 alignas(16) constexpr uint8_t u8_indices[256] = {
6640 // PrintCompress32x4Tables
6641 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6642 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6643 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
6644 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6645 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
6646 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
6647 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
6648 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6649 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
6650 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
6651 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
6652 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
6653 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
6654 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
6655 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
6656 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6657
6658 const Repartition<uint8_t, decltype(d)> d8;
6659 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6660}
6661
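// Standalone sketch (illustrative) of the PrintCompress32x4Tables layout:
// row 16 * mask_bits lists the byte indices of the selected 32-bit lanes
// first, then the rejected ones. For mask_bits = 0b0101 this reproduces the
// sixth row of the table above.
#include <cstddef>
#include <cstdint>
#include <cstdio>
int main() {
  const uint64_t m = 0b0101;  // lanes 0 and 2 selected
  uint8_t row[16];
  size_t pos = 0;
  for (int lane = 0; lane < 4; ++lane) {  // selected lanes first...
    if ((m >> lane) & 1) {
      for (int b = 0; b < 4; ++b) row[pos++] = static_cast<uint8_t>(4 * lane + b);
    }
  }
  for (int lane = 0; lane < 4; ++lane) {  // ...then the rejected lanes
    if (!((m >> lane) & 1)) {
      for (int b = 0; b < 4; ++b) row[pos++] = static_cast<uint8_t>(4 * lane + b);
    }
  }
  for (size_t i = 0; i < 16; ++i) printf("%d ", row[i]);
  printf("\n");  // 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
  return 0;
}
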
6662template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6663HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6664 uint64_t mask_bits) {
6665 HWY_DASSERT(mask_bits < 16);
6666
6667 // There are only 4 lanes, so we can afford to load the index vector directly.
6668 alignas(16) constexpr uint8_t u8_indices[256] = {
6669 // PrintCompressNot32x4Tables
6670 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6671 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6672 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6673 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6674 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6675 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6676 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6677 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6678 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6679 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6680 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6681 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6682 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6683 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6684 12, 13, 14, 15};
6685
6686 const Repartition<uint8_t, decltype(d)> d8;
6687 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6688}
6689
6690template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6691HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6692 HWY_DASSERT(mask_bits < 4);
6693
6694 // There are only 2 lanes, so we can afford to load the index vector directly.
6695 alignas(16) constexpr uint8_t u8_indices[64] = {
6696 // PrintCompress64x2Tables
6697 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6698 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6699 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6700 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6701
6702 const Repartition<uint8_t, decltype(d)> d8;
6703 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6704}
6705
6706template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6707HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6708 uint64_t mask_bits) {
6709 HWY_DASSERT(mask_bits < 4);
6710
6711 // There are only 2 lanes, so we can afford to load the index vector directly.
6712 alignas(16) constexpr uint8_t u8_indices[64] = {
6713 // PrintCompressNot64x2Tables
6714 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6715 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6716 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6717 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6718
6719 const Repartition<uint8_t, decltype(d)> d8;
6720 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6721}
6722
6723template <typename T, size_t N>
6724HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
6725 const Simd<T, N, 0> d;
6726 const RebindToUnsigned<decltype(d)> du;
6727
6728 HWY_DASSERT(mask_bits < (1ull << N));
6729 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6730 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6731}
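
// For reference, a scalar model (ours; names illustrative) of the semantics
// that the table lookup plus TableLookupBytes implements: selected lanes
// move to the front in order, the rejected lanes fill the tail.
#include <cstddef>
#include <cstdint>
#include <vector>
std::vector<uint32_t> ScalarCompressBits(const std::vector<uint32_t>& lanes,
                                         uint64_t mask_bits) {
  std::vector<uint32_t> out;
  for (size_t i = 0; i < lanes.size(); ++i) {
    if ((mask_bits >> i) & 1) out.push_back(lanes[i]);  // selected first
  }
  for (size_t i = 0; i < lanes.size(); ++i) {
    if (!((mask_bits >> i) & 1)) out.push_back(lanes[i]);  // then the rest
  }
  return out;
}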
6732
6733template <typename T, size_t N>
6734HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
6735 const Simd<T, N, 0> d;
6736 const RebindToUnsigned<decltype(d)> du;
6737
6738 HWY_DASSERT(mask_bits < (1ull << N));
6739 const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
6740 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6741}
6742
6743} // namespace detail
6744
6745// Single lane: no-op
6746template <typename T>
6747HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6748 return v;
6749}
6750
6751// Two lanes: conditional swap
6752template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6753HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6754 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
6755 const Full128<T> d;
6756 const Vec128<T> m = VecFromMask(d, mask);
6757 const Vec128<T> maskL = DupEven(m);
6758 const Vec128<T> maskH = DupOdd(m);
6759 const Vec128<T> swap = AndNot(maskL, maskH);
6760 return IfVecThenElse(swap, Shuffle01(v), v);
6761}
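
// The conditional swap spelled out as a scalar sketch (illustrative):
// AndNot(maskL, maskH) is nonzero exactly when mask[0] = 0 and mask[1] = 1,
// the only case where the selected upper lane must move down.
#include <array>
#include <cstdint>
std::array<uint64_t, 2> Compress2Scalar(std::array<uint64_t, 2> v, bool m0,
                                        bool m1) {
  const bool swap = !m0 && m1;  // mirrors AndNot(maskL, maskH) above
  return swap ? std::array<uint64_t, 2>{v[1], v[0]} : v;
}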
6762
6763// General case
6764template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
6765HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6767}
6768
6769// Single lane: no-op
6770template <typename T>
6771HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6772 return v;
6773}
6774
6775// Two lanes: conditional swap
6776template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6777HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
6778 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
6779 const Full128<T> d;
6780 const Vec128<T> m = VecFromMask(d, mask);
6781 const Vec128<T> maskL = DupEven(m);
6782 const Vec128<T> maskH = DupOdd(m);
6783 const Vec128<T> swap = AndNot(maskH, maskL);
6784 return IfVecThenElse(swap, Shuffle01(v), v);
6785}
6786
6787// General case
6788template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
6789HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6790 // For partial vectors, we cannot pull the Not() into the table because
6791 // BitsFromMask clears the upper bits.
6792 if (N < 16 / sizeof(T)) {
6793 return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
6794 }
6795 return detail::CompressNotBits(v, detail::BitsFromMask(mask));
6796}
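
// A worked example (illustrative numbers) of why the branch above is needed
// for partial vectors: with N = 4 active 16-bit lanes, folding Not into the
// 8-lane table would index it with a complement that selects the
// nonexistent lanes 4..7, whereas Not(mask) before BitsFromMask stays in
// range.
#include <cstdint>
#include <cstdio>
int main() {
  const unsigned kN = 4;         // active lanes of a partial vector
  const uint64_t bits = 0b0101;  // BitsFromMask output: upper bits cleared
  const uint64_t bad = ~bits & 0xFF;                 // 0b11111010
  const uint64_t good = ~bits & ((1ull << kN) - 1);  // 0b1010
  printf("%02llx vs %02llx\n", (unsigned long long)bad,
         (unsigned long long)good);
  return 0;
}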
6797
6798// ------------------------------ CompressBlocksNot
6799HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6800 Mask128<uint64_t> /* m */) {
6801 return v;
6802}
6803
6804template <typename T, size_t N>
6805HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6806 const uint8_t* HWY_RESTRICT bits) {
6807 uint64_t mask_bits = 0;
6808 constexpr size_t kNumBytes = (N + 7) / 8;
6809 CopyBytes<kNumBytes>(bits, &mask_bits);
6810 if (N < 8) {
6811 mask_bits &= (1ull << N) - 1;
6812 }
6813
6814 return detail::CompressBits(v, mask_bits);
6815}
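
// The bits argument packs one bit per lane, LSB-first, rounded up to whole
// bytes; a standalone sketch (illustrative helper, not part of Highway) of
// producing that layout. High bits beyond N are ignored by the masking
// above when N < 8.
#include <cstddef>
#include <cstdint>
void PackMaskBits(const bool* lanes, size_t n, uint8_t* bits) {
  for (size_t i = 0; i < (n + 7) / 8; ++i) bits[i] = 0;
  for (size_t i = 0; i < n; ++i) {
    bits[i / 8] = static_cast<uint8_t>(bits[i / 8] | (uint8_t{lanes[i]} << (i % 8)));
  }
}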
6816
6817// ------------------------------ CompressStore, CompressBitsStore
6818
6819template <typename T, size_t N>
6820HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
6821 T* HWY_RESTRICT unaligned) {
6822 const RebindToUnsigned<decltype(d)> du;
6823
6824 const uint64_t mask_bits = detail::BitsFromMask(m);
6825 HWY_DASSERT(mask_bits < (1ull << N));
6826 const size_t count = PopCount(mask_bits);
6827
6828 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6829 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6830 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6831 StoreU(compressed, d, unaligned);
6832 // Workaround for MSAN not marking output as initialized (b/233326619)
6833#if HWY_IS_MSAN
6834 __msan_unpoison(unaligned, count * sizeof(T));
6835#endif
6836
6837 return count;
6838}
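
// Semantics in scalar form (illustrative): the full vector is stored, with
// the selected lanes first, and the return value says how many of them are
// meaningful. The destination therefore needs room for all N lanes even
// when fewer are selected.
#include <cstddef>
#include <cstdint>
size_t ScalarCompressStore(const uint32_t* v, const bool* m, size_t n,
                           uint32_t* out) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if (m[i]) out[count++] = v[i];
  }
  size_t pos = count;
  for (size_t i = 0; i < n; ++i) {
    if (!m[i]) out[pos++] = v[i];  // rejected lanes fill the tail
  }
  return count;
}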
6839
6840template <typename T, size_t N>
6841HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6842 Simd<T, N, 0> d,
6843 T* HWY_RESTRICT unaligned) {
6844 const RebindToUnsigned<decltype(d)> du;
6845
6846 const uint64_t mask_bits = detail::BitsFromMask(m);
6847 HWY_DASSERT(mask_bits < (1ull << N));
6848 const size_t count = PopCount(mask_bits);
6849
6850 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6851 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6852 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6853 BlendedStore(compressed, FirstN(d, count), d, unaligned);
6854 // Workaround for MSAN not marking output as initialized (b/233326619)
6855#if HWY_IS_MSAN
6856 __msan_unpoison(unaligned, count * sizeof(T));
6857#endif
6858 return count;
6859}
6860
6861template <typename T, size_t N>
6862HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6863 const uint8_t* HWY_RESTRICT bits,
6864 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6865 const RebindToUnsigned<decltype(d)> du;
6866
6867 uint64_t mask_bits = 0;
6868 constexpr size_t kNumBytes = (N + 7) / 8;
6869 CopyBytes<kNumBytes>(bits, &mask_bits);
6870 if (N < 8) {
6871 mask_bits &= (1ull << N) - 1;
6872 }
6873 const size_t count = PopCount(mask_bits);
6874
6875 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6876 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6877 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6878 StoreU(compressed, d, unaligned);
6879
6880 // Workaround for MSAN not marking output as initialized (b/233326619)
6881#if HWY_IS_MSAN
6882 __msan_unpoison(unaligned, count * sizeof(T));
6883#endif
6884 return count;
6885}
6886
6887#endif // HWY_TARGET <= HWY_AVX3
6888
6889// ------------------------------ StoreInterleaved2/3/4
6890
6891// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
6892// generic_ops-inl.h.
6893
6894// ------------------------------ Reductions
6895
6896namespace detail {
6897
6898// N=1 for any T: no-op
6899template <typename T>
6900HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6901 const Vec128<T, 1> v) {
6902 return v;
6903}
6904template <typename T>
6905HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6906 const Vec128<T, 1> v) {
6907 return v;
6908}
6909template <typename T>
6910HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6911 const Vec128<T, 1> v) {
6912 return v;
6913}
6914
6915// u32/i32/f32:
6916
6917// N=2
6918template <typename T>
6919HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
6920 const Vec128<T, 2> v10) {
6921 return v10 + Shuffle2301(v10);
6922}
6923template <typename T>
6924HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
6925 const Vec128<T, 2> v10) {
6926 return Min(v10, Shuffle2301(v10));
6927}
6928template <typename T>
6929HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
6930 const Vec128<T, 2> v10) {
6931 return Max(v10, Shuffle2301(v10));
6932}
6933
6934// N=4 (full)
6935template <typename T>
6936HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
6937 const Vec128<T> v3210) {
6938 const Vec128<T> v1032 = Shuffle1032(v3210);
6939 const Vec128<T> v31_20_31_20 = v3210 + v1032;
6940 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6941 return v20_31_20_31 + v31_20_31_20;
6942}
6943template <typename T>
6944HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
6945 const Vec128<T> v3210) {
6946 const Vec128<T> v1032 = Shuffle1032(v3210);
6947 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
6948 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6949 return Min(v20_31_20_31, v31_20_31_20);
6950}
6951template <typename T>
6952HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
6953 const Vec128<T> v3210) {
6954 const Vec128<T> v1032 = Shuffle1032(v3210);
6955 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
6956 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6957 return Max(v20_31_20_31, v31_20_31_20);
6958}
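
// A scalar trace (illustrative) of the two shuffle+add steps above:
// Shuffle1032 swaps the 64-bit halves, pairing lane i with lane (i+2) mod 4;
// Shuffle0321 then rotates lanes by one, so two adds leave the total in
// every lane.
#include <cstdio>
int main() {
  const float v[4] = {1, 2, 3, 4};  // lane 0 first
  float s1[4], s2[4];
  for (int i = 0; i < 4; ++i) s1[i] = v[i] + v[(i + 2) & 3];    // Shuffle1032
  for (int i = 0; i < 4; ++i) s2[i] = s1[i] + s1[(i + 1) & 3];  // Shuffle0321
  printf("%g %g %g %g\n", s2[0], s2[1], s2[2], s2[3]);  // 10 10 10 10
  return 0;
}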
6959
6960// u64/i64/f64:
6961
6962// N=2 (full)
6963template <typename T>
6964HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
6965 const Vec128<T> v10) {
6966 const Vec128<T> v01 = Shuffle01(v10);
6967 return v10 + v01;
6968}
6969template <typename T>
6970HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
6971 const Vec128<T> v10) {
6972 const Vec128<T> v01 = Shuffle01(v10);
6973 return Min(v10, v01);
6974}
6975template <typename T>
6976HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
6977 const Vec128<T> v10) {
6978 const Vec128<T> v01 = Shuffle01(v10);
6979 return Max(v10, v01);
6980}
6981
6982// u16/i16
6983template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6984HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
6985 const Repartition<int32_t, Simd<T, N, 0>> d32;
6986 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
6987 const auto odd = ShiftRight<16>(BitCast(d32, v));
6988 const auto min = MinOfLanes(d32, Min(even, odd));
6989 // Also broadcast into odd lanes.
6990 return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
6991}
6992template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6993HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
6994 const Repartition<int32_t, Simd<T, N, 0>> d32;
6995 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
6996 const auto odd = ShiftRight<16>(BitCast(d32, v));
6997 const auto max = MaxOfLanes(d32, Max(even, odd));
6998 // Also broadcast into odd lanes.
6999 return BitCast(Simd<T, N, 0>(), Or(max, ShiftLeft<16>(max)));
7000}
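
// The 16-bit strategy in scalar form (illustrative): each 32-bit lane packs
// two 16-bit values, so the even (low) and odd (high) halves are reduced as
// separate 32-bit streams and combined, matching the And/ShiftRight split
// above.
#include <algorithm>
#include <cstddef>
#include <cstdint>
uint16_t Min16ViaU32(const uint32_t* packed, size_t n32) {
  uint32_t even = 0xFFFF, odd = 0xFFFF;
  for (size_t i = 0; i < n32; ++i) {
    even = std::min(even, packed[i] & 0xFFFF);  // low 16-bit halves
    odd = std::min(odd, packed[i] >> 16);       // high 16-bit halves
  }
  return static_cast<uint16_t>(std::min(even, odd));
}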
7001
7002} // namespace detail
7003
7004// Supported for u/i/f 32/64, plus u/i 16 for Min/MaxOfLanes. Returns the
7004// same value in each lane.
7005template <typename T, size_t N>
7006HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7007 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7008}
7009template <typename T, size_t N>
7010HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7011 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7012}
7013template <typename T, size_t N>
7014HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7015 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7016}
7017
7018// ------------------------------ Lt128
7019
7020namespace detail {
7021
7022// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
7023template <class D, class V = VFromD<D>>
7024HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
7025 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
7026 // Truth table of Eq and Lt for Hi and Lo u64.
7027 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
7028 // =H =L cH cL | out = cH | (=H & cL)
7029 // 0 0 0 0 | 0
7030 // 0 0 0 1 | 0
7031 // 0 0 1 0 | 1
7032 // 0 0 1 1 | 1
7033 // 0 1 0 0 | 0
7034 // 0 1 0 1 | 0
7035 // 0 1 1 0 | 1
7036 // 1 0 0 0 | 0
7037 // 1 0 0 1 | 1
7038 // 1 1 0 0 | 0
7039 const auto eqHL = Eq(a, b);
7040 const V ltHL = VecFromMask(d, Lt(a, b));
7041 const V ltLX = ShiftLeftLanes<1>(ltHL);
7042 const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
7043 return InterleaveUpper(d, vecHx, vecHx);
7044}
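
// The truth table collapses to the usual two-word compare; a scalar
// reference (illustrative): out = cH | (=H & cL).
#include <cstdint>
bool Lt128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  return (aH < bH) || (aH == bH && aL < bL);
}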
7045
7046template <class D, class V = VFromD<D>>
7047HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
7048 // No specialization required for AVX-512: Mask <-> Vec is fast, and
7049 // copying mask bits to their neighbor seems infeasible.
7050 const V ltHL = VecFromMask(d, Lt(a, b));
7051 return InterleaveUpper(d, ltHL, ltHL);
7052}
7053
7054} // namespace detail
7055
7056template <class D, class V = VFromD<D>>
7057HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
7058 return MaskFromVec(detail::Lt128Vec(d, a, b));
7059}
7060
7061template <class D, class V = VFromD<D>>
7062HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
7063 return MaskFromVec(detail::Lt128UpperVec(d, a, b));
7064}
7065
7066// ------------------------------ Min128, Max128 (Lt128)
7067
7068// Avoids the extra MaskFromVec in Lt128.
7069template <class D, class V = VFromD<D>>
7070HWY_API V Min128(D d, const V a, const V b) {
7071 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
7072}
7073
7074template <class D, class V = VFromD<D>>
7075HWY_API V Max128(D d, const V a, const V b) {
7076 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
7077}
7078
7079template <class D, class V = VFromD<D>>
7080HWY_API V Min128Upper(D d, const V a, const V b) {
7081 return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
7082}
7083
7084template <class D, class V = VFromD<D>>
7085HWY_API V Max128Upper(D d, const V a, const V b) {
7086 return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
7087}
7088
7089// ================================================== Operator wrapper
7090
7091// These apply to all x86_*-inl.h because there are no restrictions on V.
7092
7093template <class V>
7094HWY_API V Add(V a, V b) {
7095 return a + b;
7096}
7097template <class V>
7098HWY_API V Sub(V a, V b) {
7099 return a - b;
7100}
7101
7102template <class V>
7103HWY_API V Mul(V a, V b) {
7104 return a * b;
7105}
7106template <class V>
7107HWY_API V Div(V a, V b) {
7108 return a / b;
7109}
7110
7111template <class V>
7112V Shl(V a, V b) {
7113 return a << b;
7114}
7115template <class V>
7116V Shr(V a, V b) {
7117 return a >> b;
7118}
7119
7120template <class V>
7121HWY_API auto Eq(V a, V b) -> decltype(a == b) {
7122 return a == b;
7123}
7124template <class V>
7125HWY_API auto Ne(V a, V b) -> decltype(a == b) {
7126 return a != b;
7127}
7128template <class V>
7129HWY_API auto Lt(V a, V b) -> decltype(a == b) {
7130 return a < b;
7131}
7132
7133template <class V>
7134HWY_API auto Gt(V a, V b) -> decltype(a == b) {
7135 return a > b;
7136}
7137template <class V>
7138HWY_API auto Ge(V a, V b) -> decltype(a == b) {
7139 return a >= b;
7140}
7141
7142template <class V>
7143HWY_API auto Le(V a, V b) -> decltype(a == b) {
7144 return a <= b;
7145}
7146
7147// NOLINTNEXTLINE(google-readability-namespace-comments)
7148} // namespace HWY_NAMESPACE
7149} // namespace hwy