#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#define HAVE_AVX2SORT 0
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#include "third_party/boost/allowed/sort/sort.hpp"
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
                             apply_to = any(function))
#pragma clang attribute push(__attribute__((target("avx2"))), \
                             apply_to = any(function))
#pragma GCC push_options
#pragma GCC target("avx512f,avx512dq")
#pragma GCC target("avx2")
#include "vxsort/machine_traits.avx512.h"
#include "vxsort/machine_traits.avx2.h"
#include "vxsort/vxsort.h"
#pragma clang attribute pop
#pragma GCC pop_options
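// Note (editorial, an assumption about intent): the vxsort headers emit AVX2
// or AVX-512 intrinsics, so their inclusion is wrapped in compiler-specific
// target pushes (`#pragma clang attribute push(target(...))` for Clang,
// `#pragma GCC push_options` plus `#pragma GCC target(...)` for GCC); the
// matching pops above restore the default codegen target afterwards.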
  return "unreachable";
  static_assert(sizeof(T) <= 8, "Expected a built-in type");
  CopyBytes<sizeof(T)>(&value, &bits);
                static_cast<int>(other.count_));
      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
                static_cast<double>(max_), static_cast<double>(other.min_),
                static_cast<double>(other.max_));
      HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
                static_cast<double>(sum_), static_cast<double>(other.sum_),
                static_cast<double>(min_), static_cast<double>(max_));
  T min_ = hwy::HighestValue<T>();
  T max_ = hwy::LowestValue<T>();
#if HAVE_PARALLEL_IPS4O
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
  return "unreachable";
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
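  // Usage sketch (editorial illustration, not part of the original header;
  // assumes the enclosing RNG helper class is named Xorshift128Plus):
  // SplitMix64 above scrambles a 64-bit input into a well-distributed value;
  // GenerateSeeds uses it to fill 2 * Lanes(du64) seed words, which are then
  // loaded as the two xorshift128+ state vectors consumed by RandomBits.
  //   const ScalableTag<uint64_t> du64;
  //   auto seeds = hwy::AllocateAligned<uint64_t>(2 * Lanes(du64));
  //   Xorshift128Plus::GenerateSeeds(du64, seeds.get());
  //   auto s0 = Load(du64, seeds.get());
  //   auto s1 = Load(du64, seeds.get() + Lanes(du64));
  //   const auto bits = Xorshift128Plus::RandomBits(du64, s0, s1);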
  template <class DU64>
    for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
  template <class DU64>
    s1 = Xor(s1, ShiftLeft<23>(s1));
    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
  return And(bits, mask);
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR
  const RebindToSigned<DU64> di;
  const Repartition<MakeSigned<T>, DU64> di;
  const auto k1 = BitCast(du64, Set(df, T{1.0}));
  const auto mantissa = BitCast(du64, Set(du, MantissaMask<T>()));
  const Vec<DU64> no_nan = OrAnd(k1, values, mantissa);
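  // Explanatory note (editorial, an assumption about intent): OrAnd computes
  // k1 | (values & mantissa). The result keeps the sign and exponent bits of
  // 1.0 and randomizes only the mantissa bits, so every lane is a finite
  // value in [1.0, 2.0) and the generated floats can never be NaN or inf.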
                                                    : 0xFFFFFFFFFFFFFFFFull);
                                                    : 0xFFFFFFFFFFFFFFFFull);
                                                    : 0x00000000FFFFFFFFull);
  using VU64 = Vec<decltype(du64)>;
  const size_t N64 = Lanes(du64);
  auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
  auto s0 = Load(du64, buf.get());
  auto s1 = Load(du64, buf.get() + N64);
  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
  for (; i + N <= num; i += N) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV || (HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7)
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
    StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), (num - i) * sizeof(T));
  for (size_t i = 0; i < num; ++i) {
#if HAVE_PARALLEL_IPS4O
  const unsigned max_threads = hwy::LimitsMax<unsigned>();
  ips4o::StdThreadPool pool{static_cast<int>(
      HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
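  // Note (editorial assumption): max_threads acts as an optional cap (here
  // effectively unlimited), and hardware_concurrency() / 2 presumably targets
  // roughly one worker per physical core on SMT machines.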
  std::vector<ThreadLocal> tls{1};
template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
  if (Order().IsAscending()) {
    const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
    const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
template <class Order>
  using detail::SharedTraits;
  using detail::Traits128;
  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = num_keys * 2;
  if (Order().IsAscending()) {
    const SharedTraits<Traits128<detail::OrderAscending128>> st;
    const SharedTraits<Traits128<detail::OrderDescending128>> st;
template <class Order>
  using detail::SharedTraits;
  using detail::Traits128;
  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = num_keys * 2;
  if (Order().IsAscending()) {
    const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
    const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
template <class Order, typename KeyType>
  const std::less<KeyType> less;
  const std::greater<KeyType> greater;
      return avx2::quicksort(inout, static_cast<int>(num));
      if (Order().IsAscending()) {
        return ips4o::sort(inout, inout + num, less);
        return ips4o::sort(inout, inout + num, greater);
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      if (Order().IsAscending()) {
        return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
      return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
      if (Order().IsAscending()) {
        return boost::sort::pdqsort_branchless(inout, inout + num, less);
      return boost::sort::pdqsort_branchless(inout, inout + num, greater);
    case Algo::kVXSort: {
#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
    (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
      fprintf(stderr, "Do not call for target %s\n",
      vxsort::vxsort<KeyType, vxsort::AVX512> vx;
      vxsort::vxsort<KeyType, vxsort::AVX2> vx;
      if (Order().IsAscending()) {
        return vx.sort(inout, inout + num - 1);
      fprintf(stderr, "Skipping VX - does not support descending order\n");
      if (Order().IsAscending()) {
        return std::sort(inout, inout + num, less);
      return std::sort(inout, inout + num, greater);
      return shared.tls[thread].sorter(inout, num, Order());
      return CallHeapSort<Order>(inout, num);
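// Usage sketch (editorial illustration; the Algo/Dist enumerator names and
// the SortAscending tag are assumptions, since only kParallelIPS4O and
// kVXSort appear in the excerpt above):
//   SharedState shared;
//   std::vector<uint64_t> keys(1000);
//   GenerateInput(/*dist=*/Dist::kUniform64, keys.data(), keys.size());
//   Run<SortAscending>(Algo::kStd, keys.data(), keys.size(), shared,
//                      /*thread=*/0);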