Grok  9.7.5
algo-inl.h
Go to the documentation of this file.
1 // Copyright 2021 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Normal include guard for target-independent parts
17 #ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
18 #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
19 
20 #include <stdint.h>
21 #include <string.h> // memcpy
22 
23 #include <algorithm>
24 #include <cmath> // std::abs
25 #include <vector>
26 
27 #include "hwy/base.h"
29 
30 // Third-party algorithms
31 #define HAVE_AVX2SORT 0
32 #define HAVE_IPS4O 0
33 #define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
34 #define HAVE_PDQSORT 0
35 #define HAVE_SORT512 0
36 
37 #if HAVE_AVX2SORT
38 HWY_PUSH_ATTRIBUTES("avx2,avx")
39 #include "avx2sort.h"
41 #endif
42 #if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
43 #include "third_party/ips4o/include/ips4o.hpp"
44 #include "third_party/ips4o/include/ips4o/thread_pool.hpp"
45 #endif
46 #if HAVE_PDQSORT
47 #include "third_party/boost/allowed/sort/sort.hpp"
48 #endif
49 #if HAVE_SORT512
50 #include "sort512.h"
51 #endif
52 
53 namespace hwy {
54 
56 
57 std::vector<Dist> AllDist() {
58  return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
59 }
60 
61 const char* DistName(Dist dist) {
62  switch (dist) {
63  case Dist::kUniform8:
64  return "uniform8";
65  case Dist::kUniform16:
66  return "uniform16";
67  case Dist::kUniform32:
68  return "uniform32";
69  }
70  return "unreachable";
71 }
72 
73 template <typename T>
74 class InputStats {
75  public:
76  void Notify(T value) {
77  min_ = std::min(min_, value);
78  max_ = std::max(max_, value);
79  sumf_ += static_cast<double>(value);
80  count_ += 1;
81  }
82 
83  bool operator==(const InputStats& other) const {
84  if (count_ != other.count_) {
85  HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
86  static_cast<int>(other.count_));
87  }
88 
89  if (min_ != other.min_ || max_ != other.max_) {
90  HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
91  double(other.min_), double(other.max_));
92  }
93 
94  // Sum helps detect duplicated/lost values
95  if (sumf_ != other.sumf_) {
96  // Allow some tolerance because kUniform32 * num can exceed double
97  // precision.
98  const double mul = 1E-9; // prevent destructive cancellation
99  const double err = std::abs(sumf_ * mul - other.sumf_ * mul);
100  if (err > 1E-3) {
101  HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_,
102  other.sumf_, err, double(min_), double(max_));
103  }
104  }
105 
106  return true;
107  }
108 
109  private:
110  T min_ = hwy::HighestValue<T>();
111  T max_ = hwy::LowestValue<T>();
112  double sumf_ = 0.0;
113  size_t count_ = 0;
114 };
115 
116 enum class Algo {
117 #if HAVE_AVX2SORT
118  kSEA,
119 #endif
120 #if HAVE_IPS4O
121  kIPS4O,
122 #endif
123 #if HAVE_PARALLEL_IPS4O
124  kParallelIPS4O,
125 #endif
126 #if HAVE_PDQSORT
127  kPDQ,
128 #endif
129 #if HAVE_SORT512
130  kSort512,
131 #endif
132  kStd,
133  kVQSort,
134  kHeap,
135 };
136 
137 const char* AlgoName(Algo algo) {
138  switch (algo) {
139 #if HAVE_AVX2SORT
140  case Algo::kSEA:
141  return "sea";
142 #endif
143 #if HAVE_IPS4O
144  case Algo::kIPS4O:
145  return "ips4o";
146 #endif
147 #if HAVE_PARALLEL_IPS4O
148  case Algo::kParallelIPS4O:
149  return "par_ips4o";
150 #endif
151 #if HAVE_PDQSORT
152  case Algo::kPDQ:
153  return "pdq";
154 #endif
155 #if HAVE_SORT512
156  case Algo::kSort512:
157  return "sort512";
158 #endif
159  case Algo::kStd:
160  return "std";
161  case Algo::kVQSort:
162  return "vq";
163  case Algo::kHeap:
164  return "heap";
165  }
166  return "unreachable";
167 }
168 
169 } // namespace hwy
170 #endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
171 
172 // Per-target
173 #if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
174  defined(HWY_TARGET_TOGGLE)
175 #ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
176 #undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
177 #else
178 #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
179 #endif
180 
183 #include "hwy/contrib/sort/vqsort-inl.h" // HeapSort
184 #include "hwy/tests/test_util-inl.h"
185 
187 namespace hwy {
188 namespace HWY_NAMESPACE {
189 
191  static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
192  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
193  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
194  return z ^ (z >> 31);
195  }
196 
197  public:
198  // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
199  // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
200  template <class DU64>
201  static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
202  seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
203  for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
204  seeds[i] = SplitMix64(seeds[i - 1]);
205  }
206  }
207 
208  // Need to pass in the state because vector cannot be class members.
209  template <class DU64>
210  static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
211  Vec<DU64>& state1) {
212  Vec<DU64> s1 = state0;
213  Vec<DU64> s0 = state1;
214  const Vec<DU64> bits = Add(s1, s0);
215  state0 = s0;
216  s1 = Xor(s1, ShiftLeft<23>(s1));
217  state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
218  return bits;
219  }
220 };
221 
222 template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
224  const Vec<DU64> mask) {
225  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
226  return And(bits, mask);
227 }
228 
229 // Important to avoid denormals, which are flushed to zero by SIMD but not
230 // scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
231 template <typename T, class DU64, HWY_IF_FLOAT(T)>
232 Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
233  const Vec<DU64> mask) {
234  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
235  const Vec<DU64> values = And(bits, mask);
236 #if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to i32
237  const RebindToSigned<DU64> di;
238 #else
239  const Repartition<MakeSigned<T>, DU64> di;
240 #endif
241  const RebindToFloat<decltype(di)> df;
242  // Avoid NaN/denormal by converting from (range-limited) integer.
243  const Vec<DU64> no_nan =
244  And(values, Set(du64, MantissaMask<MakeUnsigned<T>>()));
245  return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
246 }
247 
248 template <class DU64>
249 Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
250  switch (sizeof_t) {
251  case 2:
252  return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
253  : 0xFFFFFFFFFFFFFFFFull);
254  case 4:
255  return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull
256  : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
257  : 0xFFFFFFFFFFFFFFFFull);
258  case 8:
259  return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull
260  : (dist == Dist::kUniform16) ? 0x000000000000FFFFull
261  : 0x00000000FFFFFFFFull);
262  default:
263  HWY_ABORT("Logic error");
264  return Zero(du64);
265  }
266 }
267 
268 template <typename T>
269 InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
270  SortTag<uint64_t> du64;
271  using VU64 = Vec<decltype(du64)>;
272  const size_t N64 = Lanes(du64);
273  auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
274  Xorshift128Plus::GenerateSeeds(du64, buf.get());
275  auto s0 = Load(du64, buf.get());
276  auto s1 = Load(du64, buf.get() + N64);
277 
278  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
279 
280  const Repartition<T, decltype(du64)> d;
281  const size_t N = Lanes(d);
282  size_t i = 0;
283  for (; i + N <= num; i += N) {
284  const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
285 #if HWY_ARCH_RVV
286  // v may not be 64-bit aligned
287  StoreU(bits, du64, buf.get());
288  memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
289 #else
290  StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
291 #endif
292  }
293  if (i < num) {
294  const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
295  StoreU(bits, du64, buf.get());
296  memcpy(v + i, buf.get(), (num - i) * sizeof(T));
297  }
298 
299  InputStats<T> input_stats;
300  for (size_t i = 0; i < num; ++i) {
301  input_stats.Notify(v[i]);
302  }
303  return input_stats;
304 }
305 
306 struct ThreadLocal {
308 };
309 
310 struct SharedState {
311 #if HAVE_PARALLEL_IPS4O
312  ips4o::StdThreadPool pool{
313  static_cast<int>(std::thread::hardware_concurrency() / 2)};
314 #endif
315  std::vector<ThreadLocal> tls{1};
316 };
317 
318 template <class Order, typename T>
319 void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
320  size_t thread) {
321  using detail::HeapSort;
322  using detail::TraitsLane;
323  using detail::SharedTraits;
324 
325  switch (algo) {
326 #if HAVE_AVX2SORT
327  case Algo::kSEA:
328  return avx2::quicksort(inout, static_cast<int>(num));
329 #endif
330 
331 #if HAVE_IPS4O
332  case Algo::kIPS4O:
333  if (Order().IsAscending()) {
334  return ips4o::sort(inout, inout + num, std::less<T>());
335  } else {
336  return ips4o::sort(inout, inout + num, std::greater<T>());
337  }
338 #endif
339 
340 #if HAVE_PARALLEL_IPS4O
341  case Algo::kParallelIPS4O:
342  if (Order().IsAscending()) {
343  return ips4o::parallel::sort(inout, inout + num, std::less<T>(),
344  shared.pool);
345  } else {
346  return ips4o::parallel::sort(inout, inout + num, std::greater<T>(),
347  shared.pool);
348  }
349 #endif
350 
351 #if HAVE_SORT512
352  case Algo::kSort512:
353  HWY_ABORT("not supported");
354  // return Sort512::Sort(inout, num);
355 #endif
356 
357 #if HAVE_PDQSORT
358  case Algo::kPDQ:
359  if (Order().IsAscending()) {
360  return boost::sort::pdqsort_branchless(inout, inout + num,
361  std::less<T>());
362  } else {
363  return boost::sort::pdqsort_branchless(inout, inout + num,
364  std::greater<T>());
365  }
366 #endif
367 
368  case Algo::kStd:
369  if (Order().IsAscending()) {
370  return std::sort(inout, inout + num, std::less<T>());
371  } else {
372  return std::sort(inout, inout + num, std::greater<T>());
373  }
374 
375  case Algo::kVQSort:
376  return shared.tls[thread].sorter(inout, num, Order());
377 
378  case Algo::kHeap:
379  HWY_ASSERT(sizeof(T) < 16);
380  if (Order().IsAscending()) {
381  const SharedTraits<TraitsLane<detail::OrderAscending>> st;
382  return HeapSort(st, inout, num);
383  } else {
384  const SharedTraits<TraitsLane<detail::OrderDescending>> st;
385  return HeapSort(st, inout, num);
386  }
387 
388  default:
389  HWY_ABORT("Not implemented");
390  }
391 }
392 
393 // NOLINTNEXTLINE(google-readability-namespace-comments)
394 } // namespace HWY_NAMESPACE
395 } // namespace hwy
397 
398 #endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_POP_ATTRIBUTES
Definition: base.h:116
#define HWY_ABORT(format,...)
Definition: base.h:143
#define HWY_INLINE
Definition: base.h:64
#define HWY_PUSH_ATTRIBUTES(targets_str)
Definition: base.h:115
#define HWY_ASSERT(condition)
Definition: base.h:147
Definition: algo-inl.h:190
static void GenerateSeeds(DU64 du64, TFromD< DU64 > *HWY_RESTRICT seeds)
Definition: algo-inl.h:201
static Vec< DU64 > RandomBits(DU64, Vec< DU64 > &state0, Vec< DU64 > &state1)
Definition: algo-inl.h:210
static HWY_INLINE uint64_t SplitMix64(uint64_t z)
Definition: algo-inl.h:191
Definition: algo-inl.h:74
T min_
Definition: algo-inl.h:110
size_t count_
Definition: algo-inl.h:113
T max_
Definition: algo-inl.h:111
bool operator==(const InputStats &other) const
Definition: algo-inl.h:83
void Notify(T value)
Definition: algo-inl.h:76
double sumf_
Definition: algo-inl.h:112
Definition: vqsort.h:44
void HeapSort(Traits st, T *HWY_RESTRICT keys, const size_t num)
Definition: vqsort-inl.h:71
d
Definition: rvv-inl.h:1656
void Run(Algo algo, T *HWY_RESTRICT inout, size_t num, SharedState &shared, size_t thread)
Definition: algo-inl.h:319
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:203
Vec< DU64 > MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t)
Definition: algo-inl.h:249
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
InputStats< T > GenerateInput(const Dist dist, T *v, size_t num)
Definition: algo-inl.h:269
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
N
Definition: rvv-inl.h:1656
ScalableTag< T, -1 > SortTag
Definition: contrib/sort/shared-inl.h:112
Vec< DU64 > RandomValues(DU64 du64, Vec< DU64 > &s0, Vec< DU64 > &s1, const Vec< DU64 > mask)
Definition: algo-inl.h:223
const vfloat64m1_t v
Definition: rvv-inl.h:1656
typename D::T TFromD
Definition: ops/shared-inl.h:192
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
std::vector< Dist > AllDist()
Definition: algo-inl.h:57
const char * AlgoName(Algo algo)
Definition: algo-inl.h:137
Dist
Definition: algo-inl.h:55
const char * DistName(Dist dist)
Definition: algo-inl.h:61
constexpr T MantissaMask()
Definition: base.h:554
Algo
Definition: algo-inl.h:116
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
Definition: algo-inl.h:310
std::vector< ThreadLocal > tls
Definition: algo-inl.h:315
Definition: algo-inl.h:306
Sorter sorter
Definition: algo-inl.h:307
Definition: sorting_networks-inl.h:42
Definition: traits-inl.h:247