sort-inl.h

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target include guard

#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#endif

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

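// When 1, the merge helpers store their inputs/outputs via verify::Runs and
// check pre-/postconditions at runtime; costly, so set to 0 when not
// debugging.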
#define HWY_SORT_VERIFY 1

enum class SortOrder { kAscending, kDescending };

constexpr inline SortOrder Reverse(SortOrder order) {
  return (order == SortOrder::kAscending) ? SortOrder::kDescending
                                          : SortOrder::kAscending;
}

namespace verify {

template <typename T>
bool Compare(T a, T b, SortOrder kOrder) {
  if (kOrder == SortOrder::kAscending) return a <= b;
  return a >= b;
}

#if HWY_SORT_VERIFY

template <class D>
class Runs {
  using T = TFromD<D>;

 public:
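  // Two modes: with run_length == 0 ("quartet mode"), the q-th 128-bit block
  // of every register is deinterleaved so that together these blocks form one
  // run of num_regs * 4 keys. With run_length != 0 ("vector mode"), registers
  // are stored back-to-back and split into runs of run_length consecutive
  // keys, whose sort order optionally alternates from run to run.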
  Runs(D d, size_t num_regs, size_t run_length = 0, bool alternating = false) {
    const size_t N = Lanes(d);

    buf_ = AllocateAligned<T>(N);
    consecutive_ = AllocateAligned<T>(num_regs * N);

    num_regs_ = num_regs;
    if (run_length) {
      run_length_ = run_length;
      num_runs_ = num_regs * N / run_length;
      is_vector_ = true;
      alternating_ = alternating;
    } else {
      run_length_ = num_regs * 4;
      num_runs_ = N / 4;
      is_vector_ = false;
      alternating_ = false;
    }
  }

  void ScatterQuartets(D d, const size_t idx_reg, Vec<D> v) {
    HWY_ASSERT(idx_reg < num_regs_);
    const size_t N = Lanes(d);
    Store(v, d, buf_.get());  // hoisted out of the loop; v does not change
    for (size_t i = 0; i < N; i += 4) {
      const size_t idx_q = (i / 4) * num_regs_ + idx_reg;
      CopyBytes<16>(buf_.get() + i, consecutive_.get() + idx_q * 4);
    }
  }
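
  // Resulting layout for num_regs = 2 (registers A, B) and N = 8, assuming
  // 4-byte T (CopyBytes<16> moves one quartet):
  //   consecutive_ = A0 A1 A2 A3 B0 B1 B2 B3 A4 A5 A6 A7 B4 B5 B6 B7
  // i.e. corresponding quartets of all registers become contiguous runs.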

  void StoreVector(D d, const size_t idx_reg, Vec<D> v) {
    HWY_ASSERT(idx_reg < num_regs_);
    Store(v, d, &consecutive_[idx_reg * Lanes(d)]);
  }

  bool IsBitonic() const {
    HWY_ASSERT(!alternating_);
    for (size_t ir = 0; ir < num_runs_; ++ir) {
      const T* p = &consecutive_[ir * run_length_];
      bool is_asc = true;
      bool is_desc = true;
      bool is_zero = true;

      for (size_t i = 0; i < run_length_ / 2 - 1; ++i) {
        is_asc &= (p[i] <= p[i + 1]);
        is_desc &= (p[i] >= p[i + 1]);
      }
      for (size_t i = 0; i < run_length_; ++i) {
        is_zero &= (p[i] == 0);
      }

      bool is_asc2 = true;
      bool is_desc2 = true;
      for (size_t i = run_length_ / 2; i < run_length_ - 1; ++i) {
        is_asc2 &= (p[i] <= p[i + 1]);
        is_desc2 &= (p[i] >= p[i + 1]);
      }

      if (is_zero) continue;
      if (is_asc && is_desc2) continue;
      if (is_desc && is_asc2) continue;
      return false;
    }
    return true;
  }

  void CheckBitonic(int line, int caller) const {
    if (IsBitonic()) return;
    for (size_t ir = 0; ir < num_runs_; ++ir) {
      const T* p = &consecutive_[ir * run_length_];
      printf("run %zu (len %zu)\n", ir, run_length_);
      for (size_t i = 0; i < run_length_; ++i) {
        printf("%.0f\n", static_cast<float>(p[i]));
      }
    }
    printf("caller %d\n", caller);
    hwy::Abort("", line, "not bitonic");
  }

  void CheckSorted(SortOrder kOrder, int line, int caller) const {
    for (size_t ir = 0; ir < num_runs_; ++ir) {
      const SortOrder order =
          (alternating_ && (ir & 1)) ? Reverse(kOrder) : kOrder;
      const T* p = &consecutive_[ir * run_length_];

      for (size_t i = 0; i < run_length_ - 1; ++i) {
        if (!Compare(p[i], p[i + 1], order)) {
          printf(
              "ir%zu run_length=%zu alt=%d original order=%d this order=%d\n",
              ir, run_length_, alternating_, static_cast<int>(kOrder),
              static_cast<int>(order));
          for (size_t j = 0; j < run_length_; ++j) {  // j: avoid shadowing i
            printf(" %.0f\n", static_cast<float>(p[j]));
          }
          printf("caller %d\n", caller);
          hwy::Abort("", line, "not sorted");
        }
      }
    }
  }

 private:
  AlignedFreeUniquePtr<T[]> buf_;
  AlignedFreeUniquePtr<T[]> consecutive_;
  size_t num_regs_;
  size_t run_length_;
  size_t num_runs_;
  bool is_vector_;
  bool alternating_;
};

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0) {
  Runs<D> runs(d, 1);
  runs.ScatterQuartets(d, 0, v0);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1) {
  Runs<D> runs(d, 2);
  runs.ScatterQuartets(d, 0, v0);
  runs.ScatterQuartets(d, 1, v1);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
                                   Vec<D> v3) {
  Runs<D> runs(d, 4);
  runs.ScatterQuartets(d, 0, v0);
  runs.ScatterQuartets(d, 1, v1);
  runs.ScatterQuartets(d, 2, v2);
  runs.ScatterQuartets(d, 3, v3);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
                                   Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
                                   Vec<D> v7) {
  Runs<D> runs(d, 8);
  runs.ScatterQuartets(d, 0, v0);
  runs.ScatterQuartets(d, 1, v1);
  runs.ScatterQuartets(d, 2, v2);
  runs.ScatterQuartets(d, 3, v3);
  runs.ScatterQuartets(d, 4, v4);
  runs.ScatterQuartets(d, 5, v5);
  runs.ScatterQuartets(d, 6, v6);
  runs.ScatterQuartets(d, 7, v7);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
                                   Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
                                   Vec<D> v7, Vec<D> v8, Vec<D> v9, Vec<D> vA,
                                   Vec<D> vB, Vec<D> vC, Vec<D> vD, Vec<D> vE,
                                   Vec<D> vF) {
  Runs<D> runs(d, 16);
  runs.ScatterQuartets(d, 0x0, v0);
  runs.ScatterQuartets(d, 0x1, v1);
  runs.ScatterQuartets(d, 0x2, v2);
  runs.ScatterQuartets(d, 0x3, v3);
  runs.ScatterQuartets(d, 0x4, v4);
  runs.ScatterQuartets(d, 0x5, v5);
  runs.ScatterQuartets(d, 0x6, v6);
  runs.ScatterQuartets(d, 0x7, v7);
  runs.ScatterQuartets(d, 0x8, v8);
  runs.ScatterQuartets(d, 0x9, v9);
  runs.ScatterQuartets(d, 0xA, vA);
  runs.ScatterQuartets(d, 0xB, vB);
  runs.ScatterQuartets(d, 0xC, vC);
  runs.ScatterQuartets(d, 0xD, vD);
  runs.ScatterQuartets(d, 0xE, vE);
  runs.ScatterQuartets(d, 0xF, vF);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(
    D d, const Vec<D>& v00, const Vec<D>& v01, const Vec<D>& v02,
    const Vec<D>& v03, const Vec<D>& v04, const Vec<D>& v05, const Vec<D>& v06,
    const Vec<D>& v07, const Vec<D>& v08, const Vec<D>& v09, const Vec<D>& v0A,
    const Vec<D>& v0B, const Vec<D>& v0C, const Vec<D>& v0D, const Vec<D>& v0E,
    const Vec<D>& v0F, const Vec<D>& v10, const Vec<D>& v11, const Vec<D>& v12,
    const Vec<D>& v13, const Vec<D>& v14, const Vec<D>& v15, const Vec<D>& v16,
    const Vec<D>& v17, const Vec<D>& v18, const Vec<D>& v19, const Vec<D>& v1A,
    const Vec<D>& v1B, const Vec<D>& v1C, const Vec<D>& v1D, const Vec<D>& v1E,
    const Vec<D>& v1F) {
  Runs<D> runs(d, 32);
  runs.ScatterQuartets(d, 0x00, v00);
  runs.ScatterQuartets(d, 0x01, v01);
  runs.ScatterQuartets(d, 0x02, v02);
  runs.ScatterQuartets(d, 0x03, v03);
  runs.ScatterQuartets(d, 0x04, v04);
  runs.ScatterQuartets(d, 0x05, v05);
  runs.ScatterQuartets(d, 0x06, v06);
  runs.ScatterQuartets(d, 0x07, v07);
  runs.ScatterQuartets(d, 0x08, v08);
  runs.ScatterQuartets(d, 0x09, v09);
  runs.ScatterQuartets(d, 0x0A, v0A);
  runs.ScatterQuartets(d, 0x0B, v0B);
  runs.ScatterQuartets(d, 0x0C, v0C);
  runs.ScatterQuartets(d, 0x0D, v0D);
  runs.ScatterQuartets(d, 0x0E, v0E);
  runs.ScatterQuartets(d, 0x0F, v0F);
  runs.ScatterQuartets(d, 0x10, v10);
  runs.ScatterQuartets(d, 0x11, v11);
  runs.ScatterQuartets(d, 0x12, v12);
  runs.ScatterQuartets(d, 0x13, v13);
  runs.ScatterQuartets(d, 0x14, v14);
  runs.ScatterQuartets(d, 0x15, v15);
  runs.ScatterQuartets(d, 0x16, v16);
  runs.ScatterQuartets(d, 0x17, v17);
  runs.ScatterQuartets(d, 0x18, v18);
  runs.ScatterQuartets(d, 0x19, v19);
  runs.ScatterQuartets(d, 0x1A, v1A);
  runs.ScatterQuartets(d, 0x1B, v1B);
  runs.ScatterQuartets(d, 0x1C, v1C);
  runs.ScatterQuartets(d, 0x1D, v1D);
  runs.ScatterQuartets(d, 0x1E, v1E);
  runs.ScatterQuartets(d, 0x1F, v1F);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, size_t run_length, bool alternating) {
  Runs<D> runs(d, 1, run_length, alternating);
  runs.StoreVector(d, 0, v0);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1) {
  constexpr size_t kRegs = 2;
  Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d),
               /*alternating=*/false);
  runs.StoreVector(d, 0, v0);
  runs.StoreVector(d, 1, v1);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3) {
  constexpr size_t kRegs = 4;
  Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d),
               /*alternating=*/false);
  runs.StoreVector(d, 0, v0);
  runs.StoreVector(d, 1, v1);
  runs.StoreVector(d, 2, v2);
  runs.StoreVector(d, 3, v3);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, Vec<D> v4,
                     Vec<D> v5, Vec<D> v6, Vec<D> v7) {
  constexpr size_t kRegs = 8;
  Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d),
               /*alternating=*/false);
  runs.StoreVector(d, 0, v0);
  runs.StoreVector(d, 1, v1);
  runs.StoreVector(d, 2, v2);
  runs.StoreVector(d, 3, v3);
  runs.StoreVector(d, 4, v4);
  runs.StoreVector(d, 5, v5);
  runs.StoreVector(d, 6, v6);
  runs.StoreVector(d, 7, v7);
  return runs;
}

#endif  // HWY_SORT_VERIFY
}  // namespace verify

namespace detail {

// ------------------------------ Vector-length agnostic (quartets)

// For each lane i: replaces a[i] with the first and b[i] with the second
// according to kOrder.
// Corresponds to a conditional swap, which is one "node" of a sorting network.
// Min/Max are cheaper than compare + blend, at least for integers.
template <SortOrder kOrder, class V>
HWY_INLINE void SortLanesIn2Vectors(V& a, V& b) {
  V temp = a;
  a = (kOrder == SortOrder::kAscending) ? Min(a, b) : Max(a, b);
  b = (kOrder == SortOrder::kAscending) ? Max(temp, b) : Min(temp, b);
}
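
// Example (kAscending): a = {3, 1}, b = {2, 4} -> a = Min = {2, 1},
// b = Max = {3, 4}; each lane of a now holds the smaller of its pair.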

// For each lane: sorts the four values in that lane of the four vectors.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void SortLanesIn4Vectors(D d, const TFromD<D>* in, V& v0, V& v1,
                                    V& v2, V& v3) {
  const size_t N = Lanes(d);

  // Bitonic and odd-even sorters both have 5 nodes. This one is from
  // http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html

  // layer 1
  v0 = Load(d, in + 0 * N);
  v2 = Load(d, in + 2 * N);
  SortLanesIn2Vectors<kOrder>(v0, v2);
  v1 = Load(d, in + 1 * N);
  v3 = Load(d, in + 3 * N);
  SortLanesIn2Vectors<kOrder>(v1, v3);

  // layer 2
  SortLanesIn2Vectors<kOrder>(v0, v1);
  SortLanesIn2Vectors<kOrder>(v2, v3);

  // layer 3
  SortLanesIn2Vectors<kOrder>(v1, v2);
}
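
// Example for one lane (kAscending), column (v0..v3) = (4, 2, 3, 1):
// layer 1 -> (3, 1, 4, 2); layer 2 -> (1, 3, 2, 4); layer 3 -> (1, 2, 3, 4).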

// Inputs are vectors with columns in sorted order (from SortLanesIn4Vectors).
// Transposes so that output vectors are sorted quartets (128-bit blocks),
// and a quartet in v0 comes before its counterpart in v1, etc.
template <class D, class V = Vec<D>>
HWY_INLINE void Transpose4x4(D d, V& v0, V& v1, V& v2, V& v3) {
  const RepartitionToWide<decltype(d)> dw;

  // Input: first number is reg, second is lane (0 is lowest)
  // 03 02 01 00  |
  // 13 12 11 10  | columns are sorted
  // 23 22 21 20  | (in this order)
  // 33 32 31 30  V
  const V t0 = InterleaveLower(d, v0, v1);  // 11 01 10 00
  const V t1 = InterleaveLower(d, v2, v3);  // 31 21 30 20
  const V t2 = InterleaveUpper(d, v0, v1);  // 13 03 12 02
  const V t3 = InterleaveUpper(d, v2, v3);  // 33 23 32 22

  // 30 20 10 00
  v0 = BitCast(d, InterleaveLower(BitCast(dw, t0), BitCast(dw, t1)));
  // 31 21 11 01
  v1 = BitCast(d, InterleaveUpper(BitCast(dw, t0), BitCast(dw, t1)));
  // 32 22 12 02
  v2 = BitCast(d, InterleaveLower(BitCast(dw, t2), BitCast(dw, t3)));
  // 33 23 13 03 --> sorted in descending order (03=smallest in lane 0).
  v3 = BitCast(d, InterleaveUpper(BitCast(dw, t2), BitCast(dw, t3)));
}

// 12 ops (including 4 swizzle)
// Precondition: v0 and v1 are already sorted according to kOrder.
// Postcondition: concatenate(v0, v1) is sorted and v0 is the lower half.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void Merge2SortedQuartets(D d, V& v0, V& v1, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input0 = verify::StoreDeinterleavedQuartets(d, v0);
  const verify::Runs<D> input1 = verify::StoreDeinterleavedQuartets(d, v1);
  input0.CheckSorted(kOrder, __LINE__, caller);
  input1.CheckSorted(kOrder, __LINE__, caller);
#endif

  // See figure 5 from https://www.vldb.org/pvldb/vol8/p1274-inoue.pdf.
  // This requires 8 min/max vs 6 for bitonic merge (see Figure 2 in
  // http://www.vldb.org/pvldb/vol1/1454171.pdf), but is faster overall because
  // it needs less shuffling, and does not need a bitonic input.
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);

#if HWY_SORT_VERIFY
  auto output = verify::StoreDeinterleavedQuartets(d, v0, v1);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
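
// Example (kAscending): v0 = {1, 3, 5, 7}, v1 = {2, 4, 6, 8}. Each round takes
// the lane-wise min/max and rotates v0 by one lane; after four rounds,
// v0 = {1, 2, 3, 4} and v1 = {5, 6, 7, 8}.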

// ------------------------------ Bitonic merge (quartets)

// For the last layer of bitonic merge. Conditionally swaps even-numbered lanes
// with their odd-numbered neighbor. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortAdjacentLanesQV(D d, Vec<D>& q_or_v) {
  // Optimization for 32-bit integers: swap via Shuffle and 64-bit Min/Max.
  // (not worthwhile on SSE4/AVX2 because they lack 64-bit Min/Max)
#if !HWY_ARCH_X86 || HWY_TARGET <= HWY_AVX3
  if (sizeof(TFromD<D>) == 4 && !IsFloat<TFromD<D>>()) {
    const RepartitionToWide<decltype(d)> dw;
    const auto wide = BitCast(dw, q_or_v);
    const auto swap = BitCast(dw, Shuffle2301(q_or_v));
    if (kOrder == SortOrder::kAscending) {
      q_or_v = BitCast(d, Max(wide, swap));
    } else {
      q_or_v = BitCast(d, Min(wide, swap));
    }
  } else
#endif
  {
    Vec<D> swapped = Shuffle2301(q_or_v);
    SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
    q_or_v = OddEven(swapped, q_or_v);
  }
}
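
// Example (kAscending): q_or_v = {3, 1, 4, 2} -> {1, 3, 2, 4}; lanes 0/1 and
// 2/3 each now hold (min, max). The 64-bit Min/Max trick works because the
// pair whose upper (odd) lane is larger also compares larger as a single
// 64-bit integer.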

// Lane 0 with 2, 1 with 3 etc. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance2LanesQV(D d, Vec<D>& q_or_v) {
  const RepartitionToWide<decltype(d)> dw;
  Vec<D> swapped = Shuffle1032(q_or_v);
  SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
  q_or_v = BitCast(d, OddEven(BitCast(dw, swapped), BitCast(dw, q_or_v)));
}

// For all BitonicMerge*: for each block, the concatenation of that block from
// the first half and from the second half of the input vectors must be sorted
// in opposite orders.

// 14 ops (including 4 swizzle)
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge2Quartets(D d, V& q0, V& q1, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input = verify::StoreDeinterleavedQuartets(d, q0, q1);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 4 (2 ops)
  SortLanesIn2Vectors<kOrder>(q0, q1);

  // Layer 2: lane stride 2 (6 ops)
  SortDistance2LanesQV<kOrder>(d, q0);
  SortDistance2LanesQV<kOrder>(d, q1);

  // Layer 3: lane stride 1 (4 ops)
  SortAdjacentLanesQV<kOrder>(d, q0);
  SortAdjacentLanesQV<kOrder>(d, q1);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output = verify::StoreDeinterleavedQuartets(d, q0, q1);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 32 ops, more efficient than three 4+4 merges (36 ops).
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge4Quartets(D d, V& q0, V& q1, V& q2, V& q3,
                                      int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input =
      verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 8
  SortLanesIn2Vectors<kOrder>(q0, q2);
  SortLanesIn2Vectors<kOrder>(q1, q3);

  // Layers 2 to 4
  // Inputs are not fully sorted, so cannot use Merge2SortedQuartets.
  BitonicMerge2Quartets<kOrder>(d, q0, q1, __LINE__);
  BitonicMerge2Quartets<kOrder>(d, q2, q3, __LINE__);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output =
      verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 72 ops.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge8Quartets(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
                                      V& q5, V& q6, V& q7, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input =
      verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 16
  SortLanesIn2Vectors<kOrder>(q0, q4);
  SortLanesIn2Vectors<kOrder>(q1, q5);
  SortLanesIn2Vectors<kOrder>(q2, q6);
  SortLanesIn2Vectors<kOrder>(q3, q7);

  // Layers 2 to 5
  BitonicMerge4Quartets<kOrder>(d, q0, q1, q2, q3, __LINE__);
  BitonicMerge4Quartets<kOrder>(d, q4, q5, q6, q7, __LINE__);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output =
      verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// ------------------------------ Bitonic merge (vectors)

// Lane 0 with 4, 1 with 5 etc. Only used for vectors with at least 8 lanes.
#if HWY_TARGET <= HWY_AVX3

// TODO(janwas): move to op
template <typename T>
Vec512<T> Shuffle128_2020(Vec512<T> a, Vec512<T> b) {
  return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0))};
}

template <typename T>
Vec512<T> Shuffle128_3131(Vec512<T> a, Vec512<T> b) {
  return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(3, 1, 3, 1))};
}

template <typename T>
Vec512<T> Shuffle128_2301(Vec512<T> a, Vec512<T> b) {
  return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

template <typename T>
Vec512<T> OddEven128(Vec512<T> odd, Vec512<T> even) {
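  // Blend mask 0x33 = 0b00110011 selects 64-bit elements 0,1,4,5 (i.e. the
  // 128-bit blocks 0 and 2) from `even`; blocks 1 and 3 come from `odd`.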
  return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
}

template <SortOrder kOrder, class T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 16> d, Vec<decltype(d)>& v) {
  // In: FEDCBA98 76543210
  // Swap 128-bit halves of each 256 bits => BA98FEDC 32107654
  Vec512<T> swapped = Shuffle128_2301(v, v);
  SortLanesIn2Vectors<kOrder>(v, swapped);
  v = OddEven128(swapped, v);
}

#endif

template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 8> d, Vec<decltype(d)>& v) {
  Vec<decltype(d)> swapped = ConcatLowerUpper(d, v, v);
  SortLanesIn2Vectors<kOrder>(v, swapped);
  v = ConcatUpperLower(d, swapped, v);
}

template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 4> /* tag */, ...) {}

// Only used for vectors with at least 16 lanes.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance8LanesV(D d, Vec<D>& v) {
  Vec<D> swapped = ConcatLowerUpper(d, v, v);
  SortLanesIn2Vectors<kOrder>(v, swapped);
  v = ConcatUpperLower(d, swapped, v);
}

// 120 ops. Only used if vectors are at least 8 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                                 V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input =
      verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 32
  SortLanesIn2Vectors<kOrder>(v0, v4);
  SortLanesIn2Vectors<kOrder>(v1, v5);
  SortLanesIn2Vectors<kOrder>(v2, v6);
  SortLanesIn2Vectors<kOrder>(v3, v7);

  // Layer 2: lane stride 16
  SortLanesIn2Vectors<kOrder>(v0, v2);
  SortLanesIn2Vectors<kOrder>(v1, v3);
  SortLanesIn2Vectors<kOrder>(v4, v6);
  SortLanesIn2Vectors<kOrder>(v5, v7);

  // Layer 3: lane stride 8
  SortLanesIn2Vectors<kOrder>(v0, v1);
  SortLanesIn2Vectors<kOrder>(v2, v3);
  SortLanesIn2Vectors<kOrder>(v4, v5);
  SortLanesIn2Vectors<kOrder>(v6, v7);

  // Layer 4: lane stride 4
  SortDistance4LanesV<kOrder>(d, v0);
  SortDistance4LanesV<kOrder>(d, v1);
  SortDistance4LanesV<kOrder>(d, v2);
  SortDistance4LanesV<kOrder>(d, v3);
  SortDistance4LanesV<kOrder>(d, v4);
  SortDistance4LanesV<kOrder>(d, v5);
  SortDistance4LanesV<kOrder>(d, v6);
  SortDistance4LanesV<kOrder>(d, v7);

  // Layer 5: lane stride 2
  SortDistance2LanesQV<kOrder>(d, v0);
  SortDistance2LanesQV<kOrder>(d, v1);
  SortDistance2LanesQV<kOrder>(d, v2);
  SortDistance2LanesQV<kOrder>(d, v3);
  SortDistance2LanesQV<kOrder>(d, v4);
  SortDistance2LanesQV<kOrder>(d, v5);
  SortDistance2LanesQV<kOrder>(d, v6);
  SortDistance2LanesQV<kOrder>(d, v7);

  // Layer 6: lane stride 1
  SortAdjacentLanesQV<kOrder>(d, v0);
  SortAdjacentLanesQV<kOrder>(d, v1);
  SortAdjacentLanesQV<kOrder>(d, v2);
  SortAdjacentLanesQV<kOrder>(d, v3);
  SortAdjacentLanesQV<kOrder>(d, v4);
  SortAdjacentLanesQV<kOrder>(d, v5);
  SortAdjacentLanesQV<kOrder>(d, v6);
  SortAdjacentLanesQV<kOrder>(d, v7);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output =
      verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 60 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input = verify::StoreVectors(d, v0, v1, v2, v3);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 32
  SortLanesIn2Vectors<kOrder>(v0, v2);
  SortLanesIn2Vectors<kOrder>(v1, v3);

  // Layer 2: lane stride 16
  SortLanesIn2Vectors<kOrder>(v0, v1);
  SortLanesIn2Vectors<kOrder>(v2, v3);

  // Layer 3: lane stride 8
  SortDistance8LanesV<kOrder>(d, v0);
  SortDistance8LanesV<kOrder>(d, v1);
  SortDistance8LanesV<kOrder>(d, v2);
  SortDistance8LanesV<kOrder>(d, v3);

  // Layer 4: lane stride 4
  SortDistance4LanesV<kOrder>(d, v0);
  SortDistance4LanesV<kOrder>(d, v1);
  SortDistance4LanesV<kOrder>(d, v2);
  SortDistance4LanesV<kOrder>(d, v3);

  // Layer 5: lane stride 2
  SortDistance2LanesQV<kOrder>(d, v0);
  SortDistance2LanesQV<kOrder>(d, v1);
  SortDistance2LanesQV<kOrder>(d, v2);
  SortDistance2LanesQV<kOrder>(d, v3);

  // Layer 6: lane stride 1
  SortAdjacentLanesQV<kOrder>(d, v0);
  SortAdjacentLanesQV<kOrder>(d, v1);
  SortAdjacentLanesQV<kOrder>(d, v2);
  SortAdjacentLanesQV<kOrder>(d, v3);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output = verify::StoreVectors(d, v0, v1, v2, v3);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 128 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo128(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                                  V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input =
      verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 64
  SortLanesIn2Vectors<kOrder>(v0, v4);
  SortLanesIn2Vectors<kOrder>(v1, v5);
  SortLanesIn2Vectors<kOrder>(v2, v6);
  SortLanesIn2Vectors<kOrder>(v3, v7);

  BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, __LINE__);
  BitonicMergeTo64<kOrder>(d, v4, v5, v6, v7, __LINE__);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output =
      verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// ------------------------------ Vector-length dependent

// Only called when N=4 (single block, so quartets can just be stored).
template <SortOrder kOrder, class D, class V>
HWY_API size_t SingleQuartetPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
                                      V& q5, V& q6, V& q7, TFromD<D>* inout) {
  Store(q0, d, inout + 0 * 4);
  Store(q1, d, inout + 1 * 4);
  Store(q2, d, inout + 2 * 4);
  Store(q3, d, inout + 3 * 4);
  Store(q4, d, inout + 4 * 4);
  Store(q5, d, inout + 5 * 4);
  Store(q6, d, inout + 6 * 4);
  Store(q7, d, inout + 7 * 4);
  return 8 * 4;
}

// Only called when N=8.
template <SortOrder kOrder, class D, class V>
HWY_API size_t TwoQuartetsPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
                                    V& q5, V& q6, V& q7, TFromD<D>* inout) {
  V v0 = ConcatLowerLower(d, q1, q0);
  V v1 = ConcatLowerLower(d, q3, q2);
  V v2 = ConcatLowerLower(d, q5, q4);
  V v3 = ConcatLowerLower(d, q7, q6);
  // TODO(janwas): merge into single table
  V v4 = Reverse(d, ConcatUpperUpper(d, q7, q6));
  V v5 = Reverse(d, ConcatUpperUpper(d, q5, q4));
  V v6 = Reverse(d, ConcatUpperUpper(d, q3, q2));
  V v7 = Reverse(d, ConcatUpperUpper(d, q1, q0));
  detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);

  Store(v0, d, inout + 0 * 8);
  Store(v1, d, inout + 1 * 8);
  Store(v2, d, inout + 2 * 8);
  Store(v3, d, inout + 3 * 8);
  Store(v4, d, inout + 4 * 8);
  Store(v5, d, inout + 5 * 8);
  Store(v6, d, inout + 6 * 8);
  Store(v7, d, inout + 7 * 8);
  return 8 * 8;
}

// Only called when N=16.
template <SortOrder kOrder, typename T, class V>
HWY_API size_t FourQuartetsPerVector(Simd<T, 16> d, V& q0, V& q1, V& q2, V& q3,
                                     V& q4, V& q5, V& q6, V& q7, T* inout) {
  const V q11_01_10_00 = Shuffle128_2020(q0, q1);
  const V q13_03_12_02 = Shuffle128_2020(q2, q3);
  V v0 = Shuffle128_2020(q11_01_10_00, q13_03_12_02);  // 3..0

  const V q15_05_14_04 = Shuffle128_2020(q4, q5);
  const V q17_07_16_06 = Shuffle128_2020(q6, q7);
  V v1 = Shuffle128_2020(q15_05_14_04, q17_07_16_06);  // 7..4

  const V q19_09_18_08 = Shuffle128_3131(q0, q1);
  const V q1b_0b_1a_0a = Shuffle128_3131(q2, q3);
  V v3 = Reverse(d, Shuffle128_2020(q19_09_18_08, q1b_0b_1a_0a));  // b..8

  const V q1d_0d_1c_0c = Shuffle128_3131(q4, q5);
  const V q1f_0f_1e_0e = Shuffle128_3131(q6, q7);
  V v2 = Reverse(d, Shuffle128_2020(q1d_0d_1c_0c, q1f_0f_1e_0e));  // f..c

  detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, -1);

  // TODO(janwas): merge into single table
  V v4 = Shuffle128_3131(q11_01_10_00, q13_03_12_02);              // 13..10
  V v5 = Shuffle128_3131(q15_05_14_04, q17_07_16_06);              // 17..14
  V v7 = Reverse(d, Shuffle128_3131(q19_09_18_08, q1b_0b_1a_0a));  // 1b..18
  V v6 = Reverse(d, Shuffle128_3131(q1d_0d_1c_0c, q1f_0f_1e_0e));  // 1f..1c

  detail::BitonicMergeTo64<Reverse(kOrder)>(d, v4, v5, v6, v7, -1);

  detail::BitonicMergeTo128<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);

  Store(v0, d, inout + 0 * 16);
  Store(v1, d, inout + 1 * 16);
  Store(v2, d, inout + 2 * 16);
  Store(v3, d, inout + 3 * 16);
  Store(v4, d, inout + 4 * 16);
  Store(v5, d, inout + 5 * 16);
  Store(v6, d, inout + 6 * 16);
  Store(v7, d, inout + 7 * 16);
  return 8 * 16;
}

// Avoid needing #if at the call sites.
template <SortOrder kOrder, typename T>
HWY_API size_t TwoQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
  return 0;
}

template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
  return 0;
}

template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 8> /* tag */, ...) {
  return 0;
}

}  // namespace detail

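// Number of keys sorted by one SortBatch call: 8 vectors, i.e. 8 * Lanes(d).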
template <class D>
HWY_API size_t SortBatchSize(D d) {
  const size_t N = Lanes(d);
  if (N == 4) return 32;
  if (N == 8) return 64;
  if (N == 16) return 128;
  return 0;
}

template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D d, TFromD<D>* inout) {
  const size_t N = Lanes(d);

  Vec<D> q0, q1, q2, q3;
  detail::SortLanesIn4Vectors<kOrder>(d, inout, q0, q1, q2, q3);
  detail::Transpose4x4(d, q0, q1, q2, q3);
  detail::Merge2SortedQuartets<kOrder>(d, q0, q1, -1);
  detail::Merge2SortedQuartets<kOrder>(d, q2, q3, -1);

  // Bitonic merges require one input to be in reverse order.
  constexpr SortOrder kReverse = Reverse(kOrder);

  Vec<D> q4, q5, q6, q7;
  detail::SortLanesIn4Vectors<kReverse>(d, inout + 4 * N, q4, q5, q6, q7);
  detail::Transpose4x4(d, q4, q5, q6, q7);
  detail::Merge2SortedQuartets<kReverse>(d, q4, q5, -1);
  detail::Merge2SortedQuartets<kReverse>(d, q6, q7, -1);

  detail::BitonicMerge4Quartets<kOrder>(d, q0, q1, q4, q5, -1);
  detail::BitonicMerge4Quartets<kReverse>(d, q2, q3, q6, q7, -1);

  detail::BitonicMerge8Quartets<kOrder>(d, q0, q1, q4, q5, q2, q3, q6, q7,
                                        __LINE__);

  if (N == 4) {
    return detail::SingleQuartetPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
                                                  q7, inout);
  }

  if (N == 8) {
    return detail::TwoQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
                                                q7, inout);
  }

  return detail::FourQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
                                               q7, inout);
}
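
// Minimal usage sketch (hypothetical buffer/names; assumes a full vector of
// int32_t on an x86 target, so Lanes(d) is 4, 8 or 16):
//   const HWY_FULL(int32_t) d;
//   HWY_ALIGN int32_t keys[128];  // >= SortBatchSize(d) keys, aligned
//   // ... fill keys[0, SortBatchSize(d)) ...
//   const size_t n = SortBatch<SortOrder::kAscending>(d, keys);
//   // keys[0, n) is now sorted in ascending order.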

#endif  // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_