Grok 10.0.1
sorting_networks-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target
17#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
20#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
21#else
22#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
23#endif
24
25#include "hwy/contrib/sort/shared-inl.h" // SortConstants
26#include "hwy/highway.h"
27
29namespace hwy {
30namespace HWY_NAMESPACE {
31namespace detail {
32
33#if VQSORT_ENABLED
34
36
37// ------------------------------ SharedTraits
38
39// Code shared between all traits. It's unclear whether these can profitably be
40// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
41// Compare/DupOdd.
42template <class Base>
43struct SharedTraits : public Base {
44 // Conditionally swaps lane 0 with 2, 1 with 3 etc.
45 template <class D>
46 HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
47 const Base* base = static_cast<const Base*>(this);
48 Vec<D> swapped = base->SwapAdjacentPairs(d, v);
49 base->Sort2(d, v, swapped);
50 return base->OddEvenPairs(d, swapped, v);
51 }
52
53 // Swaps with the vector formed by reversing contiguous groups of 8 keys.
54 template <class D>
55 HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
56 const Base* base = static_cast<const Base*>(this);
57 Vec<D> swapped = base->ReverseKeys8(d, v);
58 base->Sort2(d, v, swapped);
59 return base->OddEvenQuads(d, swapped, v);
60 }
61
62 // Swaps with the vector formed by reversing contiguous groups of 8 keys.
63 template <class D>
64 HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
65 const Base* base = static_cast<const Base*>(this);
66 static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
67 Vec<D> swapped = base->ReverseKeys(d, v);
68 base->Sort2(d, v, swapped);
69 return ConcatUpperLower(d, swapped, v); // 8 = half of the vector
70 }
71};
72
73// ------------------------------ Sorting network
74
75// (Green's irregular) sorting network for independent columns in 16 vectors.
76template <class D, class Traits, class V = Vec<D>>
77HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
78 V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
79 V& ve, V& vf) {
80 st.Sort2(d, v0, v1);
81 st.Sort2(d, v2, v3);
82 st.Sort2(d, v4, v5);
83 st.Sort2(d, v6, v7);
84 st.Sort2(d, v8, v9);
85 st.Sort2(d, va, vb);
86 st.Sort2(d, vc, vd);
87 st.Sort2(d, ve, vf);
88 st.Sort2(d, v0, v2);
89 st.Sort2(d, v1, v3);
90 st.Sort2(d, v4, v6);
91 st.Sort2(d, v5, v7);
92 st.Sort2(d, v8, va);
93 st.Sort2(d, v9, vb);
94 st.Sort2(d, vc, ve);
95 st.Sort2(d, vd, vf);
96 st.Sort2(d, v0, v4);
97 st.Sort2(d, v1, v5);
98 st.Sort2(d, v2, v6);
99 st.Sort2(d, v3, v7);
100 st.Sort2(d, v8, vc);
101 st.Sort2(d, v9, vd);
102 st.Sort2(d, va, ve);
103 st.Sort2(d, vb, vf);
104 st.Sort2(d, v0, v8);
105 st.Sort2(d, v1, v9);
106 st.Sort2(d, v2, va);
107 st.Sort2(d, v3, vb);
108 st.Sort2(d, v4, vc);
109 st.Sort2(d, v5, vd);
110 st.Sort2(d, v6, ve);
111 st.Sort2(d, v7, vf);
112 st.Sort2(d, v5, va);
113 st.Sort2(d, v6, v9);
114 st.Sort2(d, v3, vc);
115 st.Sort2(d, v7, vb);
116 st.Sort2(d, vd, ve);
117 st.Sort2(d, v4, v8);
118 st.Sort2(d, v1, v2);
119 st.Sort2(d, v1, v4);
120 st.Sort2(d, v7, vd);
121 st.Sort2(d, v2, v8);
122 st.Sort2(d, vb, ve);
123 st.Sort2(d, v2, v4);
124 st.Sort2(d, v5, v6);
125 st.Sort2(d, v9, va);
126 st.Sort2(d, vb, vd);
127 st.Sort2(d, v3, v8);
128 st.Sort2(d, v7, vc);
129 st.Sort2(d, v3, v5);
130 st.Sort2(d, v6, v8);
131 st.Sort2(d, v7, v9);
132 st.Sort2(d, va, vc);
133 st.Sort2(d, v3, v4);
134 st.Sort2(d, v5, v6);
135 st.Sort2(d, v7, v8);
136 st.Sort2(d, v9, va);
137 st.Sort2(d, vb, vc);
138 st.Sort2(d, v6, v7);
139 st.Sort2(d, v8, v9);
140}
141
142// ------------------------------ Merging networks
143
144// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
145
146template <class D, class Traits, class V = Vec<D>>
147HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
148 V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
149 V& ve, V& vf) {
150 v8 = st.ReverseKeys2(d, v8);
151 v9 = st.ReverseKeys2(d, v9);
152 va = st.ReverseKeys2(d, va);
153 vb = st.ReverseKeys2(d, vb);
154 vc = st.ReverseKeys2(d, vc);
155 vd = st.ReverseKeys2(d, vd);
156 ve = st.ReverseKeys2(d, ve);
157 vf = st.ReverseKeys2(d, vf);
158 st.Sort2(d, v0, vf);
159 st.Sort2(d, v1, ve);
160 st.Sort2(d, v2, vd);
161 st.Sort2(d, v3, vc);
162 st.Sort2(d, v4, vb);
163 st.Sort2(d, v5, va);
164 st.Sort2(d, v6, v9);
165 st.Sort2(d, v7, v8);
166 v4 = st.ReverseKeys2(d, v4);
167 vc = st.ReverseKeys2(d, vc);
168 v5 = st.ReverseKeys2(d, v5);
169 vd = st.ReverseKeys2(d, vd);
170 v6 = st.ReverseKeys2(d, v6);
171 ve = st.ReverseKeys2(d, ve);
172 v7 = st.ReverseKeys2(d, v7);
173 vf = st.ReverseKeys2(d, vf);
174 st.Sort2(d, v0, v7);
175 st.Sort2(d, v8, vf);
176 st.Sort2(d, v1, v6);
177 st.Sort2(d, v9, ve);
178 st.Sort2(d, v2, v5);
179 st.Sort2(d, va, vd);
180 st.Sort2(d, v3, v4);
181 st.Sort2(d, vb, vc);
182 v2 = st.ReverseKeys2(d, v2);
183 v3 = st.ReverseKeys2(d, v3);
184 v6 = st.ReverseKeys2(d, v6);
185 v7 = st.ReverseKeys2(d, v7);
186 va = st.ReverseKeys2(d, va);
187 vb = st.ReverseKeys2(d, vb);
188 ve = st.ReverseKeys2(d, ve);
189 vf = st.ReverseKeys2(d, vf);
190 st.Sort2(d, v0, v3);
191 st.Sort2(d, v1, v2);
192 st.Sort2(d, v4, v7);
193 st.Sort2(d, v5, v6);
194 st.Sort2(d, v8, vb);
195 st.Sort2(d, v9, va);
196 st.Sort2(d, vc, vf);
197 st.Sort2(d, vd, ve);
198 v1 = st.ReverseKeys2(d, v1);
199 v3 = st.ReverseKeys2(d, v3);
200 v5 = st.ReverseKeys2(d, v5);
201 v7 = st.ReverseKeys2(d, v7);
202 v9 = st.ReverseKeys2(d, v9);
203 vb = st.ReverseKeys2(d, vb);
204 vd = st.ReverseKeys2(d, vd);
205 vf = st.ReverseKeys2(d, vf);
206 st.Sort2(d, v0, v1);
207 st.Sort2(d, v2, v3);
208 st.Sort2(d, v4, v5);
209 st.Sort2(d, v6, v7);
210 st.Sort2(d, v8, v9);
211 st.Sort2(d, va, vb);
212 st.Sort2(d, vc, vd);
213 st.Sort2(d, ve, vf);
214 v0 = st.SortPairsDistance1(d, v0);
215 v1 = st.SortPairsDistance1(d, v1);
216 v2 = st.SortPairsDistance1(d, v2);
217 v3 = st.SortPairsDistance1(d, v3);
218 v4 = st.SortPairsDistance1(d, v4);
219 v5 = st.SortPairsDistance1(d, v5);
220 v6 = st.SortPairsDistance1(d, v6);
221 v7 = st.SortPairsDistance1(d, v7);
222 v8 = st.SortPairsDistance1(d, v8);
223 v9 = st.SortPairsDistance1(d, v9);
224 va = st.SortPairsDistance1(d, va);
225 vb = st.SortPairsDistance1(d, vb);
226 vc = st.SortPairsDistance1(d, vc);
227 vd = st.SortPairsDistance1(d, vd);
228 ve = st.SortPairsDistance1(d, ve);
229 vf = st.SortPairsDistance1(d, vf);
230}
231
232template <class D, class Traits, class V = Vec<D>>
233HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
234 V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
235 V& ve, V& vf) {
236 v8 = st.ReverseKeys4(d, v8);
237 v9 = st.ReverseKeys4(d, v9);
238 va = st.ReverseKeys4(d, va);
239 vb = st.ReverseKeys4(d, vb);
240 vc = st.ReverseKeys4(d, vc);
241 vd = st.ReverseKeys4(d, vd);
242 ve = st.ReverseKeys4(d, ve);
243 vf = st.ReverseKeys4(d, vf);
244 st.Sort2(d, v0, vf);
245 st.Sort2(d, v1, ve);
246 st.Sort2(d, v2, vd);
247 st.Sort2(d, v3, vc);
248 st.Sort2(d, v4, vb);
249 st.Sort2(d, v5, va);
250 st.Sort2(d, v6, v9);
251 st.Sort2(d, v7, v8);
252 v4 = st.ReverseKeys4(d, v4);
253 vc = st.ReverseKeys4(d, vc);
254 v5 = st.ReverseKeys4(d, v5);
255 vd = st.ReverseKeys4(d, vd);
256 v6 = st.ReverseKeys4(d, v6);
257 ve = st.ReverseKeys4(d, ve);
258 v7 = st.ReverseKeys4(d, v7);
259 vf = st.ReverseKeys4(d, vf);
260 st.Sort2(d, v0, v7);
261 st.Sort2(d, v8, vf);
262 st.Sort2(d, v1, v6);
263 st.Sort2(d, v9, ve);
264 st.Sort2(d, v2, v5);
265 st.Sort2(d, va, vd);
266 st.Sort2(d, v3, v4);
267 st.Sort2(d, vb, vc);
268 v2 = st.ReverseKeys4(d, v2);
269 v3 = st.ReverseKeys4(d, v3);
270 v6 = st.ReverseKeys4(d, v6);
271 v7 = st.ReverseKeys4(d, v7);
272 va = st.ReverseKeys4(d, va);
273 vb = st.ReverseKeys4(d, vb);
274 ve = st.ReverseKeys4(d, ve);
275 vf = st.ReverseKeys4(d, vf);
276 st.Sort2(d, v0, v3);
277 st.Sort2(d, v1, v2);
278 st.Sort2(d, v4, v7);
279 st.Sort2(d, v5, v6);
280 st.Sort2(d, v8, vb);
281 st.Sort2(d, v9, va);
282 st.Sort2(d, vc, vf);
283 st.Sort2(d, vd, ve);
284 v1 = st.ReverseKeys4(d, v1);
285 v3 = st.ReverseKeys4(d, v3);
286 v5 = st.ReverseKeys4(d, v5);
287 v7 = st.ReverseKeys4(d, v7);
288 v9 = st.ReverseKeys4(d, v9);
289 vb = st.ReverseKeys4(d, vb);
290 vd = st.ReverseKeys4(d, vd);
291 vf = st.ReverseKeys4(d, vf);
292 st.Sort2(d, v0, v1);
293 st.Sort2(d, v2, v3);
294 st.Sort2(d, v4, v5);
295 st.Sort2(d, v6, v7);
296 st.Sort2(d, v8, v9);
297 st.Sort2(d, va, vb);
298 st.Sort2(d, vc, vd);
299 st.Sort2(d, ve, vf);
300 v0 = st.SortPairsReverse4(d, v0);
301 v1 = st.SortPairsReverse4(d, v1);
302 v2 = st.SortPairsReverse4(d, v2);
303 v3 = st.SortPairsReverse4(d, v3);
304 v4 = st.SortPairsReverse4(d, v4);
305 v5 = st.SortPairsReverse4(d, v5);
306 v6 = st.SortPairsReverse4(d, v6);
307 v7 = st.SortPairsReverse4(d, v7);
308 v8 = st.SortPairsReverse4(d, v8);
309 v9 = st.SortPairsReverse4(d, v9);
310 va = st.SortPairsReverse4(d, va);
311 vb = st.SortPairsReverse4(d, vb);
312 vc = st.SortPairsReverse4(d, vc);
313 vd = st.SortPairsReverse4(d, vd);
314 ve = st.SortPairsReverse4(d, ve);
315 vf = st.SortPairsReverse4(d, vf);
316 v0 = st.SortPairsDistance1(d, v0);
317 v1 = st.SortPairsDistance1(d, v1);
318 v2 = st.SortPairsDistance1(d, v2);
319 v3 = st.SortPairsDistance1(d, v3);
320 v4 = st.SortPairsDistance1(d, v4);
321 v5 = st.SortPairsDistance1(d, v5);
322 v6 = st.SortPairsDistance1(d, v6);
323 v7 = st.SortPairsDistance1(d, v7);
324 v8 = st.SortPairsDistance1(d, v8);
325 v9 = st.SortPairsDistance1(d, v9);
326 va = st.SortPairsDistance1(d, va);
327 vb = st.SortPairsDistance1(d, vb);
328 vc = st.SortPairsDistance1(d, vc);
329 vd = st.SortPairsDistance1(d, vd);
330 ve = st.SortPairsDistance1(d, ve);
331 vf = st.SortPairsDistance1(d, vf);
332}
333
334template <class D, class Traits, class V = Vec<D>>
335HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
336 V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
337 V& ve, V& vf) {
338 v8 = st.ReverseKeys8(d, v8);
339 v9 = st.ReverseKeys8(d, v9);
340 va = st.ReverseKeys8(d, va);
341 vb = st.ReverseKeys8(d, vb);
342 vc = st.ReverseKeys8(d, vc);
343 vd = st.ReverseKeys8(d, vd);
344 ve = st.ReverseKeys8(d, ve);
345 vf = st.ReverseKeys8(d, vf);
346 st.Sort2(d, v0, vf);
347 st.Sort2(d, v1, ve);
348 st.Sort2(d, v2, vd);
349 st.Sort2(d, v3, vc);
350 st.Sort2(d, v4, vb);
351 st.Sort2(d, v5, va);
352 st.Sort2(d, v6, v9);
353 st.Sort2(d, v7, v8);
354 v4 = st.ReverseKeys8(d, v4);
355 vc = st.ReverseKeys8(d, vc);
356 v5 = st.ReverseKeys8(d, v5);
357 vd = st.ReverseKeys8(d, vd);
358 v6 = st.ReverseKeys8(d, v6);
359 ve = st.ReverseKeys8(d, ve);
360 v7 = st.ReverseKeys8(d, v7);
361 vf = st.ReverseKeys8(d, vf);
362 st.Sort2(d, v0, v7);
363 st.Sort2(d, v8, vf);
364 st.Sort2(d, v1, v6);
365 st.Sort2(d, v9, ve);
366 st.Sort2(d, v2, v5);
367 st.Sort2(d, va, vd);
368 st.Sort2(d, v3, v4);
369 st.Sort2(d, vb, vc);
370 v2 = st.ReverseKeys8(d, v2);
371 v3 = st.ReverseKeys8(d, v3);
372 v6 = st.ReverseKeys8(d, v6);
373 v7 = st.ReverseKeys8(d, v7);
374 va = st.ReverseKeys8(d, va);
375 vb = st.ReverseKeys8(d, vb);
376 ve = st.ReverseKeys8(d, ve);
377 vf = st.ReverseKeys8(d, vf);
378 st.Sort2(d, v0, v3);
379 st.Sort2(d, v1, v2);
380 st.Sort2(d, v4, v7);
381 st.Sort2(d, v5, v6);
382 st.Sort2(d, v8, vb);
383 st.Sort2(d, v9, va);
384 st.Sort2(d, vc, vf);
385 st.Sort2(d, vd, ve);
386 v1 = st.ReverseKeys8(d, v1);
387 v3 = st.ReverseKeys8(d, v3);
388 v5 = st.ReverseKeys8(d, v5);
389 v7 = st.ReverseKeys8(d, v7);
390 v9 = st.ReverseKeys8(d, v9);
391 vb = st.ReverseKeys8(d, vb);
392 vd = st.ReverseKeys8(d, vd);
393 vf = st.ReverseKeys8(d, vf);
394 st.Sort2(d, v0, v1);
395 st.Sort2(d, v2, v3);
396 st.Sort2(d, v4, v5);
397 st.Sort2(d, v6, v7);
398 st.Sort2(d, v8, v9);
399 st.Sort2(d, va, vb);
400 st.Sort2(d, vc, vd);
401 st.Sort2(d, ve, vf);
402 v0 = st.SortPairsReverse8(d, v0);
403 v1 = st.SortPairsReverse8(d, v1);
404 v2 = st.SortPairsReverse8(d, v2);
405 v3 = st.SortPairsReverse8(d, v3);
406 v4 = st.SortPairsReverse8(d, v4);
407 v5 = st.SortPairsReverse8(d, v5);
408 v6 = st.SortPairsReverse8(d, v6);
409 v7 = st.SortPairsReverse8(d, v7);
410 v8 = st.SortPairsReverse8(d, v8);
411 v9 = st.SortPairsReverse8(d, v9);
412 va = st.SortPairsReverse8(d, va);
413 vb = st.SortPairsReverse8(d, vb);
414 vc = st.SortPairsReverse8(d, vc);
415 vd = st.SortPairsReverse8(d, vd);
416 ve = st.SortPairsReverse8(d, ve);
417 vf = st.SortPairsReverse8(d, vf);
418 v0 = st.SortPairsDistance2(d, v0);
419 v1 = st.SortPairsDistance2(d, v1);
420 v2 = st.SortPairsDistance2(d, v2);
421 v3 = st.SortPairsDistance2(d, v3);
422 v4 = st.SortPairsDistance2(d, v4);
423 v5 = st.SortPairsDistance2(d, v5);
424 v6 = st.SortPairsDistance2(d, v6);
425 v7 = st.SortPairsDistance2(d, v7);
426 v8 = st.SortPairsDistance2(d, v8);
427 v9 = st.SortPairsDistance2(d, v9);
428 va = st.SortPairsDistance2(d, va);
429 vb = st.SortPairsDistance2(d, vb);
430 vc = st.SortPairsDistance2(d, vc);
431 vd = st.SortPairsDistance2(d, vd);
432 ve = st.SortPairsDistance2(d, ve);
433 vf = st.SortPairsDistance2(d, vf);
434 v0 = st.SortPairsDistance1(d, v0);
435 v1 = st.SortPairsDistance1(d, v1);
436 v2 = st.SortPairsDistance1(d, v2);
437 v3 = st.SortPairsDistance1(d, v3);
438 v4 = st.SortPairsDistance1(d, v4);
439 v5 = st.SortPairsDistance1(d, v5);
440 v6 = st.SortPairsDistance1(d, v6);
441 v7 = st.SortPairsDistance1(d, v7);
442 v8 = st.SortPairsDistance1(d, v8);
443 v9 = st.SortPairsDistance1(d, v9);
444 va = st.SortPairsDistance1(d, va);
445 vb = st.SortPairsDistance1(d, vb);
446 vc = st.SortPairsDistance1(d, vc);
447 vd = st.SortPairsDistance1(d, vd);
448 ve = st.SortPairsDistance1(d, ve);
449 vf = st.SortPairsDistance1(d, vf);
450}
451
452// Unused on MSVC, see below
453#if !HWY_COMPILER_MSVC
454
455template <class D, class Traits, class V = Vec<D>>
456HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
457 V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
458 V& vd, V& ve, V& vf) {
459 v8 = st.ReverseKeys16(d, v8);
460 v9 = st.ReverseKeys16(d, v9);
461 va = st.ReverseKeys16(d, va);
462 vb = st.ReverseKeys16(d, vb);
463 vc = st.ReverseKeys16(d, vc);
464 vd = st.ReverseKeys16(d, vd);
465 ve = st.ReverseKeys16(d, ve);
466 vf = st.ReverseKeys16(d, vf);
467 st.Sort2(d, v0, vf);
468 st.Sort2(d, v1, ve);
469 st.Sort2(d, v2, vd);
470 st.Sort2(d, v3, vc);
471 st.Sort2(d, v4, vb);
472 st.Sort2(d, v5, va);
473 st.Sort2(d, v6, v9);
474 st.Sort2(d, v7, v8);
475 v4 = st.ReverseKeys16(d, v4);
476 vc = st.ReverseKeys16(d, vc);
477 v5 = st.ReverseKeys16(d, v5);
478 vd = st.ReverseKeys16(d, vd);
479 v6 = st.ReverseKeys16(d, v6);
480 ve = st.ReverseKeys16(d, ve);
481 v7 = st.ReverseKeys16(d, v7);
482 vf = st.ReverseKeys16(d, vf);
483 st.Sort2(d, v0, v7);
484 st.Sort2(d, v8, vf);
485 st.Sort2(d, v1, v6);
486 st.Sort2(d, v9, ve);
487 st.Sort2(d, v2, v5);
488 st.Sort2(d, va, vd);
489 st.Sort2(d, v3, v4);
490 st.Sort2(d, vb, vc);
491 v2 = st.ReverseKeys16(d, v2);
492 v3 = st.ReverseKeys16(d, v3);
493 v6 = st.ReverseKeys16(d, v6);
494 v7 = st.ReverseKeys16(d, v7);
495 va = st.ReverseKeys16(d, va);
496 vb = st.ReverseKeys16(d, vb);
497 ve = st.ReverseKeys16(d, ve);
498 vf = st.ReverseKeys16(d, vf);
499 st.Sort2(d, v0, v3);
500 st.Sort2(d, v1, v2);
501 st.Sort2(d, v4, v7);
502 st.Sort2(d, v5, v6);
503 st.Sort2(d, v8, vb);
504 st.Sort2(d, v9, va);
505 st.Sort2(d, vc, vf);
506 st.Sort2(d, vd, ve);
507 v1 = st.ReverseKeys16(d, v1);
508 v3 = st.ReverseKeys16(d, v3);
509 v5 = st.ReverseKeys16(d, v5);
510 v7 = st.ReverseKeys16(d, v7);
511 v9 = st.ReverseKeys16(d, v9);
512 vb = st.ReverseKeys16(d, vb);
513 vd = st.ReverseKeys16(d, vd);
514 vf = st.ReverseKeys16(d, vf);
515 st.Sort2(d, v0, v1);
516 st.Sort2(d, v2, v3);
517 st.Sort2(d, v4, v5);
518 st.Sort2(d, v6, v7);
519 st.Sort2(d, v8, v9);
520 st.Sort2(d, va, vb);
521 st.Sort2(d, vc, vd);
522 st.Sort2(d, ve, vf);
523 v0 = st.SortPairsReverse16(d, v0);
524 v1 = st.SortPairsReverse16(d, v1);
525 v2 = st.SortPairsReverse16(d, v2);
526 v3 = st.SortPairsReverse16(d, v3);
527 v4 = st.SortPairsReverse16(d, v4);
528 v5 = st.SortPairsReverse16(d, v5);
529 v6 = st.SortPairsReverse16(d, v6);
530 v7 = st.SortPairsReverse16(d, v7);
531 v8 = st.SortPairsReverse16(d, v8);
532 v9 = st.SortPairsReverse16(d, v9);
533 va = st.SortPairsReverse16(d, va);
534 vb = st.SortPairsReverse16(d, vb);
535 vc = st.SortPairsReverse16(d, vc);
536 vd = st.SortPairsReverse16(d, vd);
537 ve = st.SortPairsReverse16(d, ve);
538 vf = st.SortPairsReverse16(d, vf);
539 v0 = st.SortPairsDistance4(d, v0);
540 v1 = st.SortPairsDistance4(d, v1);
541 v2 = st.SortPairsDistance4(d, v2);
542 v3 = st.SortPairsDistance4(d, v3);
543 v4 = st.SortPairsDistance4(d, v4);
544 v5 = st.SortPairsDistance4(d, v5);
545 v6 = st.SortPairsDistance4(d, v6);
546 v7 = st.SortPairsDistance4(d, v7);
547 v8 = st.SortPairsDistance4(d, v8);
548 v9 = st.SortPairsDistance4(d, v9);
549 va = st.SortPairsDistance4(d, va);
550 vb = st.SortPairsDistance4(d, vb);
551 vc = st.SortPairsDistance4(d, vc);
552 vd = st.SortPairsDistance4(d, vd);
553 ve = st.SortPairsDistance4(d, ve);
554 vf = st.SortPairsDistance4(d, vf);
555 v0 = st.SortPairsDistance2(d, v0);
556 v1 = st.SortPairsDistance2(d, v1);
557 v2 = st.SortPairsDistance2(d, v2);
558 v3 = st.SortPairsDistance2(d, v3);
559 v4 = st.SortPairsDistance2(d, v4);
560 v5 = st.SortPairsDistance2(d, v5);
561 v6 = st.SortPairsDistance2(d, v6);
562 v7 = st.SortPairsDistance2(d, v7);
563 v8 = st.SortPairsDistance2(d, v8);
564 v9 = st.SortPairsDistance2(d, v9);
565 va = st.SortPairsDistance2(d, va);
566 vb = st.SortPairsDistance2(d, vb);
567 vc = st.SortPairsDistance2(d, vc);
568 vd = st.SortPairsDistance2(d, vd);
569 ve = st.SortPairsDistance2(d, ve);
570 vf = st.SortPairsDistance2(d, vf);
571 v0 = st.SortPairsDistance1(d, v0);
572 v1 = st.SortPairsDistance1(d, v1);
573 v2 = st.SortPairsDistance1(d, v2);
574 v3 = st.SortPairsDistance1(d, v3);
575 v4 = st.SortPairsDistance1(d, v4);
576 v5 = st.SortPairsDistance1(d, v5);
577 v6 = st.SortPairsDistance1(d, v6);
578 v7 = st.SortPairsDistance1(d, v7);
579 v8 = st.SortPairsDistance1(d, v8);
580 v9 = st.SortPairsDistance1(d, v9);
581 va = st.SortPairsDistance1(d, va);
582 vb = st.SortPairsDistance1(d, vb);
583 vc = st.SortPairsDistance1(d, vc);
584 vd = st.SortPairsDistance1(d, vd);
585 ve = st.SortPairsDistance1(d, ve);
586 vf = st.SortPairsDistance1(d, vf);
587}
588
589#endif // !HWY_COMPILER_MSVC
590
591// Reshapes `buf` into a matrix, sorts columns independently, and then merges
592// into a sorted 1D array without transposing.
593//
594// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
595// differences in sort order and single-lane vs 128-bit keys.
596// `buf` ensures full vectors are aligned, and enables loads/stores without
597// bounds checks.
598//
599// NOINLINE because this is large and called twice from vqsort-inl.h.
600//
601// References:
602// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
603// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
604// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
605template <class Traits, typename T>
606HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
608 using V = decltype(Zero(d));
609
611
612 // The network width depends on the number of keys, not lanes.
613 constexpr size_t kLanesPerKey = st.LanesPerKey();
614 const size_t keys = cols / kLanesPerKey;
615 constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
616
617 // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
618 // offsets to duplicating this code for every value of cols.
619 static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
620 V v0 = LoadU(d, buf + 0x0 * cols);
621 V v1 = LoadU(d, buf + 0x1 * cols);
622 V v2 = LoadU(d, buf + 0x2 * cols);
623 V v3 = LoadU(d, buf + 0x3 * cols);
624 V v4 = LoadU(d, buf + 0x4 * cols);
625 V v5 = LoadU(d, buf + 0x5 * cols);
626 V v6 = LoadU(d, buf + 0x6 * cols);
627 V v7 = LoadU(d, buf + 0x7 * cols);
628 V v8 = LoadU(d, buf + 0x8 * cols);
629 V v9 = LoadU(d, buf + 0x9 * cols);
630 V va = LoadU(d, buf + 0xa * cols);
631 V vb = LoadU(d, buf + 0xb * cols);
632 V vc = LoadU(d, buf + 0xc * cols);
633 V vd = LoadU(d, buf + 0xd * cols);
634 V ve = LoadU(d, buf + 0xe * cols);
635 V vf = LoadU(d, buf + 0xf * cols);
636
637 Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
638
639 // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
640 // code paths: if MaxLanes < 2, then keys <= cols < 2.
641 if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
642 Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
643 vf);
644
645 if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
646 Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
647 vf);
648
649 if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
650 Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
651 ve, vf);
652
653 // Avoids build timeout. Must match #if condition in kMaxCols.
654#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
655 if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
656 Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
657 ve, vf);
658
659 static_assert(Constants::kMaxCols <= 16, "Add more branches");
660 }
661#endif
662 }
663 }
664 }
665
666 StoreU(v0, d, buf + 0x0 * cols);
667 StoreU(v1, d, buf + 0x1 * cols);
668 StoreU(v2, d, buf + 0x2 * cols);
669 StoreU(v3, d, buf + 0x3 * cols);
670 StoreU(v4, d, buf + 0x4 * cols);
671 StoreU(v5, d, buf + 0x5 * cols);
672 StoreU(v6, d, buf + 0x6 * cols);
673 StoreU(v7, d, buf + 0x7 * cols);
674 StoreU(v8, d, buf + 0x8 * cols);
675 StoreU(v9, d, buf + 0x9 * cols);
676 StoreU(va, d, buf + 0xa * cols);
677 StoreU(vb, d, buf + 0xb * cols);
678 StoreU(vc, d, buf + 0xc * cols);
679 StoreU(vd, d, buf + 0xd * cols);
680 StoreU(ve, d, buf + 0xe * cols);
681 StoreU(vf, d, buf + 0xf * cols);
682}
683
684#else
685template <class Base>
686struct SharedTraits : public Base {};
687#endif // VQSORT_ENABLED
688
689} // namespace detail
690// NOLINTNEXTLINE(google-readability-namespace-comments)
691} // namespace HWY_NAMESPACE
692} // namespace hwy
694
695#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_NOINLINE
Definition: base.h:63
#define HWY_INLINE
Definition: base.h:62
#define HWY_DASSERT(condition)
Definition: base.h:191
#define HWY_LIKELY(expr)
Definition: base.h:66
d
Definition: rvv-inl.h:1742
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition: ops/shared-inl.h:276
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:172
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2725
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2544
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4406
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
const vfloat64m1_t v
Definition: rvv-inl.h:1742
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
Definition: sorting_networks-inl.h:686
Definition: contrib/sort/shared-inl.h:28
static constexpr size_t kMaxCols
Definition: contrib/sort/shared-inl.h:34
static constexpr size_t kMaxRows
Definition: contrib/sort/shared-inl.h:43