Grok 10.0.0
contrib/sort/shared-inl.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Definitions shared between vqsort-inl and sorting_networks-inl.
17
18// Normal include guard for target-independent parts
19#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
20#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
21
22#include "hwy/base.h"
23
24namespace hwy {
25
26// Internal constants - these are to avoid magic numbers/literals and cannot be
27// changed without also changing the associated code.
29// SortingNetwork reshapes its input into a matrix. This is the maximum number
30// of *keys* per vector.
31#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
32 static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
33#else
34 static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
35#endif
36
37 // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
38 // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
39 // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
40 // extra logN factor for larger networks (for which only loose upper bounds
41 // on size are known).
42 static constexpr size_t kMaxRowsLog2 = 4;
43 static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
44
45 static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
46 return kMaxRows * HWY_MIN(N, kMaxCols);
47 }
48
49 // Unrolling is important (pipelining and amortizing branch mispredictions);
50 // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
51 // somewhat slower for sorting than 4x.
52 //
53 // To change, must also update left + 3 * N etc. in the loop.
54 static constexpr size_t kPartitionUnroll = 4;
55
56 static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
57 // The main loop reads kPartitionUnroll vectors, and first loads from
58 // both left and right beforehand, so it requires min = 2 *
59 // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
60 // >= BaseCaseNum), we partition the right side into a buffer. We need
61 // another vector at the end so CompressStore does not overwrite anything.
62 return (2 * kPartitionUnroll + 1) * N;
63 }
64
65 // Chunk := group of keys loaded for sampling a pivot. Matches the typical
66 // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
67 // are larger, use entire vectors to ensure we do not overrun the array.
68 static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
69 return HWY_MAX(64 / sizeof_t, N);
70 }
71
72 static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
73 // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
74 return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
75 }
76
77 template <typename T>
78 static constexpr HWY_INLINE size_t BufNum(size_t N) {
79 // One extra for padding plus another for full-vector loads.
80 return HWY_MAX(BaseCaseNum(N) + 2 * N,
81 HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
82 }
83
84 template <typename T>
85 static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
86 return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
87 }
88};
89
90} // namespace hwy
91
92#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
93
94// Per-target
95#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
96 defined(HWY_TARGET_TOGGLE)
97#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
98#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
99#else
100#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
101#endif
102
103#include "hwy/highway.h"
104
105// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
106// Arm v7 debug.
107#undef VQSORT_ENABLED
108#if (HWY_TARGET == HWY_SCALAR) || \
109 (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
110 (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
111#define VQSORT_ENABLED 0
112#else
113#define VQSORT_ENABLED 1
114#endif
115
116namespace hwy {
117namespace HWY_NAMESPACE {
118
119// Default tag / vector width selector.
120#if HWY_TARGET == HWY_RVV
121// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
122template <typename T>
123using SortTag = ScalableTag<T, -1>;
124#else
125template <typename T>
126using SortTag = ScalableTag<T>;
127#endif
128
129// NOLINTNEXTLINE(google-readability-namespace-comments)
130} // namespace HWY_NAMESPACE
131} // namespace hwy
132
133#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#define HWY_MAX(a, b)
Definition: base.h:126
#define HWY_MIN(a, b)
Definition: base.h:125
#define HWY_INLINE
Definition: base.h:62
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:161
N
Definition: rvv-inl.h:1742
ScalableTag< T, -1 > SortTag
Definition: contrib/sort/shared-inl.h:123
Definition: aligned_allocator.h:27
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: contrib/sort/shared-inl.h:28
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size)
Definition: contrib/sort/shared-inl.h:85
static constexpr size_t kMaxCols
Definition: contrib/sort/shared-inl.h:34
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N)
Definition: contrib/sort/shared-inl.h:56
static constexpr HWY_INLINE size_t BufNum(size_t N)
Definition: contrib/sort/shared-inl.h:78
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N)
Definition: contrib/sort/shared-inl.h:72
static constexpr size_t kMaxRows
Definition: contrib/sort/shared-inl.h:43
static constexpr HWY_INLINE size_t BaseCaseNum(size_t N)
Definition: contrib/sort/shared-inl.h:45
static constexpr size_t kMaxRowsLog2
Definition: contrib/sort/shared-inl.h:42
static constexpr size_t kPartitionUnroll
Definition: contrib/sort/shared-inl.h:54
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N)
Definition: contrib/sort/shared-inl.h:68