Grok  9.5.0
shared-inl.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Per-target definitions shared by ops/*.h and user code.
16 
17 #include <cmath>
18 
19 #include "hwy/base.h"
20 
21 // Separate header because foreach_target.h re-enables its include guard.
22 #include "hwy/ops/set_macros-inl.h"
23 
24 // Relies on the external include guard in highway.h.
26 namespace hwy {
27 namespace HWY_NAMESPACE {
28 
// SIMD operations are implemented as overloaded functions selected using a
// "descriptor" D := Simd<T, N>. T is the lane type, N an opaque integer for
// internal use only. Users create D via aliases ScalableTag<T>() (a full
// vector), CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of
// lanes (always a power of two) is Lanes(D()).
//
// Template parameters:
//   Lane - the lane type, re-exported as the member alias T.
//   N    - lane count; must be a nonzero power of two (checked at compile time).
template <typename Lane, size_t N>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;
  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

  // Widening/narrowing ops change the number of lanes and/or their type.
  // To initialize such vectors, we need the corresponding descriptor types:

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
  template <typename NewLane>
  using Rebind = Simd<NewLane, N>;

  // MulEven() with another lane type, but same total size.
  // Round up to correctly handle scalars with N=1.
  template <typename NewLane>
  using Repartition =
      Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;

  // LowerHalf() with the same lane type, but half the lanes.
  // Round up to correctly handle scalars with N=1.
  using Half = Simd<T, (N + 1) / 2>;

  // Combine() with the same lane type, but twice the lanes.
  using Twice = Simd<T, 2 * N>;
};
60 
61 namespace detail {
62 
63 // Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
64 // - a full vector (pow2 = 0);
65 // - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
66 // - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
67 constexpr size_t ScaleByPower(size_t N, int pow2) {
68 #if HWY_TARGET == HWY_RVV
69  // For fractions, if N == 1 ensure we still return at least one lane.
70  return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2)));
71 #else
72  // If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
73  return HWY_MAX(1, N >> HWY_MAX(-pow2, 0));
74 #endif
75 }
76 
77 // Struct wrappers enable validation of arguments via static_assert.
78 template <typename T, int kPow2>
80  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
81  using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>;
82 };
83 
84 template <typename T, size_t kLimit>
86  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
87  using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>;
88 };
89 
90 template <typename T, size_t kNumLanes>
92  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
93  static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes");
94 #if HWY_TARGET == HWY_SCALAR
95  // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
96  static_assert(kNumLanes == 1, "Scalar only supports one lane");
97 #endif
99 };
100 
101 } // namespace detail
102 
103 // Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
104 // e.g. 1D loops where the application does not care about the vector size) or a
105 // fraction/multiple of one. Multiples are the same as full vectors for all
106 // targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
107 // value of type promotion and demotion.
108 template <typename T, int kPow2 = 0>
110 
111 // Alias for a tag describing a vector with *up to* kLimit active lanes, even on
112 // targets with scalable vectors and HWY_SCALAR. The runtime lane count
113 // `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
114 // typically used for 1D loops with a relatively low application-defined upper
115 // bound, e.g. for 8x8 DCTs. However, it is better if data structures are
116 // designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
117 // chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
118 // this would enable vector-length-agnostic loops using ScalableTag).
119 template <typename T, size_t kLimit>
121 
122 // Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
123 // even on targets with scalable vectors. All targets except HWY_SCALAR support
124 // up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
125 // on that is non-portable and discouraged.
126 //
127 // NOTE: if the application does not need to support HWY_SCALAR (+), use this
128 // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
129 // This is useful for data structures that rely on exactly 128-bit SIMD, but
130 // these are discouraged because they cannot benefit from wider vectors.
131 // Instead, applications would ideally define a larger problem size and loop
132 // over it with the (unknown size) vectors from ScalableTag.
133 //
134 // + e.g. if the baseline is known to support SIMD, or the application requires
135 // ops such as TableLookupBytes not supported by HWY_SCALAR.
136 template <typename T, size_t kNumLanes>
138 
// Lane type of a descriptor: its member alias `T`.
template <class Desc>
using TFromD = typename Desc::T;
141 
// Descriptor with as many lanes as Desc, but whose lane type is NewLane.
// Forwards to the descriptor's own member template `Rebind`.
template <class NewLane, class Desc>
using Rebind = typename Desc::template Rebind<NewLane>;
145 
146 template <class D>
148 template <class D>
150 template <class D>
152 
// Descriptor covering the same total size as Desc, but split into lanes of
// type NewLane. Forwards to the descriptor's member template `Repartition`.
template <class NewLane, class Desc>
using Repartition = typename Desc::template Repartition<NewLane>;
156 
157 template <class D>
159 template <class D>
161 
// Descriptor with the lane type of Desc but half as many lanes; forwards to
// the descriptor's member alias `Half`.
template <class Desc>
using Half = typename Desc::Half;
165 
// Descriptor with the lane type of Desc but twice as many lanes; forwards to
// the descriptor's member alias `Twice`.
template <class Desc>
using Twice = typename Desc::Twice;
169 
// Descriptor-based variants of the HWY_IF_* macros. Each one forwards to the
// macro of the same name without the _D suffix, passing TFromD<D> (the lane
// type of descriptor D) in place of a lane type.
170 // Same as base.h macros but with a Simd<T, N> argument instead of T.
171 #define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
172 #define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
173 #define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
174 #define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
175 #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
176 #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
177 
// Vector-based variants: each forwards to the corresponding HWY_IF_* macro,
// passing TFromV<V>.
// NOTE(review): TFromV is not declared in this file; presumably it yields the
// lane type of vector type V and is provided by the per-target headers.
178 // Same, but with a vector argument.
179 #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
180 #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
181 #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
182 #define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
183 
// Expands to a defaulted pointer template parameter whose type is
// EnableIf<...>, with the condition IsSameT<T, TFromD<DFromV<V>>>::value,
// i.e. that the lane type of V's descriptor is T.
// NOTE(review): EnableIf, IsSameT and DFromV are declared outside this file.
184 // For implementing functions for a specific type.
185 // IsSame<...>() in template arguments is broken on MSVC2015.
186 #define HWY_IF_LANES_ARE(T, V) \
187  EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr
188 
189 // Compile-time-constant, (typically but not guaranteed) an upper bound on the
190 // number of lanes.
191 // Prefer instead using Lanes() and dynamic allocation, or Rebind, or
192 // `#if HWY_CAP_GE*`.
193 template <typename T, size_t N>
195  return N;
196 }
197 
198 // Targets with non-constexpr Lanes define this themselves.
199 #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE
200 
201 // (Potentially) non-constant actual size of the vector at runtime, subject to
202 // the limit imposed by the Simd. Useful for advancing loop counters.
203 template <typename T, size_t N>
204 HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
205  return N;
206 }
207 
208 #endif
209 
210 // NOTE: GCC generates incorrect code for vector arguments to non-inlined
211 // functions in two situations:
212 // - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
213 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
214 // - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
215 // all) tests to fail.
216 //
217 // We therefore pass by const& only on GCC and Windows. This alias must be used
218 // for all vector/mask parameters of functions marked HWY_NOINLINE, and possibly
219 // also all functions not marked HWY_INLINE nor HWY_API.
// Parameter type to use for vector/mask arguments of non-inlined functions.
#if !HWY_COMPILER_GCC || HWY_COMPILER_CLANG || \
    !(defined(_WIN32) || defined(_WIN64))
// Everywhere except GCC (non-Clang) on Windows: pass by value.
template <class V>
using VecArg = V;
#else
// GCC on Windows: pass by const reference.
template <class V>
using VecArg = const V&;
#endif
228 
229 // NOLINTNEXTLINE(google-readability-namespace-comments)
230 } // namespace HWY_NAMESPACE
231 } // namespace hwy
#define HWY_MAX(a, b)
Definition: base.h:123
#define HWY_MIN(a, b)
Definition: base.h:122
#define HWY_INLINE
Definition: base.h:59
#define HWY_MAYBE_UNUSED
Definition: base.h:70
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: shared-inl.h:67
V VecArg
Definition: shared-inl.h:226
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: shared-inl.h:158
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: shared-inl.h:149
constexpr HWY_API size_t Lanes(Simd< T, N >)
Definition: arm_sve-inl.h:226
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: shared-inl.h:120
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: shared-inl.h:151
typename D::Twice Twice
Definition: shared-inl.h:168
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: shared-inl.h:147
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: shared-inl.h:160
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: shared-inl.h:109
typename D::template Rebind< T > Rebind
Definition: shared-inl.h:144
HWY_INLINE constexpr HWY_MAYBE_UNUSED size_t MaxLanes(Simd< T, N >)
Definition: shared-inl.h:194
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition: shared-inl.h:137
typename D::Half Half
Definition: shared-inl.h:164
typename D::template Repartition< T > Repartition
Definition: shared-inl.h:155
typename D::T TFromD
Definition: shared-inl.h:140
Definition: aligned_allocator.h:23
#define HWY_MAX_BYTES
Definition: set_macros-inl.h:79
#define HWY_LANES(T)
Definition: set_macros-inl.h:80
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
Definition: shared-inl.h:35
Lane T
Definition: shared-inl.h:37
constexpr Simd()=default