Grok 9.7.5
generic_ops-inl.h
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
// type of functions that do not take a vector argument, or as an argument type
// if the function only has a template argument for D, or for explicit type
// names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
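//
// Usage sketch (illustrative, not part of the original header): these aliases
// let non-auto signatures name vector/mask types from a tag D. Assumes IsNaN
// and IfThenZeroElse from the target ops; ZeroIfNaN is a hypothetical helper.
//   template <class D>
//   Vec<D> ZeroIfNaN(D /*d*/, Vec<D> v) {
//     const Mask<D> bad = IsNaN(v);   // lanewise test
//     return IfThenZeroElse(bad, v);  // zero out NaN lanes
//   }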

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}
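//
// Usage sketch (illustrative, not part of the original header): clamping
// float lanes to [0, 1].
//   const ScalableTag<float> d;
//   const auto clamped = Clamp(v, Zero(d), Set(d, 1.0f));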

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
  constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif
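//
// Usage sketch (illustrative, not part of the original header): with u32
// lanes and kLanes = 1, each 128-bit block of the result is
// {lo[1], lo[2], lo[3], hi[0]} (lane 0 first), i.e. the concatenation hi:lo
// shifted right by one lane.
//   const FixedTag<uint32_t, 4> d;
//   const auto out = CombineShiftRightLanes<1>(d, hi, lo);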

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  using Unsigned = MakeUnsigned<TFromD<D>>;
  const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
  return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
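//
// Usage sketch (illustrative, not part of the original header): absolute value
// by clearing the sign bit, and quiet NaN as a "not yet computed" sentinel.
//   const ScalableTag<float> d;
//   const auto abs = AndNot(SignBit(d), v);  // AndNot(a, b) = ~a & b
//   auto best = NaN(d);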

// ------------------------------ SafeCopyN

template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = from[i];
  }
#else
  BlendedStore(LoadU(d, from), FirstN(d, num), d, to);
#endif
}
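//
// Usage sketch (illustrative, not part of the original header): copying a
// loop remainder without touching memory past the end. The vector path loads
// a full vector, so num is intended to be at most Lanes(d).
//   const ScalableTag<float> d;
//   const size_t N = Lanes(d);
//   size_t i = 0;
//   for (; i + N <= count; i += N) StoreU(LoadU(d, from + i), d, to + i);
//   SafeCopyN(count - i, d, from + i, to + i);  // < N elements remain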

// ------------------------------ AESRound

// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR

// Define for white-box testing, even if native instructions are available.
namespace detail {

// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
// Vector Permute Instructions" and the accompanying assembly language
// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
//
// A brute-force 256 byte table lookup can also be made constant-time, and
// possibly competitive on NEON, but this is more performance-portable
// especially for x86 and large vectors.
template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  const auto mask = Set(du, 0xF);

  // Change polynomial basis to GF(2^4)
  {
    alignas(16) static constexpr uint8_t basisL[16] = {
        0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
        0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
    alignas(16) static constexpr uint8_t basisU[16] = {
        0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
        0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
    const auto sL = And(state, mask);
    const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
    const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
    const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
    state = Xor(gf4L, gf4U);
  }

  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
  // cause TableLookupBytesOr0 to return 0.
  alignas(16) static constexpr uint8_t kZetaInv[16] = {
      0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
  alignas(16) static constexpr uint8_t kInv[16] = {
      0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
  const auto tbl = LoadDup128(du, kInv);
  const auto sL = And(state, mask);      // L=low nibble, U=upper
  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
  const auto sX = Xor(sU, sL);
  const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
  const auto invU = TableLookupBytes(tbl, sU);
  const auto invX = TableLookupBytes(tbl, sX);
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

  // Linear skew (cannot bake 0x63 bias into the table because out* indices
  // may have the infinity flag set).
  alignas(16) static constexpr uint8_t kAffineL[16] = {
      0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
      0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
  alignas(16) static constexpr uint8_t kAffineU[16] = {
      0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
  const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
  const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
  return Xor(Xor(affL, affU), Set(du, 0x63));
}
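//
// Sanity-check sketch (illustrative, not part of the original header):
// SubBytes should match the AES S-box, e.g. S(0x53) = 0xED (FIPS-197).
//   const ScalableTag<uint8_t> du;
//   HWY_ASSERT(AllTrue(du, Eq(SubBytes(Set(du, 0x53)), Set(du, 0xED))));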

}  // namespace detail

#endif  // HWY_TARGET != HWY_SCALAR

// "Include guard": skip if native AES instructions are available.
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR

namespace detail {

template <class V>  // u8
HWY_API V ShiftRows(const V state) {
  const DFromV<V> du;
  alignas(16) static constexpr uint8_t kShiftRow[16] = {
      0,  5,  10, 15,  // transposed: state is column major
      4,  9,  14, 3,   //
      8,  13, 2,  7,   //
      12, 1,  6,  11};
  const auto shift_row = LoadDup128(du, kShiftRow);
  return TableLookupBytes(state, shift_row);
}

template <class V>  // u8
HWY_API V MixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  // 2 3 1 1  // Let s := state*1, d := state*2, t := state*3.
  // 1 2 3 1  // d are on diagonal, no permutation needed.
  // 1 1 2 3  // t1230 indicates column indices of threes for the 4 rows.
  // 3 1 1 2  // We also need to compute s2301 and s3012 (=1230 o 2301).
  alignas(16) static constexpr uint8_t k2301[16] = {
      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
  alignas(16) static constexpr uint8_t k1230[16] = {
      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
  const RebindToSigned<decltype(du)> di;  // can only do signed comparisons
  const auto msb = Lt(BitCast(di, state), Zero(di));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
  const auto d = Xor(Add(state, state), overflow);  // = state*2 in GF(2^8).
  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
  const auto d_s2301 = Xor(d, s2301);
  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d(s*2)}
  const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
}
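//
// Worked scalar example (illustrative, not part of the original header): the
// msb/overflow/Add sequence above is the classic "xtime" doubling in GF(2^8)
// with reduction polynomial 0x11B:
//   uint8_t XTime(uint8_t x) {  // x*2 in GF(2^8)
//     return static_cast<uint8_t>((x << 1) ^ ((x >> 7) ? 0x1B : 0));
//   }
// E.g. XTime(0x80) = 0x1B, and state*3 = XTime(state) ^ state.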

}  // namespace detail

template <class V>  // u8
HWY_API V AESRound(V state, const V round_key) {
  // Intel docs swap the first two steps, but it does not matter because
  // ShiftRows is a permutation and SubBytes is independent of lane index.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // Like AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}
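//
// Usage sketch (illustrative, not part of the original header): AES-128
// encryption of independent 16-byte blocks, given 11 precomputed round keys
// (key expansion is not provided here); EncryptBlocks is a hypothetical name.
//   template <class V>
//   V EncryptBlocks(V state, const V (&round_keys)[11]) {
//     state = Xor(state, round_keys[0]);  // initial AddRoundKey
//     for (int r = 1; r < 10; ++r) state = AESRound(state, round_keys[r]);
//     return AESLastRound(state, round_keys[10]);
//   }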

// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.
template <class V>
HWY_API V CLMulLower(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}

template <class V>
HWY_API V CLMulUpper(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}
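//
// Usage sketch (illustrative, not part of the original header): mirroring x86
// PCLMULQDQ, each 128-bit block of the result holds the full 128-bit
// carryless product of one pair of u64 lanes. Such products are the building
// block of GHASH and some CRC implementations.
//   const ScalableTag<uint64_t> d;
//   const auto lo = CLMulLower(a, b);  // product of even (lower) lanes
//   const auto hi = CLMulUpper(a, b);  // product of odd (upper) lanes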

#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_AES

// "Include guard": skip if native POPCNT-related instructions are available.
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
// guarantee 128 bits anyway.
#define HWY_MIN_POW2_FOR_128 0
#endif

// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
          HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };
  const auto lo = And(v, Set(d, 0xF));
  const auto hi = ShiftRight<4>(v);
  const auto lookup = LoadDup128(d, kLookup);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}
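//
// Usage sketch (illustrative, not part of the original header): kLookup holds
// the bit count of each 4-bit nibble, so a byte's count is the sum of the
// lookups for its two nibbles. E.g. all-0xF0 input yields all-4 output:
//   const ScalableTag<uint8_t> d;
//   const auto counts = PopulationCount(Set(d, 0xF0));  // each lane == 4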

// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
}
#endif  // HWY_TARGET != HWY_RVV
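//
// Worked example (illustrative, not part of the original header) of the bit
// trick above for x = 0b11011010 (popcount 5):
//   x - ((x >> 1) & 0x55)          -> 0b10010101  (2-bit sums 2,1,1,1)
//   ((x >> 2) & 0x33) + (x & 0x33) -> 0b00110010  (4-bit sums 3,2)
//   (x + (x >> 4)) & 0x0F          -> 5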

template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}

template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  Repartition<uint16_t, decltype(d)> d16;
  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}

#if HWY_HAVE_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
}
#endif

#endif  // HWY_NATIVE_POPCNT

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();