Grok 10.0.1
detect_targets.h
Go to the documentation of this file.
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
17#define HIGHWAY_HWY_DETECT_TARGETS_H_
18
19// Defines targets and chooses which to enable.
20
22
23//------------------------------------------------------------------------------
24// Optional configuration
25
26// See ../quick_reference.md for documentation of these macros.
27
28// Uncomment to override the default baseline determined from predefined macros:
29// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
30
31// Uncomment to override the default blocklist:
32// #define HWY_BROKEN_TARGETS HWY_AVX3
33
34// Uncomment to definitely avoid generating those target(s):
35// #define HWY_DISABLED_TARGETS HWY_SSE4
36
37// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
38// AVX2 target for VMs which support AVX2 but not the other instruction sets)
39// #define HWY_DISABLE_BMI2_FMA
40
41// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
42// #define HWY_WANT_SSSE3
43// #define HWY_WANT_SSE4
44
45//------------------------------------------------------------------------------
46// Targets
47
48// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
49// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
50//
51// All values are unconditionally defined so we can test HWY_TARGETS without
52// first checking the HWY_ARCH_*.
53//
54// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
55// can use 32-bit literals.
56
// Target bit values, grouped per platform. Each group is followed by a
// HWY_HIGHEST_TARGET_BIT_* macro holding a bit index (not a mask) that bounds
// the group from above.
57// 1,2,4: reserved
58
59// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
60// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
61// Tiger Lake? We do not yet have uses for GFNI.
62#define HWY_AVX3_DL 8 // see HWY_WANT_AVX3_DL below
63#define HWY_AVX3 16
64#define HWY_AVX2 32
65// 64: reserved for AVX
66#define HWY_SSE4 128
67#define HWY_SSSE3 256
68// 512: reserved for SSE3 or SSE2
69
70// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
71// dynamic dispatch. All x86 target bits must be lower or equal to
72// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
73// HWY_MAX_DYNAMIC_TARGETS in total.
74#define HWY_HIGHEST_TARGET_BIT_X86 9 // 1 << 9 == 512, the reserved value above
75
76// 0x400, 0x800: reserved
77#define HWY_SVE2_128 0x1000 // specialized target (e.g. Arm N2)
78#define HWY_SVE_256 0x2000 // specialized target (e.g. Arm V1)
79#define HWY_SVE2 0x4000
80#define HWY_SVE 0x8000
81// 0x10000 reserved for Helium
82#define HWY_NEON 0x20000
83
84#define HWY_HIGHEST_TARGET_BIT_ARM 17 // 1 << 17 == HWY_NEON
85
86// 0x40000 reserved
87#define HWY_PPC8 0x80000 // v2.07 or 3
88// 0x100000 reserved for prior VSX/AltiVec
89
90#define HWY_HIGHEST_TARGET_BIT_PPC 20 // 1 << 20 == 0x100000 (reserved above)
91
92// 0x200000, 0x400000 reserved
93#define HWY_WASM_EMU256 0x800000 // Experimental
94#define HWY_WASM 0x1000000
95
96#define HWY_HIGHEST_TARGET_BIT_WASM 24 // 1 << 24 == HWY_WASM
97
98// 0x2000000, 0x4000000, 0x8000000 reserved
99#define HWY_RVV 0x10000000
100
101#define HWY_HIGHEST_TARGET_BIT_RVV 28 // 1 << 28 == HWY_RVV
102
// The two non-SIMD targets occupy the highest bits, i.e. they rank as the
// "worst" choices under the lower-is-better ordering described above.
103#define HWY_EMU128 0x20000000
104#define HWY_SCALAR 0x40000000
105
106#define HWY_HIGHEST_TARGET_BIT_SCALAR 30 // 1 << 30 == HWY_SCALAR
107
108// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
109
110//------------------------------------------------------------------------------
111// Set default blocklists
112
113// Disabled = removed from the enabled set at the user's request. Kept as its
114// own macro so that disabling a target leaves the blocklist below in effect.
115#if !defined(HWY_DISABLED_TARGETS)
116#define HWY_DISABLED_TARGETS 0 // default: nothing disabled
117#endif // !defined(HWY_DISABLED_TARGETS)
118
119// Broken means excluded from enabled due to known compiler issues. Allow the
120// user to override this blocklist without any guarantee of success.
// The #if/#elif chain below picks exactly one default list: the first
// condition that holds wins, and lists from later branches are not merged in.
121#ifndef HWY_BROKEN_TARGETS
122
123// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
124// SSE4 codegen (possibly only for msan), so disable all those targets.
125#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
126#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
127// This entails a major speed reduction, so warn unless the user explicitly
128// opts in to scalar-only.
129#if !defined(HWY_COMPILE_ONLY_SCALAR)
130#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
131#endif
132
133// 32-bit may fail to compile AVX2/3.
134#elif HWY_ARCH_X86_32
135#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
136
137// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
138#elif HWY_COMPILER_MSVC != 0
139#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
140
141// armv7be has not been tested and is not yet supported.
// Big-endian is detected via __ARM_BIG_ENDIAN or __BYTE_ORDER == __BIG_ENDIAN.
142#elif HWY_ARCH_ARM_V7 && \
143 (defined(__ARM_BIG_ENDIAN) || \
144 (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
145#define HWY_BROKEN_TARGETS (HWY_NEON)
146
147// SVE[2] require recent clang or gcc versions.
// That is: clang older than 1100, or (when not clang) GCC older than 1000, in
// the units used by HWY_COMPILER_CLANG / HWY_COMPILER_GCC.
148#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
149(!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
150#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
151
// No known compiler issue applies: nothing is blocklisted.
152#else
153#define HWY_BROKEN_TARGETS 0
154#endif
155
156#endif // HWY_BROKEN_TARGETS
157
158// A target is enabled when it is neither user-disabled nor on the blocklist.
159#define HWY_ENABLED(targets) \
160 ((targets) & ~(HWY_DISABLED_TARGETS) & ~(HWY_BROKEN_TARGETS))
161
162//------------------------------------------------------------------------------
163// Detect baseline targets using predefined macros
164
165// Baseline means the targets for which the compiler is allowed to generate
166// instructions, implying the target CPU would have to support them. This does
167// not take the blocklist into account.
168
// Non-SIMD baseline entry: HWY_SCALAR when HWY_COMPILE_ONLY_SCALAR is defined,
// otherwise HWY_EMU128.
169#if !defined(HWY_COMPILE_ONLY_SCALAR)
170#define HWY_BASELINE_SCALAR HWY_EMU128
171#else // HWY_COMPILE_ONLY_SCALAR
172#define HWY_BASELINE_SCALAR HWY_SCALAR
173#endif
174
175// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
176// HWY_TARGET == HWY_BASELINE_SCALAR.
177
// WASM is in the baseline only with __wasm_simd128__; HWY_WANT_WASM2 then
// selects the HWY_WASM_EMU256 target instead of HWY_WASM.
178#if !HWY_ARCH_WASM || !defined(__wasm_simd128__)
179#define HWY_BASELINE_WASM 0
180#else
181#if !defined(HWY_WANT_WASM2)
182#define HWY_BASELINE_WASM HWY_WASM
183#else
184#define HWY_BASELINE_WASM HWY_WASM_EMU256
185#endif // HWY_WANT_WASM2
186#endif
187
188// Until a PPC implementation exists, the literal 0 forces HWY_BASELINE_PPC8 to 0.
189#if !(HWY_ARCH_PPC && defined(__VSX__) && 0)
190#define HWY_BASELINE_PPC8 0
191#else
192#define HWY_BASELINE_PPC8 HWY_PPC8
193#endif
194
// SVE2 is in the baseline when compiling for Arm with __ARM_FEATURE_SVE2.
195#if !HWY_ARCH_ARM || !defined(__ARM_FEATURE_SVE2)
196#define HWY_BASELINE_SVE2 0
197#else
198#define HWY_BASELINE_SVE2 HWY_SVE2
199#endif
200
201#if !HWY_ARCH_ARM || !defined(__ARM_FEATURE_SVE)
202#define HWY_BASELINE_SVE 0
203#else
204// HWY_SVE_256 is deliberately not part of the baseline: baseline targets must
205// be usable unconditionally, whereas SVE_256 requires 256-bit vectors. Listing
206// it here would also disable every 'worse' target (including SVE and SVE2) in
207// non-test builds. HWY_SVE_256 is instead added to HWY_ATTAINABLE_TARGETS
208// below.
209#define HWY_BASELINE_SVE HWY_SVE
210#endif
211
212// Accept either spelling: GCC 4.5.4 defines only __ARM_NEON__, 5.4 defines both.
213#if !HWY_ARCH_ARM || !(defined(__ARM_NEON__) || defined(__ARM_NEON))
214#define HWY_BASELINE_NEON 0
215#else
216#define HWY_BASELINE_NEON HWY_NEON
217#endif
218
219// MSVC (but not clang-cl) predefines fewer feature macros, hence its own branch:
220#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
221
222// 1) SSSE3/SSE4 can only be assumed enabled when AVX is:
223// https://stackoverflow.com/questions/18563978/.
224#ifndef __AVX__
225#define HWY_CHECK_SSSE3 0
226#define HWY_CHECK_SSE4 0
227#else
228#define HWY_CHECK_SSSE3 1
229#define HWY_CHECK_SSE4 1
230#endif
231
232// 2) PCLMUL/AES and BMI2/FMA/F16C cannot be queried individually; assume
233// PCLMUL/AES come with SSE4, and BMI2/FMA/F16C with AVX2.
234#define HWY_CHECK_F16C 1
235#define HWY_CHECK_BMI2_FMA 1
236#define HWY_CHECK_PCLMUL_AES 1
237
238#else // non-MSVC
239
240#ifndef __SSSE3__
241#define HWY_CHECK_SSSE3 0
242#else
243#define HWY_CHECK_SSSE3 1
244#endif
245
246#if !defined(__SSE4_1__) || !defined(__SSE4_2__)
247#define HWY_CHECK_SSE4 0
248#else
249#define HWY_CHECK_SSE4 1
250#endif
251
252// A HWY_DISABLE_* opt-out makes the matching check pass, so that the disabled
// feature does not gate the availability of SSE4/AVX2.
253#if !defined(HWY_DISABLE_PCLMUL_AES) && !(defined(__PCLMUL__) && defined(__AES__))
254#define HWY_CHECK_PCLMUL_AES 0
255#else
256#define HWY_CHECK_PCLMUL_AES 1
257#endif
258
259#if !defined(HWY_DISABLE_BMI2_FMA) && !(defined(__BMI2__) && defined(__FMA__))
260#define HWY_CHECK_BMI2_FMA 0
261#else
262#define HWY_CHECK_BMI2_FMA 1
263#endif
264
265#if !defined(HWY_DISABLE_F16C) && !defined(__F16C__)
266#define HWY_CHECK_F16C 0
267#else
268#define HWY_CHECK_F16C 1
269#endif
270
271#endif // non-MSVC
272
// SSSE3/SSE4 join the baseline on x86 when either the compiler's predefined
// macros say so (HWY_CHECK_*) or the user opts in via HWY_WANT_SSSE3/SSE4.
// The opt-in macros are documented as bare definitions (`#define
// HWY_WANT_SSSE3`), so they are tested with defined(): evaluating an empty
// macro as a value inside #if is a preprocessor error. Note that defining them
// with any value, including 0, therefore counts as opting in.
273#if HWY_ARCH_X86 && (defined(HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
274#define HWY_BASELINE_SSSE3 HWY_SSSE3
275#else
276#define HWY_BASELINE_SSSE3 0
277#endif
278
// SSE4 additionally requires the PCLMUL/AES check unless the user opted in.
279#if HWY_ARCH_X86 && (defined(HWY_WANT_SSE4) || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
280#define HWY_BASELINE_SSE4 HWY_SSE4
281#else
282#define HWY_BASELINE_SSE4 0
283#endif
284
// AVX2 needs __AVX2__ on top of the SSE4 baseline and the BMI2/FMA and F16C
// checks; if any one is missing it stays out of the baseline.
285#if !defined(__AVX2__) || HWY_BASELINE_SSE4 == 0 || !HWY_CHECK_BMI2_FMA || \
286 !HWY_CHECK_F16C
287#define HWY_BASELINE_AVX2 0
288#else
289#define HWY_BASELINE_AVX2 HWY_AVX2
290#endif
291
292// AVX3 = the AVX2 baseline plus the AVX-512 F/BW/DQ/VL flags (MSVC sets them too).
293#if HWY_BASELINE_AVX2 == 0 || !defined(__AVX512F__) || !defined(__AVX512BW__) || \
294 !defined(__AVX512DQ__) || !defined(__AVX512VL__)
295#define HWY_BASELINE_AVX3 0
296#else
297#define HWY_BASELINE_AVX3 HWY_AVX3
298#endif
299
300// TODO(janwas): not yet known whether these will be set by MSVC
// AVX3_DL is part of the baseline only if AVX3 is and every feature macro
// listed below is predefined by the compiler.
// NOTE(review): __AVXVNNI__ is the macro GCC/Clang use for the VEX-encoded
// AVX-VNNI extension; AVX-512 VNNI (the Ice Lake feature) is reported via
// __AVX512VNNI__. Confirm which of the two is intended here.
301#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
302 defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
303 defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
304 defined(__AVX512BITALG__)
305#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
306#else
307#define HWY_BASELINE_AVX3_DL 0
308#endif
309
// RVV is in the baseline when compiling for RISC-V with __riscv_vector.
310#if !HWY_ARCH_RVV || !defined(__riscv_vector)
311#define HWY_BASELINE_RVV 0
312#else
313#define HWY_BASELINE_RVV HWY_RVV
314#endif
315
316// The user may predefine HWY_BASELINE_TARGETS; success is then not guaranteed.
317#if !defined(HWY_BASELINE_TARGETS)
318#define HWY_BASELINE_TARGETS \
319 (HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3 | HWY_BASELINE_AVX2 | \
320 HWY_BASELINE_SSE4 | HWY_BASELINE_SSSE3 | HWY_BASELINE_SVE2 | \
321 HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_PPC8 | \
322 HWY_BASELINE_WASM | HWY_BASELINE_RVV | HWY_BASELINE_SCALAR)
323#endif // !defined(HWY_BASELINE_TARGETS)
324
325//------------------------------------------------------------------------------
326// Choose target for static dispatch
327
// The baseline after removing disabled and blocklisted targets; it must not
// end up empty.
328#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
329#if !HWY_ENABLED_BASELINE
330#error "At least one baseline target must be defined and enabled"
331#endif
332
333// Best baseline, used for static dispatch. This is the least-significant 1-bit
334// within HWY_ENABLED_BASELINE and lower bit values imply "better".
// (x & -x) keeps only the lowest set bit of x, so the result is always a
// single target bit.
335#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
336
337// Start by assuming static dispatch. If we later use dynamic dispatch, this
338// will be defined to other targets during the multiple-inclusion, and finally
339// return to the initial value. Defining this outside begin/end_target ensures
340// inl headers successfully compile by themselves (required by Bazel).
341#define HWY_TARGET HWY_STATIC_TARGET
342
343//------------------------------------------------------------------------------
344// Choose targets for dynamic dispatch according to one of four policies
345
// The two HWY_COMPILE_ONLY_* policies are mutually exclusive.
346#if defined(HWY_COMPILE_ONLY_STATIC) && defined(HWY_COMPILE_ONLY_SCALAR)
347#error "Defined both HWY_COMPILE_ONLY_{SCALAR|STATIC} - bug?"
348#endif
349// Defining either HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
350
351// AVX3_DL is not widely available yet. To reduce code size and compile time,
352// only include it in the set of attainable targets (for dynamic dispatch) if
353// the user opts in, OR it is in the enabled baseline.
// The baseline test must use HWY_ENABLED_BASELINE: this header defines no
// macro named HWY_BASELINE, and an undefined identifier evaluates to 0 in #if.
// With a test that is always 0, a build whose baseline includes AVX3_DL (but
// without HWY_WANT_AVX3_DL) would select HWY_AVX3_DL as HWY_STATIC_TARGET yet
// omit it from HWY_ATTAINABLE_TARGETS, failing the "best baseline should be
// included in dynamic targets" check at the end of this file.
354#if defined(HWY_WANT_AVX3_DL) || (HWY_ENABLED_BASELINE & HWY_AVX3_DL)
355#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
356#else
357#define HWY_ATTAINABLE_AVX3_DL 0
358#endif
359
// SVE_256 is attainable on A64 when SVE is in the enabled baseline and SVE_256
// itself is neither disabled nor blocklisted.
360#if !HWY_ARCH_ARM_A64 || !(HWY_ENABLED_BASELINE & HWY_SVE)
361#define HWY_ATTAINABLE_SVE_256 0
362#else
363#define HWY_ATTAINABLE_SVE_256 HWY_ENABLED(HWY_SVE_256)
364#endif
365
// SVE2_128 is attainable on A64 when SVE2 is in the enabled baseline and
// SVE2_128 itself is neither disabled nor blocklisted.
366#if !HWY_ARCH_ARM_A64 || !(HWY_ENABLED_BASELINE & HWY_SVE2)
367#define HWY_ATTAINABLE_SVE2_128 0
368#else
369#define HWY_ATTAINABLE_SVE2_128 HWY_ENABLED(HWY_SVE2_128)
370#endif
371
372// Attainable = enabled, and the compiler permits intrinsics for the target
373// (even when it may not autovectorize for it). Consumed by policies 3 and 4.
374#if !HWY_ARCH_X86
375#define HWY_ATTAINABLE_TARGETS \
376 (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE_256 | HWY_ATTAINABLE_SVE2_128)
377#else // HWY_ARCH_X86
378#define HWY_ATTAINABLE_TARGETS \
379 HWY_ENABLED(HWY_ATTAINABLE_AVX3_DL | HWY_AVX3 | HWY_AVX2 | HWY_SSE4 | \
380 HWY_SSSE3 | HWY_BASELINE_SCALAR)
381#endif
382
// Defines HWY_TARGETS, the set of targets compiled for dynamic dispatch. The
// policies are tested in order and the first match wins, which is why either
// HWY_COMPILE_ONLY_* macro takes precedence over HWY_COMPILE_ALL_ATTAINABLE.
383// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
384// to ~HWY_SCALAR, but this is more explicit).
385#if defined(HWY_COMPILE_ONLY_SCALAR)
386#undef HWY_STATIC_TARGET
387#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
388#define HWY_TARGETS HWY_SCALAR
389
390// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
391#elif defined(HWY_COMPILE_ONLY_STATIC)
392#define HWY_TARGETS HWY_STATIC_TARGET
393
394// 3) For tests: include all attainable targets (in particular: scalar)
395#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
396#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
397
398// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
399// excluding superseded targets, in particular scalar.
// HWY_STATIC_TARGET is a single bit b, so (2 * b - 1) is a mask of b and all
// lower bits: the static target plus every "better" target.
400#else
401#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
402
403#endif // target policy
404
405// The multiple-inclusion mechanism and HWY_ONCE need HWY_STATIC_TARGET to be
406// among the dynamic targets, which in turn implies HWY_TARGETS != 0 and
407// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
408#if !(HWY_TARGETS & HWY_STATIC_TARGET)
409#error "Logic error: best baseline should be included in dynamic targets"
410#endif
411
412#endif // HIGHWAY_HWY_DETECT_TARGETS_H_