LLVM OpenMP* Runtime Library
kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
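
// Illustrative sketch (not part of the runtime): a simplified, hypothetical
// caller-side view of how the two entry points above cooperate. The real
// drivers live in kmp_dispatch.cpp and also handle shared-buffer rotation,
// ordered bookkeeping and OMPT callbacks; the ITT chunk argument is omitted
// here for brevity, and the local variable names are made up.
//
//   dispatch_private_info_template<kmp_int32> pr; // per-thread dispatch state
//   __kmp_dispatch_init_algorithm<kmp_int32>(loc, gtid, &pr, schedule, lb, ub,
//                                            st, chunk, nproc, tid);
//   kmp_int32 chunk_lb, chunk_ub, chunk_st, last;
//   while (__kmp_dispatch_next_algorithm<kmp_int32>(gtid, &pr, sh, &last,
//                                                   &chunk_lb, &chunk_ub,
//                                                   &chunk_st, nproc, tid)) {
//     for (kmp_int32 i = chunk_lb; i <= chunk_ub; i += chunk_st)
//       ; // execute iteration i
//   }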

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing
  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
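
// Illustrative sketch (not part of the runtime): the typical retry loop built
// on compare_and_swap<> above. The names below are hypothetical; the dispatch
// algorithms use this kind of pattern to atomically claim or update shared
// chunk state when several threads race for the same work.
//
//   volatile kmp_int32 *counter = /* some shared location */;
//   kmp_int32 old_val, new_val;
//   do {
//     old_val = *counter;    // snapshot the current value
//     new_val = old_val + 1; // compute the desired new value
//   } while (!compare_and_swap<kmp_int32>(counter, old_val, new_val));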

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
    Arguments:
        UT is unsigned 4- or 8-byte type
        spinner - memory location to check value
        checker - value against which spinner is compared (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj -- is higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if the lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT(), the latter should
        report the same address, not an address of the low-level spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
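
// Illustrative usage (this mirrors the ordered-section wait issued by
// __kmp_dispatch_deo below): spin until the shared ordered_iteration counter
// reaches this thread's lower bound; `sh` and `my_lower` are hypothetical.
//
//   __kmp_wait<kmp_uint32>(&sh->u.s.ordered_iteration, my_lower,
//                          __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));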

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}
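
// Worked example of the deo/dxo pairing above (illustrative; assume a dynamic
// schedule with chunk size 1, so each chunk covers a single iteration):
//   - pr->u.p.ordered_lower holds the iteration a thread is about to execute;
//   - __kmp_dispatch_deo spins (via __kmp_wait) until
//     sh->u.s.ordered_iteration >= ordered_lower, so iteration 5 cannot enter
//     its ordered region before iterations 0..4 have left theirs;
//   - __kmp_dispatch_dxo then bumps sh->u.s.ordered_iteration by one on exit,
//     releasing whichever thread is waiting on the next iteration.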

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
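
// Worked example (illustrative): __kmp_pow computes x^y by binary
// exponentiation, squaring x once per bit of y. For x = 0.5 and y = 5
// (binary 101), the loop multiplies s by x for bit 0 and by x^4 for bit 2,
// giving s = 0.5 * 0.0625 = 0.03125 = 0.5^5 in three iterations, not five.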

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (i.e., the total number of unassigned iterations in
   chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd,
   the behavior is wrong (one of the unit tests, sch_guided_analytical_basic.cpp,
   fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     workaround that in the caller code, by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
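
// Worked example (illustrative, with hypothetical inputs): for tc = 1000
// iterations, base = 0.75 and idx = 3, the estimate is
// ceil(1000 * 0.75^3) = ceil(421.875) = 422; because x != (UT)x the function
// returns r + 1 = 422 rather than the truncated value 421.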

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.,
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
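
// Worked example (illustrative): with the defaults above (n = 2), a team of
// nproc = 4 threads and a 1000-iteration loop, p3 = 1 / (2 * 4) = 0.125, so
// the first chunk handed out is roughly 1000 * 0.125 = 125 iterations; with
// n = 1 it would be 1000 / 4 = 250, the static-schedule size, as noted above.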
#endif // KMP_DISPATCH_H