1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside any dispatch loop; however, it may
16 * change value between parallel regions. __kmp_max_nth is the largest
17 * value __kmp_nth may take, and 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56 }
57 }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // Nonmonotonic as default for dynamic schedule when no modifier is specified
76 int monotonicity = SCHEDULE_NONMONOTONIC;
77
78 // Let the default be monotonic for executables
79 // compiled with OpenMP* 4.5 or earlier compilers
80 if (loc != NULL && loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier || __kmp_force_monotonic)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91}
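// Illustrative sketch (not compiled into the runtime; loc is assumed to be the
// caller's ident_t and __kmp_force_monotonic is assumed unset): how a schedule
// carrying an explicit modifier bit resolves through the helper above.
#if 0
  // schedule(nonmonotonic: dynamic, 4) arrives with the modifier bit set:
  enum sched_type s = (enum sched_type)(kmp_sch_dynamic_chunked |
                                        kmp_sch_modifier_nonmonotonic);
  int mono = __kmp_get_monotonicity(loc, s); // SCHEDULE_NONMONOTONIC
  s = SCHEDULE_WITHOUT_MODIFIERS(s);         // back to kmp_sch_dynamic_chunked
#endif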
92
93#if KMP_STATIC_STEAL_ENABLED
94enum { // values for steal_flag (possible states of private per-loop buffer)
95 UNUSED = 0,
96 CLAIMED = 1, // owner thread started initialization
97 READY = 2, // available for stealing
98 THIEF = 3 // finished by owner, or claimed by thief
99 // possible state changes:
100 // 0 -> 1 owner only, sync
101 // 0 -> 3 thief only, sync
102 // 1 -> 2 owner only, async
103 // 2 -> 3 owner only, async
104 // 3 -> 2 owner only, async
105 // 3 -> 0 last thread finishing the loop, async
106};
107#endif
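// A minimal sketch of the claiming race the states above resolve, with C++11
// atomics standing in for the runtime's KMP_ATOMIC_* wrappers (an assumption
// for illustration only): the owner attempts UNUSED -> CLAIMED while a thief
// that finds the buffer UNUSED attempts UNUSED -> THIEF; exactly one CAS wins.
#if 0
#include <atomic>
static std::atomic<kmp_uint32> steal_flag{UNUSED};

static bool owner_claims() { // owner path (0 -> 1); publishes READY later
  kmp_uint32 old = UNUSED;
  return steal_flag.compare_exchange_strong(old, CLAIMED);
}
static bool thief_claims() { // thief path (0 -> 3); takes the whole range
  kmp_uint32 old = UNUSED;
  return steal_flag.compare_exchange_strong(old, THIEF);
}
#endif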
108
109// Initialize a dispatch_private_info_template<T> buffer for a particular
110// type of schedule and chunk. The loop description is found in lb (lower bound),
111// ub (upper bound), and st (stride). nproc is the number of threads relevant
112// to the scheduling (often the number of threads in a team, but not always if
113// hierarchical scheduling is used). tid is the id of the thread calling
114// the function within the group of nproc threads. It will have a value
115// between 0 and nproc - 1. This is often just the thread id within a team, but
116// is not necessarily the case when using hierarchical scheduling.
117// loc is the source file location of the corresponding loop
118// gtid is the global thread id
119template <typename T>
120void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121 dispatch_private_info_template<T> *pr,
122 enum sched_type schedule, T lb, T ub,
123 typename traits_t<T>::signed_t st,
124#if USE_ITT_BUILD
125 kmp_uint64 *cur_chunk,
126#endif
127 typename traits_t<T>::signed_t chunk,
128 T nproc, T tid) {
129 typedef typename traits_t<T>::unsigned_t UT;
130 typedef typename traits_t<T>::floating_t DBL;
131
132 int active;
133 T tc;
134 kmp_info_t *th;
135 kmp_team_t *team;
136 int monotonicity;
137 bool use_hier;
138
139#ifdef KMP_DEBUG
140 typedef typename traits_t<T>::signed_t ST;
141 {
142 char *buff;
143 // create format specifiers before the debug output
144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147 traits_t<T>::spec, traits_t<T>::spec,
148 traits_t<ST>::spec, traits_t<ST>::spec,
149 traits_t<T>::spec, traits_t<T>::spec);
150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151 __kmp_str_free(&buff);
152 }
153#endif
154 /* setup data */
155 th = __kmp_threads[gtid];
156 team = th->th.th_team;
157 active = !team->t.t_serialized;
158
159#if USE_ITT_BUILD
160 int itt_need_metadata_reporting =
161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163 team->t.t_active_level == 1;
164#endif
165
166#if KMP_USE_HIER_SCHED
167 use_hier = pr->flags.use_hier;
168#else
169 use_hier = false;
170#endif
171
172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175
176 /* Pick up the nomerge/ordered bits from the scheduling type */
177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178 pr->flags.nomerge = TRUE;
179 schedule =
180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181 } else {
182 pr->flags.nomerge = FALSE;
183 }
184 pr->type_size = traits_t<T>::type_size; // remember the size of variables
185 if (kmp_ord_lower & schedule) {
186 pr->flags.ordered = TRUE;
187 schedule =
188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189 } else {
190 pr->flags.ordered = FALSE;
191 }
192 // Ordered overrides nonmonotonic
193 if (pr->flags.ordered) {
194 monotonicity = SCHEDULE_MONOTONIC;
195 }
196
197 if (schedule == kmp_sch_static) {
198 schedule = __kmp_static;
199 } else {
200 if (schedule == kmp_sch_runtime) {
201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202 // not specified)
203 schedule = team->t.t_sched.r_sched_type;
204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207 monotonicity = SCHEDULE_MONOTONIC;
208 // Detail the schedule if needed (global controls are differentiated
209 // appropriately)
210 if (schedule == kmp_sch_guided_chunked) {
211 schedule = __kmp_guided;
212 } else if (schedule == kmp_sch_static) {
213 schedule = __kmp_static;
214 }
215 // Use the chunk size specified by OMP_SCHEDULE (or default if not
216 // specified)
217 chunk = team->t.t_sched.chunk;
218#if USE_ITT_BUILD
219 if (cur_chunk)
220 *cur_chunk = chunk;
221#endif
222#ifdef KMP_DEBUG
223 {
224 char *buff;
225 // create format specifiers before the debug output
226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227 "schedule:%%d chunk:%%%s\n",
228 traits_t<ST>::spec);
229 KD_TRACE(10, (buff, gtid, schedule, chunk));
230 __kmp_str_free(&buff);
231 }
232#endif
233 } else {
234 if (schedule == kmp_sch_guided_chunked) {
235 schedule = __kmp_guided;
236 }
237 if (chunk <= 0) {
238 chunk = KMP_DEFAULT_CHUNK;
239 }
240 }
241
242 if (schedule == kmp_sch_auto) {
243 // mapping and differentiation are done in __kmp_do_serial_initialize()
244 schedule = __kmp_auto;
245#ifdef KMP_DEBUG
246 {
247 char *buff;
248 // create format specifiers before the debug output
249 buff = __kmp_str_format(
250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251 "schedule:%%d chunk:%%%s\n",
252 traits_t<ST>::spec);
253 KD_TRACE(10, (buff, gtid, schedule, chunk));
254 __kmp_str_free(&buff);
255 }
256#endif
257 }
258#if KMP_STATIC_STEAL_ENABLED
259 // map nonmonotonic:dynamic to static steal
260 if (schedule == kmp_sch_dynamic_chunked) {
261 if (monotonicity == SCHEDULE_NONMONOTONIC)
262 schedule = kmp_sch_static_steal;
263 }
264#endif
265 /* guided analytical not safe for too many threads */
266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267 schedule = kmp_sch_guided_iterative_chunked;
268 KMP_WARNING(DispatchManyThreads);
269 }
270 if (schedule == kmp_sch_runtime_simd) {
271 // compiler provides simd_width in the chunk parameter
272 schedule = team->t.t_sched.r_sched_type;
273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275 // Detail the schedule if needed (global controls are differentiated
276 // appropriately)
277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278 schedule == __kmp_static) {
279 schedule = kmp_sch_static_balanced_chunked;
280 } else {
281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282 schedule = kmp_sch_guided_simd;
283 }
284 chunk = team->t.t_sched.chunk * chunk;
285 }
286#if USE_ITT_BUILD
287 if (cur_chunk)
288 *cur_chunk = chunk;
289#endif
290#ifdef KMP_DEBUG
291 {
292 char *buff;
293 // create format specifiers before the debug output
294 buff = __kmp_str_format(
295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296 " chunk:%%%s\n",
297 traits_t<ST>::spec);
298 KD_TRACE(10, (buff, gtid, schedule, chunk));
299 __kmp_str_free(&buff);
300 }
301#endif
302 }
303 pr->u.p.parm1 = chunk;
304 }
305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306 "unknown scheduling type");
307
308 pr->u.p.count = 0;
309
310 if (__kmp_env_consistency_check) {
311 if (st == 0) {
312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314 }
315 }
316 // compute trip count
317 if (st == 1) { // most common case
318 if (ub >= lb) {
319 tc = ub - lb + 1;
320 } else { // ub < lb
321 tc = 0; // zero-trip
322 }
323 } else if (st < 0) {
324 if (lb >= ub) {
325 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326 // where the division needs to be unsigned regardless of the result type
327 tc = (UT)(lb - ub) / (-st) + 1;
328 } else { // lb < ub
329 tc = 0; // zero-trip
330 }
331 } else { // st > 0
332 if (ub >= lb) {
333 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334 // where the division needs to be unsigned regardless of the result type
335 tc = (UT)(ub - lb) / st + 1;
336 } else { // ub < lb
337 tc = 0; // zero-trip
338 }
339 }
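// Worked example of the trip-count formulas above: lb = 5, ub = 17, st = 4
// gives tc = (17 - 5) / 4 + 1 = 4 iterations (5, 9, 13, 17), and lb = 10,
// ub = 1, st = -3 gives tc = (10 - 1) / 3 + 1 = 4 iterations (10, 7, 4, 1).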
340
341#if KMP_STATS_ENABLED
342 if (KMP_MASTER_GTID(gtid)) {
343 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344 }
345#endif
346
347 pr->u.p.lb = lb;
348 pr->u.p.ub = ub;
349 pr->u.p.st = st;
350 pr->u.p.tc = tc;
351
352#if KMP_OS_WINDOWS
353 pr->u.p.last_upper = ub + st;
354#endif /* KMP_OS_WINDOWS */
355
356 /* NOTE: only active parallel regions have active ordered sections */
357
358 if (active) {
359 if (pr->flags.ordered) {
360 pr->ordered_bumped = 0;
361 pr->u.p.ordered_lower = 1;
362 pr->u.p.ordered_upper = 0;
363 }
364 }
365
366 switch (schedule) {
367#if KMP_STATIC_STEAL_ENABLED
368 case kmp_sch_static_steal: {
369 T ntc, init;
370
371 KD_TRACE(100,
372 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373 gtid));
374
375 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376 if (nproc > 1 && ntc >= nproc) {
377 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378 T id = tid;
379 T small_chunk, extras;
380 kmp_uint32 old = UNUSED;
381 int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382 if (traits_t<T>::type_size > 4) {
383 // AC: TODO: check if 16-byte CAS available and use it to
384 // improve performance (probably wait for explicit request
385 // before spending time on this).
386 // For now use dynamically allocated per-private-buffer lock,
387 // free memory in __kmp_dispatch_next when status==0.
388 pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389 __kmp_init_lock(pr->u.p.steal_lock);
390 }
391 small_chunk = ntc / nproc;
392 extras = ntc % nproc;
393
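// Worked example of the split below: ntc = 10 chunks over nproc = 4 threads
// gives small_chunk = 2 and extras = 2, so threads 0..3 start at chunk
// 0, 3, 6, 8 and initially own 3, 3, 2, 2 chunks respectively.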
394 init = id * small_chunk + (id < extras ? id : extras);
395 pr->u.p.count = init;
396 if (claimed) { // did we succeed in claiming our own buffer?
397 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398 // Other threads will inspect steal_flag when searching for a victim.
399 // READY means other threads may steal from this thread from now on.
400 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401 } else {
402 // another thread has stolen our whole range
403 KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404 pr->u.p.ub = init; // mark that there are no iterations to work on
405 }
406 pr->u.p.parm2 = ntc; // save number of chunks
407 // parm3 is the number of times to attempt stealing, which is
408 // nproc (just a heuristic; this could be optimized later on).
409 pr->u.p.parm3 = nproc;
410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411 break;
412 } else {
413 /* too few chunks: switching to kmp_sch_dynamic_chunked */
414 schedule = kmp_sch_dynamic_chunked;
415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416 "kmp_sch_dynamic_chunked\n",
417 gtid));
418 goto dynamic_init;
419 break;
420 } // if
421 } // case
422#endif
423 case kmp_sch_static_balanced: {
424 T init, limit;
425
426 KD_TRACE(
427 100,
428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429 gtid));
430
431 if (nproc > 1) {
432 T id = tid;
433
434 if (tc < nproc) {
435 if (id < tc) {
436 init = id;
437 limit = id;
438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439 } else {
440 pr->u.p.count = 1; /* means no more chunks to execute */
441 pr->u.p.parm1 = FALSE;
442 break;
443 }
444 } else {
445 T small_chunk = tc / nproc;
446 T extras = tc % nproc;
447 init = id * small_chunk + (id < extras ? id : extras);
448 limit = init + small_chunk - (id < extras ? 0 : 1);
449 pr->u.p.parm1 = (id == nproc - 1);
450 }
451 } else {
452 if (tc > 0) {
453 init = 0;
454 limit = tc - 1;
455 pr->u.p.parm1 = TRUE;
456 } else {
457 // zero trip count
458 pr->u.p.count = 1; /* means no more chunks to execute */
459 pr->u.p.parm1 = FALSE;
460 break;
461 }
462 }
463#if USE_ITT_BUILD
464 // Calculate chunk for metadata report
465 if (itt_need_metadata_reporting)
466 if (cur_chunk)
467 *cur_chunk = limit - init + 1;
468#endif
469 if (st == 1) {
470 pr->u.p.lb = lb + init;
471 pr->u.p.ub = lb + limit;
472 } else {
473 // calculated upper bound, "ub" is user-defined upper bound
474 T ub_tmp = lb + limit * st;
475 pr->u.p.lb = lb + init * st;
476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477 // it exactly
478 if (st > 0) {
479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480 } else {
481 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482 }
483 }
484 if (pr->flags.ordered) {
485 pr->u.p.ordered_lower = init;
486 pr->u.p.ordered_upper = limit;
487 }
488 break;
489 } // case
490 case kmp_sch_static_balanced_chunked: {
491 // similar to balanced, but chunk adjusted to multiple of simd width
492 T nth = nproc;
493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494 " -> falling-through to static_greedy\n",
495 gtid));
496 schedule = kmp_sch_static_greedy;
497 if (nth > 1)
498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499 else
500 pr->u.p.parm1 = tc;
501 break;
502 } // case
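// Note on the kmp_sch_static_balanced_chunked case above: the mask form of the
// rounding assumes chunk (the simd width supplied by the compiler) is a power
// of two. For example, tc = 1000, nth = 8, chunk = 8 gives a per-thread share
// of ceil(1000 / 8) = 125, which (125 + 7) & ~7 rounds up to 128.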
503 case kmp_sch_guided_simd:
504 case kmp_sch_guided_iterative_chunked: {
505 KD_TRACE(
506 100,
507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508 " case\n",
509 gtid));
510
511 if (nproc > 1) {
512 if ((2L * chunk + 1) * nproc >= tc) {
513 /* chunk size too large, switch to dynamic */
514 schedule = kmp_sch_dynamic_chunked;
515 goto dynamic_init;
516 } else {
517 // when remaining iters become less than parm2 - switch to dynamic
518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519 *(double *)&pr->u.p.parm3 =
520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4
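// Illustration of the two parameters (assuming the kmp.h defaults
// guided_int_param == 2 and guided_flt_param == 0.5): with nproc = 4 and
// chunk = 7, parm2 = 2 * 4 * 8 = 64 is the remaining-iteration threshold
// below which dispatch degrades to plain dynamic, and parm3 = 0.5 / 4 = 0.125
// is the fraction of the remaining iterations handed out per guided chunk,
// i.e. remaining / (2 * nproc).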
521 }
522 } else {
523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524 "kmp_sch_static_greedy\n",
525 gtid));
526 schedule = kmp_sch_static_greedy;
527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528 KD_TRACE(
529 100,
530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531 gtid));
532 pr->u.p.parm1 = tc;
533 } // if
534 } // case
535 break;
536 case kmp_sch_guided_analytical_chunked: {
537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538 "kmp_sch_guided_analytical_chunked case\n",
539 gtid));
540
541 if (nproc > 1) {
542 if ((2L * chunk + 1) * nproc >= tc) {
543 /* chunk size too large, switch to dynamic */
544 schedule = kmp_sch_dynamic_chunked;
545 goto dynamic_init;
546 } else {
547 /* commonly used term: (2 nproc - 1)/(2 nproc) */
548 DBL x;
549
550#if KMP_USE_X87CONTROL
551 /* Linux* OS already has 64-bit computation by default for long double,
552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554 instead of the default 53-bit. Even though long double doesn't work
555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556 expected to impact the correctness of the algorithm, but this has not
557 been mathematically proven. */
558 // save original FPCW and set precision to 64-bit, as
559 // Windows* OS on IA-32 architecture defaults to 53-bit
560 unsigned int oldFpcw = _control87(0, 0);
561 _control87(_PC_64, _MCW_PC); // 0,0x30000
562#endif
563 /* value used for comparison in solver for cross-over point */
564 KMP_ASSERT(tc > 0);
565 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
566
567 /* crossover point--chunk indexes equal to or greater than
568 this point switch to dynamic-style scheduling */
569 UT cross;
570
571 /* commonly used term: (2 nproc - 1)/(2 nproc) */
572 x = 1.0 - 0.5 / (double)nproc;
573
574#ifdef KMP_DEBUG
575 { // test natural alignment
576 struct _test_a {
577 char a;
578 union {
579 char b;
580 DBL d;
581 };
582 } t;
583 ptrdiff_t natural_alignment =
584 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
585 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
586 // long)natural_alignment );
587 KMP_DEBUG_ASSERT(
588 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
589 }
590#endif // KMP_DEBUG
591
592 /* save the term in thread private dispatch structure */
593 *(DBL *)&pr->u.p.parm3 = x;
594
595 /* solve for the crossover point to the nearest integer i for which C_i
596 <= chunk */
597 {
598 UT left, right, mid;
599 long double p;
600
601 /* estimate initial upper and lower bound */
602
603 /* doesn't matter what value right is as long as it is positive, but
604 it affects performance of the solver */
605 right = 229;
606 p = __kmp_pow<UT>(x, right);
607 if (p > target) {
608 do {
609 p *= p;
610 right <<= 1;
611 } while (p > target && right < (1 << 27));
612 /* lower bound is previous (failed) estimate of upper bound */
613 left = right >> 1;
614 } else {
615 left = 0;
616 }
617
618 /* bisection root-finding method */
619 while (left + 1 < right) {
620 mid = (left + right) / 2;
621 if (__kmp_pow<UT>(x, mid) > target) {
622 left = mid;
623 } else {
624 right = mid;
625 }
626 } // while
627 cross = right;
628 }
629 /* assert sanity of computed crossover point */
630 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
631 __kmp_pow<UT>(x, cross) <= target);
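/* In other words: modeling the work remaining after i chunks as tc * x^i
   with x = (2*nproc - 1) / (2*nproc), each guided chunk takes
   remaining / (2*nproc) iterations, and x^cross <= target is algebraically
   the first point at which that size falls to chunk + 1/2, i.e. where
   fixed chunks of size `chunk` take over. */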
632
633 /* save the crossover point in thread private dispatch structure */
634 pr->u.p.parm2 = cross;
635
636// C75803
637#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
638#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
639#else
640#define GUIDED_ANALYTICAL_WORKAROUND (x)
641#endif
642 /* dynamic-style scheduling offset */
643 pr->u.p.count = tc -
644 __kmp_dispatch_guided_remaining(
645 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
646 cross * chunk;
647#if KMP_USE_X87CONTROL
648 // restore FPCW
649 _control87(oldFpcw, _MCW_PC);
650#endif
651 } // if
652 } else {
653 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
654 "kmp_sch_static_greedy\n",
655 gtid));
656 schedule = kmp_sch_static_greedy;
657 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
658 pr->u.p.parm1 = tc;
659 } // if
660 } // case
661 break;
662 case kmp_sch_static_greedy:
663 KD_TRACE(
664 100,
665 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
666 gtid));
667 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
668 break;
669 case kmp_sch_static_chunked:
670 case kmp_sch_dynamic_chunked:
671 dynamic_init:
672 if (tc == 0)
673 break;
674 if (pr->u.p.parm1 <= 0)
675 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
676 else if (pr->u.p.parm1 > tc)
677 pr->u.p.parm1 = tc;
678 // Store the total number of chunks to prevent integer overflow during
679 // bounds calculations in the get next chunk routine.
680 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
681 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
682 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
683 gtid));
684 break;
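// Example of the overflow the nchunks bound above prevents: with a 32-bit
// unsigned type and tc close to UINT_MAX, the shared iteration counter still
// advances a little past the last chunk as each thread makes its final call,
// so a bounds check based on chunk_size * chunk_number (rather than on the
// chunk count) could wrap around and accept a bogus chunk.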
685 case kmp_sch_trapezoidal: {
686 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
687
688 T parm1, parm2, parm3, parm4;
689 KD_TRACE(100,
690 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
691 gtid));
692
693 parm1 = chunk;
694
695 /* F : size of the first cycle */
696 parm2 = (tc / (2 * nproc));
697
698 if (parm2 < 1) {
699 parm2 = 1;
700 }
701
702 /* L : size of the last cycle. Make sure the last cycle is not larger
703 than the first cycle. */
704 if (parm1 < 1) {
705 parm1 = 1;
706 } else if (parm1 > parm2) {
707 parm1 = parm2;
708 }
709
710 /* N : number of cycles */
711 parm3 = (parm2 + parm1);
712 parm3 = (2 * tc + parm3 - 1) / parm3;
713
714 if (parm3 < 2) {
715 parm3 = 2;
716 }
717
718 /* sigma : decreasing incr of the trapezoid */
719 parm4 = (parm3 - 1);
720 parm4 = (parm2 - parm1) / parm4;
721
722 // pointless check, because parm4 >= 0 always
723 // if ( parm4 < 0 ) {
724 // parm4 = 0;
725 //}
726
727 pr->u.p.parm1 = parm1;
728 pr->u.p.parm2 = parm2;
729 pr->u.p.parm3 = parm3;
730 pr->u.p.parm4 = parm4;
731 } // case
732 break;
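// Worked example of the TSS parameters above: tc = 1000, nproc = 4, chunk = 1
// gives F = parm2 = 1000 / 8 = 125, L = parm1 = 1, N = parm3 =
// (2000 + 125) / 126 = 16 and sigma = parm4 = 124 / 15 = 8, i.e. chunk sizes
// 125, 117, 109, ... decreasing by 8 per dispatch until the loop is exhausted.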
733
734 default: {
735 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
736 KMP_HNT(GetNewerLibrary), // Hint
737 __kmp_msg_null // Variadic argument list terminator
738 );
739 } break;
740 } // switch
741 pr->schedule = schedule;
742}
743
744#if KMP_USE_HIER_SCHED
745template <typename T>
746inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
747 typename traits_t<T>::signed_t st);
748template <>
749inline void
750__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
751 kmp_int32 ub, kmp_int32 st) {
752 __kmp_dispatch_init_hierarchy<kmp_int32>(
753 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
754 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
755}
756template <>
757inline void
758__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
759 kmp_uint32 ub, kmp_int32 st) {
760 __kmp_dispatch_init_hierarchy<kmp_uint32>(
761 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
762 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
763}
764template <>
765inline void
766__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
767 kmp_int64 ub, kmp_int64 st) {
768 __kmp_dispatch_init_hierarchy<kmp_int64>(
769 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
770 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
771}
772template <>
773inline void
774__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
775 kmp_uint64 ub, kmp_int64 st) {
776 __kmp_dispatch_init_hierarchy<kmp_uint64>(
777 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
778 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
779}
780
781// free all the hierarchy scheduling memory associated with the team
782void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
783 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
784 for (int i = 0; i < num_disp_buff; ++i) {
785 // type does not matter here so use kmp_int32
786 auto sh =
787 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
788 &team->t.t_disp_buffer[i]);
789 if (sh->hier) {
790 sh->hier->deallocate();
791 __kmp_free(sh->hier);
792 }
793 }
794}
795#endif
796
797// UT - unsigned flavor of T, ST - signed flavor of T,
798// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
799template <typename T>
800static void
801__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
802 T ub, typename traits_t<T>::signed_t st,
803 typename traits_t<T>::signed_t chunk, int push_ws) {
804 typedef typename traits_t<T>::unsigned_t UT;
805
806 int active;
807 kmp_info_t *th;
808 kmp_team_t *team;
809 kmp_uint32 my_buffer_index;
810 dispatch_private_info_template<T> *pr;
811 dispatch_shared_info_template<T> volatile *sh;
812
813 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
814 sizeof(dispatch_private_info));
815 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
816 sizeof(dispatch_shared_info));
817 __kmp_assert_valid_gtid(gtid);
818
819 if (!TCR_4(__kmp_init_parallel))
820 __kmp_parallel_initialize();
821
822 __kmp_resume_if_soft_paused();
823
824#if INCLUDE_SSC_MARKS
825 SSC_MARK_DISPATCH_INIT();
826#endif
827#ifdef KMP_DEBUG
828 typedef typename traits_t<T>::signed_t ST;
829 {
830 char *buff;
831 // create format specifiers before the debug output
832 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
833 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
834 traits_t<ST>::spec, traits_t<T>::spec,
835 traits_t<T>::spec, traits_t<ST>::spec);
836 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
837 __kmp_str_free(&buff);
838 }
839#endif
840 /* setup data */
841 th = __kmp_threads[gtid];
842 team = th->th.th_team;
843 active = !team->t.t_serialized;
844 th->th.th_ident = loc;
845
846 // Any half-decent optimizer will remove this test when the blocks are empty
847 // since the macros expand to nothing
848 // when statistics are disabled.
849 if (schedule == __kmp_static) {
850 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
851 } else {
852 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
853 }
854
855#if KMP_USE_HIER_SCHED
856 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
857 // environment variable. Hierarchical scheduling does not work with ordered,
858 // so if ordered is detected, revert to threaded scheduling.
859 bool ordered;
860 enum sched_type my_sched = schedule;
861 my_buffer_index = th->th.th_dispatch->th_disp_index;
862 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
863 &th->th.th_dispatch
864 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
865 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
866 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
867 my_sched =
868 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
869 ordered = (kmp_ord_lower & my_sched);
870 if (pr->flags.use_hier) {
871 if (ordered) {
872 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
873 "Disabling hierarchical scheduling.\n",
874 gtid));
875 pr->flags.use_hier = FALSE;
876 }
877 }
878 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
879 // Don't use hierarchical for ordered parallel loops and don't
880 // use the runtime hierarchy if one was specified in the program
881 if (!ordered && !pr->flags.use_hier)
882 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
883 }
884#endif // KMP_USE_HIER_SCHED
885
886#if USE_ITT_BUILD
887 kmp_uint64 cur_chunk = chunk;
888 int itt_need_metadata_reporting =
889 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
890 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
891 team->t.t_active_level == 1;
892#endif
893 if (!active) {
894 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
895 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
896 } else {
897 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
898 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
899
900 my_buffer_index = th->th.th_dispatch->th_disp_index++;
901
902 /* What happens when the number of threads changes? Do we need to resize the buffer? */
903 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
904 &th->th.th_dispatch
905 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
906 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
907 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
908 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
909 my_buffer_index));
910 if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
911 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
912 " sh->buffer_index:%d\n",
913 gtid, my_buffer_index, sh->buffer_index));
914 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
915 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
916 // Note: KMP_WAIT() cannot be used here: buffer index and
917 // my_buffer_index are *always* 32-bit integers.
918 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
919 "sh->buffer_index:%d\n",
920 gtid, my_buffer_index, sh->buffer_index));
921 }
922 }
923
924 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
925#if USE_ITT_BUILD
926 &cur_chunk,
927#endif
928 chunk, (T)th->th.th_team_nproc,
929 (T)th->th.th_info.ds.ds_tid);
930 if (active) {
931 if (pr->flags.ordered == 0) {
932 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
933 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
934 } else {
935 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
936 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
937 }
938 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
939 th->th.th_dispatch->th_dispatch_sh_current =
940 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
941#if USE_ITT_BUILD
942 if (pr->flags.ordered) {
943 __kmp_itt_ordered_init(gtid);
944 }
945 // Report loop metadata
946 if (itt_need_metadata_reporting) {
947 // Only report metadata by primary thread of active team at level 1
948 kmp_uint64 schedtype = 0;
949 switch (schedule) {
950 case kmp_sch_static_chunked:
951 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
952 break;
953 case kmp_sch_static_greedy:
954 cur_chunk = pr->u.p.parm1;
955 break;
956 case kmp_sch_dynamic_chunked:
957 schedtype = 1;
958 break;
959 case kmp_sch_guided_iterative_chunked:
960 case kmp_sch_guided_analytical_chunked:
961 case kmp_sch_guided_simd:
962 schedtype = 2;
963 break;
964 default:
965 // Should we put this case under "static"?
966 // case kmp_sch_static_steal:
967 schedtype = 3;
968 break;
969 }
970 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
971 }
972#if KMP_USE_HIER_SCHED
973 if (pr->flags.use_hier) {
974 pr->u.p.count = 0;
975 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
976 }
977#endif // KMP_USE_HIER_SCHED
978#endif /* USE_ITT_BUILD */
979 }
980
981#ifdef KMP_DEBUG
982 {
983 char *buff;
984 // create format specifiers before the debug output
985 buff = __kmp_str_format(
986 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
987 "lb:%%%s ub:%%%s"
988 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
989 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
990 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
991 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
992 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
993 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
994 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
995 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
996 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
997 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
998 __kmp_str_free(&buff);
999 }
1000#endif
1001#if OMPT_SUPPORT && OMPT_OPTIONAL
1002 if (ompt_enabled.ompt_callback_work) {
1003 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1004 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1005 ompt_callbacks.ompt_callback(ompt_callback_work)(
1006 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1007 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1008 }
1009#endif
1010 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1011}
1012
1013/* For ordered loops, either __kmp_dispatch_finish() should be called after
1014 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1015 * every chunk of iterations. If the ordered section(s) were not executed
1016 * for this iteration (or every iteration in this chunk), we need to set the
1017 * ordered iteration counters so that the next thread can proceed. */
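/* A minimal sketch of the intended calling pattern (illustrative only; the
   real callers are the compiler-generated __kmpc_dispatch_* entry points and
   the ordered hooks installed in th_deo_fcn / th_dxo_fcn):

     while (__kmp_dispatch_next(...)) {      // returns one chunk [lb, ub]
       for (i = lb; i <= ub; i += st) {
         ... loop body containing the ordered construct ...
         __kmp_dispatch_finish(gtid, loc);   // bump the ordered counter
       }
       // or, once per chunk on the GOMP-compat path:
       // __kmp_dispatch_finish_chunk(gtid, loc);
     }
*/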
1018template <typename UT>
1019static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1020 typedef typename traits_t<UT>::signed_t ST;
1021 __kmp_assert_valid_gtid(gtid);
1022 kmp_info_t *th = __kmp_threads[gtid];
1023
1024 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1025 if (!th->th.th_team->t.t_serialized) {
1026
1027 dispatch_private_info_template<UT> *pr =
1028 reinterpret_cast<dispatch_private_info_template<UT> *>(
1029 th->th.th_dispatch->th_dispatch_pr_current);
1030 dispatch_shared_info_template<UT> volatile *sh =
1031 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1032 th->th.th_dispatch->th_dispatch_sh_current);
1033 KMP_DEBUG_ASSERT(pr);
1034 KMP_DEBUG_ASSERT(sh);
1035 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1036 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1037
1038 if (pr->ordered_bumped) {
1039 KD_TRACE(
1040 1000,
1041 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1042 gtid));
1043 pr->ordered_bumped = 0;
1044 } else {
1045 UT lower = pr->u.p.ordered_lower;
1046
1047#ifdef KMP_DEBUG
1048 {
1049 char *buff;
1050 // create format specifiers before the debug output
1051 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1052 "ordered_iteration:%%%s lower:%%%s\n",
1053 traits_t<UT>::spec, traits_t<UT>::spec);
1054 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1055 __kmp_str_free(&buff);
1056 }
1057#endif
1058
1059 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1060 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1061 KMP_MB(); /* is this necessary? */
1062#ifdef KMP_DEBUG
1063 {
1064 char *buff;
1065 // create format specifiers before the debug output
1066 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1067 "ordered_iteration:%%%s lower:%%%s\n",
1068 traits_t<UT>::spec, traits_t<UT>::spec);
1069 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1070 __kmp_str_free(&buff);
1071 }
1072#endif
1073
1074 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1075 } // if
1076 } // if
1077 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1078}
1079
1080#ifdef KMP_GOMP_COMPAT
1081
1082template <typename UT>
1083static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1084 typedef typename traits_t<UT>::signed_t ST;
1085 __kmp_assert_valid_gtid(gtid);
1086 kmp_info_t *th = __kmp_threads[gtid];
1087
1088 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1089 if (!th->th.th_team->t.t_serialized) {
1090 dispatch_private_info_template<UT> *pr =
1091 reinterpret_cast<dispatch_private_info_template<UT> *>(
1092 th->th.th_dispatch->th_dispatch_pr_current);
1093 dispatch_shared_info_template<UT> volatile *sh =
1094 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1095 th->th.th_dispatch->th_dispatch_sh_current);
1096 KMP_DEBUG_ASSERT(pr);
1097 KMP_DEBUG_ASSERT(sh);
1098 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1099 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1100
1101 UT lower = pr->u.p.ordered_lower;
1102 UT upper = pr->u.p.ordered_upper;
1103 UT inc = upper - lower + 1;
1104
1105 if (pr->ordered_bumped == inc) {
1106 KD_TRACE(
1107 1000,
1108 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1109 gtid));
1110 pr->ordered_bumped = 0;
1111 } else {
1112 inc -= pr->ordered_bumped;
1113
1114#ifdef KMP_DEBUG
1115 {
1116 char *buff;
1117 // create format specifiers before the debug output
1118 buff = __kmp_str_format(
1119 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1120 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1121 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1122 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1123 __kmp_str_free(&buff);
1124 }
1125#endif
1126
1127 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1128 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1129
1130 KMP_MB(); /* is this necessary? */
1131 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1132 "ordered_bumped to zero\n",
1133 gtid));
1134 pr->ordered_bumped = 0;
1136#ifdef KMP_DEBUG
1137 {
1138 char *buff;
1139 // create format specifiers before the debug output
1140 buff = __kmp_str_format(
1141 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1142 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1143 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1144 traits_t<UT>::spec);
1145 KD_TRACE(1000,
1146 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1147 __kmp_str_free(&buff);
1148 }
1149#endif
1150
1151 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1152 }
1153 // }
1154 }
1155 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1156}
1157
1158#endif /* KMP_GOMP_COMPAT */
1159
1160template <typename T>
1161int __kmp_dispatch_next_algorithm(int gtid,
1162 dispatch_private_info_template<T> *pr,
1163 dispatch_shared_info_template<T> volatile *sh,
1164 kmp_int32 *p_last, T *p_lb, T *p_ub,
1165 typename traits_t<T>::signed_t *p_st, T nproc,
1166 T tid) {
1167 typedef typename traits_t<T>::unsigned_t UT;
1168 typedef typename traits_t<T>::signed_t ST;
1169 typedef typename traits_t<T>::floating_t DBL;
1170 int status = 0;
1171 bool last = false;
1172 T start;
1173 ST incr;
1174 UT limit, trip, init;
1175 kmp_info_t *th = __kmp_threads[gtid];
1176 kmp_team_t *team = th->th.th_team;
1177
1178 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1179 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1180 KMP_DEBUG_ASSERT(pr);
1181 KMP_DEBUG_ASSERT(sh);
1182 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1183#ifdef KMP_DEBUG
1184 {
1185 char *buff;
1186 // create format specifiers before the debug output
1187 buff =
1188 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1189 "sh:%%p nproc:%%%s tid:%%%s\n",
1190 traits_t<T>::spec, traits_t<T>::spec);
1191 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1192 __kmp_str_free(&buff);
1193 }
1194#endif
1195
1196 // zero trip count
1197 if (pr->u.p.tc == 0) {
1198 KD_TRACE(10,
1199 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1200 "zero status:%d\n",
1201 gtid, status));
1202 return 0;
1203 }
1204
1205 switch (pr->schedule) {
1206#if KMP_STATIC_STEAL_ENABLED
1207 case kmp_sch_static_steal: {
1208 T chunk = pr->u.p.parm1;
1209 UT nchunks = pr->u.p.parm2;
1210 KD_TRACE(100,
1211 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1212 gtid));
1213
1214 trip = pr->u.p.tc - 1;
1215
1216 if (traits_t<T>::type_size > 4) {
1217 // use lock for 8-byte induction variable.
1218 // TODO (optional): check presence and use 16-byte CAS
1219 kmp_lock_t *lck = pr->u.p.steal_lock;
1220 KMP_DEBUG_ASSERT(lck != NULL);
1221 if (pr->u.p.count < (UT)pr->u.p.ub) {
1222 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1223 __kmp_acquire_lock(lck, gtid);
1224 // try to get own chunk of iterations
1225 init = (pr->u.p.count)++;
1226 status = (init < (UT)pr->u.p.ub);
1227 __kmp_release_lock(lck, gtid);
1228 } else {
1229 status = 0; // no own chunks
1230 }
1231 if (!status) { // try to steal
1232 kmp_lock_t *lckv; // victim buffer's lock
1233 T while_limit = pr->u.p.parm3;
1234 T while_index = 0;
1235 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1236 __kmp_dispatch_num_buffers; // current loop index
1237 // note: victim thread can potentially execute another loop
1238 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1239 while ((!status) && (while_limit != ++while_index)) {
1240 dispatch_private_info_template<T> *v;
1241 T remaining;
1242 T victimId = pr->u.p.parm4;
1243 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1244 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1245 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1246 KMP_DEBUG_ASSERT(v);
1247 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1248 oldVictimId != victimId) {
1249 victimId = (victimId + 1) % nproc;
1250 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1251 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1252 KMP_DEBUG_ASSERT(v);
1253 }
1254 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1255 continue; // try once more (nproc attempts in total)
1256 }
1257 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1258 kmp_uint32 old = UNUSED;
1259 // try to steal whole range from inactive victim
1260 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1261 if (status) {
1262 // initialize self buffer with victim's whole range of chunks
1263 T id = victimId;
1264 T small_chunk, extras;
1265 small_chunk = nchunks / nproc; // chunks per thread
1266 extras = nchunks % nproc;
1267 init = id * small_chunk + (id < extras ? id : extras);
1268 __kmp_acquire_lock(lck, gtid);
1269 pr->u.p.count = init + 1; // exclude one we execute immediately
1270 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1271 __kmp_release_lock(lck, gtid);
1272 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1273 // no need to reinitialize other thread invariants: lb, st, etc.
1274#ifdef KMP_DEBUG
1275 {
1276 char *buff;
1277 // create format specifiers before the debug output
1278 buff = __kmp_str_format(
1279 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1280 "count:%%%s ub:%%%s\n",
1281 traits_t<UT>::spec, traits_t<T>::spec);
1282 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1283 __kmp_str_free(&buff);
1284 }
1285#endif
1286 // activate non-empty buffer and let others steal from us
1287 if (pr->u.p.count < (UT)pr->u.p.ub)
1288 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1289 break;
1290 }
1291 }
1292 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1293 v->u.p.count >= (UT)v->u.p.ub) {
1294 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1295 continue; // no chunks to steal, try next victim
1296 }
1297 lckv = v->u.p.steal_lock;
1298 KMP_ASSERT(lckv != NULL);
1299 __kmp_acquire_lock(lckv, gtid);
1300 limit = v->u.p.ub; // keep initial ub
1301 if (v->u.p.count >= limit) {
1302 __kmp_release_lock(lckv, gtid);
1303 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1304 continue; // no chunks to steal, try next victim
1305 }
1306
1307 // stealing succeeded, reduce victim's ub by 1/4 of the undone chunks
1308 // TODO: is this heuristic good enough?
1309 remaining = limit - v->u.p.count;
1310 if (remaining > 7) {
1311 // steal 1/4 of remaining
1312 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1313 init = (v->u.p.ub -= (remaining >> 2));
1314 } else {
1315 // steal 1 chunk of 1..7 remaining
1316 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1317 init = (v->u.p.ub -= 1);
1318 }
1319 __kmp_release_lock(lckv, gtid);
1320#ifdef KMP_DEBUG
1321 {
1322 char *buff;
1323 // create format specifiers before the debug output
1324 buff = __kmp_str_format(
1325 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1326 "count:%%%s ub:%%%s\n",
1327 traits_t<UT>::spec, traits_t<UT>::spec);
1328 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1329 __kmp_str_free(&buff);
1330 }
1331#endif
1332 KMP_DEBUG_ASSERT(init + 1 <= limit);
1333 pr->u.p.parm4 = victimId; // remember victim to steal from
1334 status = 1;
1335 // now update own count and ub with stolen range excluding init chunk
1336 __kmp_acquire_lock(lck, gtid);
1337 pr->u.p.count = init + 1;
1338 pr->u.p.ub = limit;
1339 __kmp_release_lock(lck, gtid);
1340 // activate non-empty buffer and let others steal from us
1341 if (init + 1 < limit)
1342 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1343 } // while (search for victim)
1344 } // if (try to find victim and steal)
1345 } else {
1346 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1347 // as all operations on pair (count, ub) must be done atomically
1348 typedef union {
1349 struct {
1350 UT count;
1351 T ub;
1352 } p;
1353 kmp_int64 b;
1354 } union_i4;
1355 union_i4 vold, vnew;
1356 if (pr->u.p.count < (UT)pr->u.p.ub) {
1357 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1358 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1359 vnew.b = vold.b;
1360 vnew.p.count++; // get chunk from head of self range
1361 while (!KMP_COMPARE_AND_STORE_REL64(
1362 (volatile kmp_int64 *)&pr->u.p.count,
1363 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1364 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1365 KMP_CPU_PAUSE();
1366 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1367 vnew.b = vold.b;
1368 vnew.p.count++;
1369 }
1370 init = vold.p.count;
1371 status = (init < (UT)vold.p.ub);
1372 } else {
1373 status = 0; // no own chunks
1374 }
1375 if (!status) { // try to steal
1376 T while_limit = pr->u.p.parm3;
1377 T while_index = 0;
1378 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1379 __kmp_dispatch_num_buffers; // current loop index
1380 // note: victim thread can potentially execute another loop
1381 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1382 while ((!status) && (while_limit != ++while_index)) {
1383 dispatch_private_info_template<T> *v;
1384 T remaining;
1385 T victimId = pr->u.p.parm4;
1386 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1387 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1388 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1389 KMP_DEBUG_ASSERT(v);
1390 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1391 oldVictimId != victimId) {
1392 victimId = (victimId + 1) % nproc;
1393 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1394 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1395 KMP_DEBUG_ASSERT(v);
1396 }
1397 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1398 continue; // try once more (nproc attempts in total)
1399 }
1400 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1401 kmp_uint32 old = UNUSED;
1402 // try to steal whole range from inactive victim
1403 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1404 if (status) {
1405 // initialize self buffer with victim's whole range of chunks
1406 T id = victimId;
1407 T small_chunk, extras;
1408 small_chunk = nchunks / nproc; // chunks per thread
1409 extras = nchunks % nproc;
1410 init = id * small_chunk + (id < extras ? id : extras);
1411 vnew.p.count = init + 1;
1412 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1413 // write pair (count, ub) at once atomically
1414#if KMP_ARCH_X86
1415 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1416#else
1417 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1418#endif
1419 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1420 // no need to initialize other thread invariants: lb, st, etc.
1421#ifdef KMP_DEBUG
1422 {
1423 char *buff;
1424 // create format specifiers before the debug output
1425 buff = __kmp_str_format(
1426 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1427 "count:%%%s ub:%%%s\n",
1428 traits_t<UT>::spec, traits_t<T>::spec);
1429 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1430 __kmp_str_free(&buff);
1431 }
1432#endif
1433 // activate non-empty buffer and let others steal from us
1434 if (pr->u.p.count < (UT)pr->u.p.ub)
1435 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1436 break;
1437 }
1438 }
1439 while (1) { // CAS loop with check if victim still has enough chunks
1440 // many threads may be stealing concurrently from same victim
1441 vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1442 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1443 vold.p.count >= (UT)vold.p.ub) {
1444 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1445 break; // no chunks to steal, try next victim
1446 }
1447 vnew.b = vold.b;
1448 remaining = vold.p.ub - vold.p.count;
1449 // try to steal 1/4 of remaining
1450 // TODO: is this heuristic good enough?
1451 if (remaining > 7) {
1452 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1453 } else {
1454 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1455 }
1456 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1457 if (KMP_COMPARE_AND_STORE_REL64(
1458 (volatile kmp_int64 *)&v->u.p.count,
1459 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1460 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1461 // stealing succeeded
1462#ifdef KMP_DEBUG
1463 {
1464 char *buff;
1465 // create format specifiers before the debug output
1466 buff = __kmp_str_format(
1467 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1468 "count:%%%s ub:%%%s\n",
1469 traits_t<T>::spec, traits_t<T>::spec);
1470 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1471 __kmp_str_free(&buff);
1472 }
1473#endif
1474 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1475 vold.p.ub - vnew.p.ub);
1476 status = 1;
1477 pr->u.p.parm4 = victimId; // keep victim id
1478 // now update own count and ub
1479 init = vnew.p.ub;
1480 vold.p.count = init + 1;
1481#if KMP_ARCH_X86
1482 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1483#else
1484 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1485#endif
1486 // activate non-empty buffer and let others steal from us
1487 if (vold.p.count < (UT)vold.p.ub)
1488 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1489 break;
1490 } // if (check CAS result)
1491 KMP_CPU_PAUSE(); // CAS failed, retry
1492 } // while (try to steal from particular victim)
1493 } // while (search for victim)
1494 } // if (try to find victim and steal)
1495 } // if (4-byte induction variable)
1496 if (!status) {
1497 *p_lb = 0;
1498 *p_ub = 0;
1499 if (p_st != NULL)
1500 *p_st = 0;
1501 } else {
1502 start = pr->u.p.lb;
1503 init *= chunk;
1504 limit = chunk + init - 1;
1505 incr = pr->u.p.st;
1506 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1507
1508 KMP_DEBUG_ASSERT(init <= trip);
1509 // keep track of done chunks for possible early exit from stealing
1510 // TODO: count executed chunks locally with rare update of shared location
1511 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1512 if ((last = (limit >= trip)) != 0)
1513 limit = trip;
1514 if (p_st != NULL)
1515 *p_st = incr;
1516
1517 if (incr == 1) {
1518 *p_lb = start + init;
1519 *p_ub = start + limit;
1520 } else {
1521 *p_lb = start + init * incr;
1522 *p_ub = start + limit * incr;
1523 }
1524 } // if
1525 break;
1526 } // case
1527#endif // KMP_STATIC_STEAL_ENABLED
1528 case kmp_sch_static_balanced: {
1529 KD_TRACE(
1530 10,
1531 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1532 gtid));
1533 /* check if thread has any iteration to do */
1534 if ((status = !pr->u.p.count) != 0) {
1535 pr->u.p.count = 1;
1536 *p_lb = pr->u.p.lb;
1537 *p_ub = pr->u.p.ub;
1538 last = (pr->u.p.parm1 != 0);
1539 if (p_st != NULL)
1540 *p_st = pr->u.p.st;
1541 } else { /* no iterations to do */
1542 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1543 }
1544 } // case
1545 break;
1546 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1547 merged here */
1548 case kmp_sch_static_chunked: {
1549 T parm1;
1550
1551 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1552 "kmp_sch_static_[affinity|chunked] case\n",
1553 gtid));
1554 parm1 = pr->u.p.parm1;
1555
1556 trip = pr->u.p.tc - 1;
1557 init = parm1 * (pr->u.p.count + tid);
1558
1559 if ((status = (init <= trip)) != 0) {
1560 start = pr->u.p.lb;
1561 incr = pr->u.p.st;
1562 limit = parm1 + init - 1;
1563
1564 if ((last = (limit >= trip)) != 0)
1565 limit = trip;
1566
1567 if (p_st != NULL)
1568 *p_st = incr;
1569
1570 pr->u.p.count += nproc;
1571
1572 if (incr == 1) {
1573 *p_lb = start + init;
1574 *p_ub = start + limit;
1575 } else {
1576 *p_lb = start + init * incr;
1577 *p_ub = start + limit * incr;
1578 }
1579
1580 if (pr->flags.ordered) {
1581 pr->u.p.ordered_lower = init;
1582 pr->u.p.ordered_upper = limit;
1583 } // if
1584 } // if
1585 } // case
1586 break;
1587
1588 case kmp_sch_dynamic_chunked: {
1589 UT chunk_number;
1590 UT chunk_size = pr->u.p.parm1;
1591 UT nchunks = pr->u.p.parm2;
1592
1593 KD_TRACE(
1594 100,
1595 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1596 gtid));
1597
1598 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1599 status = (chunk_number < nchunks);
1600 if (!status) {
1601 *p_lb = 0;
1602 *p_ub = 0;
1603 if (p_st != NULL)
1604 *p_st = 0;
1605 } else {
1606 init = chunk_size * chunk_number;
1607 trip = pr->u.p.tc - 1;
1608 start = pr->u.p.lb;
1609 incr = pr->u.p.st;
1610
1611 if ((last = (trip - init < (UT)chunk_size)))
1612 limit = trip;
1613 else
1614 limit = chunk_size + init - 1;
1615
1616 if (p_st != NULL)
1617 *p_st = incr;
1618
1619 if (incr == 1) {
1620 *p_lb = start + init;
1621 *p_ub = start + limit;
1622 } else {
1623 *p_lb = start + init * incr;
1624 *p_ub = start + limit * incr;
1625 }
1626
1627 if (pr->flags.ordered) {
1628 pr->u.p.ordered_lower = init;
1629 pr->u.p.ordered_upper = limit;
1630 } // if
1631 } // if
1632 } // case
1633 break;
1634
1635 case kmp_sch_guided_iterative_chunked: {
1636 T chunkspec = pr->u.p.parm1;
1637 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1638 "iterative case\n",
1639 gtid));
1640 trip = pr->u.p.tc;
1641 // Start atomic part of calculations
1642 while (1) {
1643 ST remaining; // signed, because can be < 0
1644 init = sh->u.s.iteration; // shared value
1645 remaining = trip - init;
1646 if (remaining <= 0) { // AC: need to compare with 0 first
1647 // nothing to do, don't try atomic op
1648 status = 0;
1649 break;
1650 }
1651 if ((T)remaining <
1652 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1653 // use dynamic-style schedule
1654 // atomically increment iterations, get old value
1655 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1656 (ST)chunkspec);
1657 remaining = trip - init;
1658 if (remaining <= 0) {
1659 status = 0; // all iterations got by other threads
1660 } else {
1661 // got some iterations to work on
1662 status = 1;
1663 if ((T)remaining > chunkspec) {
1664 limit = init + chunkspec - 1;
1665 } else {
1666 last = true; // the last chunk
1667 limit = init + remaining - 1;
1668 } // if
1669 } // if
1670 break;
1671 } // if
1672 limit = init + (UT)((double)remaining *
1673 *(double *)&pr->u.p.parm3); // divide by K*nproc
1674 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1675 (ST)init, (ST)limit)) {
1676 // CAS was successful, chunk obtained
1677 status = 1;
1678 --limit;
1679 break;
1680 } // if
1681 } // while
1682 if (status != 0) {
1683 start = pr->u.p.lb;
1684 incr = pr->u.p.st;
1685 if (p_st != NULL)
1686 *p_st = incr;
1687 *p_lb = start + init * incr;
1688 *p_ub = start + limit * incr;
1689 if (pr->flags.ordered) {
1690 pr->u.p.ordered_lower = init;
1691 pr->u.p.ordered_upper = limit;
1692 } // if
1693 } else {
1694 *p_lb = 0;
1695 *p_ub = 0;
1696 if (p_st != NULL)
1697 *p_st = 0;
1698 } // if
1699 } // case
1700 break;
1701
1702 case kmp_sch_guided_simd: {
1703 // same as iterative but curr-chunk adjusted to be multiple of given
1704 // chunk
1705 T chunk = pr->u.p.parm1;
1706 KD_TRACE(100,
1707 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1708 gtid));
1709 trip = pr->u.p.tc;
1710 // Start atomic part of calculations
1711 while (1) {
1712 ST remaining; // signed, because can be < 0
1713 init = sh->u.s.iteration; // shared value
1714 remaining = trip - init;
1715 if (remaining <= 0) { // AC: need to compare with 0 first
1716 status = 0; // nothing to do, don't try atomic op
1717 break;
1718 }
1719 KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1720 // compare with K*nproc*(chunk+1), K=2 by default
1721 if ((T)remaining < pr->u.p.parm2) {
1722 // use dynamic-style schedule
1723 // atomically increment iterations, get old value
1724 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1725 (ST)chunk);
1726 remaining = trip - init;
1727 if (remaining <= 0) {
1728 status = 0; // all iterations got by other threads
1729 } else {
1730 // got some iterations to work on
1731 status = 1;
1732 if ((T)remaining > chunk) {
1733 limit = init + chunk - 1;
1734 } else {
1735 last = true; // the last chunk
1736 limit = init + remaining - 1;
1737 } // if
1738 } // if
1739 break;
1740 } // if
1741 // divide by K*nproc
1742 UT span;
1743 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1744 &span);
1745 UT rem = span % chunk;
1746 if (rem) // adjust so that span%chunk == 0
1747 span += chunk - rem;
1748 limit = init + span;
1749 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1750 (ST)init, (ST)limit)) {
1751 // CAS was successful, chunk obtained
1752 status = 1;
1753 --limit;
1754 break;
1755 } // if
1756 } // while
1757 if (status != 0) {
1758 start = pr->u.p.lb;
1759 incr = pr->u.p.st;
1760 if (p_st != NULL)
1761 *p_st = incr;
1762 *p_lb = start + init * incr;
1763 *p_ub = start + limit * incr;
1764 if (pr->flags.ordered) {
1765 pr->u.p.ordered_lower = init;
1766 pr->u.p.ordered_upper = limit;
1767 } // if
1768 } else {
1769 *p_lb = 0;
1770 *p_ub = 0;
1771 if (p_st != NULL)
1772 *p_st = 0;
1773 } // if
1774 } // case
1775 break;
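  // Editor's note: the only difference from the iterative case above is that
  // the geometric span is rounded up to a multiple of the chunk (the SIMD
  // width), so every grab except possibly the last stays chunk-aligned.
  // Illustrative arithmetic with chunk = 8 and remaining * parm3 = 27:
  //
  //   span = 27; rem = 27 % 8 = 3; span += 8 - 3;   // span -> 32
  //   limit = init + span;                          // CAS target
  //   // after the CAS succeeds, --limit hands the caller [init, init + 31]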
1776
1777 case kmp_sch_guided_analytical_chunked: {
1778 T chunkspec = pr->u.p.parm1;
1779 UT chunkIdx;
1780#if KMP_USE_X87CONTROL
1781 /* for storing original FPCW value for Windows* OS on
1782 IA-32 architecture 8-byte version */
1783 unsigned int oldFpcw;
1784 unsigned int fpcwSet = 0;
1785#endif
1786 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1787 "kmp_sch_guided_analytical_chunked case\n",
1788 gtid));
1789
1790 trip = pr->u.p.tc;
1791
1792 KMP_DEBUG_ASSERT(nproc > 1);
1793 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1794
1795 while (1) { /* this while loop is a safeguard against unexpected zero
1796 chunk sizes */
1797 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1798 if (chunkIdx >= (UT)pr->u.p.parm2) {
1799 --trip;
1800 /* use dynamic-style scheduling */
1801 init = chunkIdx * chunkspec + pr->u.p.count;
1802 /* need to verify init > 0 in case of overflow in the above
1803 * calculation */
1804 if ((status = (init > 0 && init <= trip)) != 0) {
1805 limit = init + chunkspec - 1;
1806
1807 if ((last = (limit >= trip)) != 0)
1808 limit = trip;
1809 }
1810 break;
1811 } else {
1812/* use exponential-style scheduling */
1813/* The following check is to work around the lack of long double precision on
1814 Windows* OS.
1815 This check works around the possible effect that init != 0 for chunkIdx == 0.
1816 */
1817#if KMP_USE_X87CONTROL
1818 /* If we haven't already done so, save original
1819 FPCW and set precision to 64-bit, as Windows* OS
1820 on IA-32 architecture defaults to 53-bit */
1821 if (!fpcwSet) {
1822 oldFpcw = _control87(0, 0);
1823 _control87(_PC_64, _MCW_PC);
1824 fpcwSet = 0x30000;
1825 }
1826#endif
1827 if (chunkIdx) {
1828 init = __kmp_dispatch_guided_remaining<T>(
1829 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1830 KMP_DEBUG_ASSERT(init);
1831 init = trip - init;
1832 } else
1833 init = 0;
1834 limit = trip - __kmp_dispatch_guided_remaining<T>(
1835 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1836 KMP_ASSERT(init <= limit);
1837 if (init < limit) {
1838 KMP_DEBUG_ASSERT(limit <= trip);
1839 --limit;
1840 status = 1;
1841 break;
1842 } // if
1843 } // if
1844 } // while (1)
1845#if KMP_USE_X87CONTROL
1846 /* restore FPCW if necessary
1847 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1848 */
1849 if (fpcwSet && (oldFpcw & fpcwSet))
1850 _control87(oldFpcw, _MCW_PC);
1851#endif
1852 if (status != 0) {
1853 start = pr->u.p.lb;
1854 incr = pr->u.p.st;
1855 if (p_st != NULL)
1856 *p_st = incr;
1857 *p_lb = start + init * incr;
1858 *p_ub = start + limit * incr;
1859 if (pr->flags.ordered) {
1860 pr->u.p.ordered_lower = init;
1861 pr->u.p.ordered_upper = limit;
1862 }
1863 } else {
1864 *p_lb = 0;
1865 *p_ub = 0;
1866 if (p_st != NULL)
1867 *p_st = 0;
1868 }
1869 } // case
1870 break;
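  // Editor's sketch of the exponential phase above, written in terms of a
  // hypothetical remaining(k) = iterations still unassigned after k chunks
  // (what __kmp_dispatch_guided_remaining computes from parm3). Chunk k covers
  //
  //   init  = trip - remaining(k)          // 0 for k == 0
  //   limit = trip - remaining(k + 1) - 1
  //
  // and the while (1) loop simply retries with the next chunkIdx whenever that
  // range is empty (the safeguard against unexpected zero chunk sizes noted
  // above).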
1871
1872 case kmp_sch_trapezoidal: {
1873 UT index;
1874 T parm2 = pr->u.p.parm2;
1875 T parm3 = pr->u.p.parm3;
1876 T parm4 = pr->u.p.parm4;
1877 KD_TRACE(100,
1878 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1879 gtid));
1880
1881 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1882
1883 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1884 trip = pr->u.p.tc - 1;
1885
1886 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1887 *p_lb = 0;
1888 *p_ub = 0;
1889 if (p_st != NULL)
1890 *p_st = 0;
1891 } else {
1892 start = pr->u.p.lb;
1893 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1894 incr = pr->u.p.st;
1895
1896 if ((last = (limit >= trip)) != 0)
1897 limit = trip;
1898
1899 if (p_st != NULL)
1900 *p_st = incr;
1901
1902 if (incr == 1) {
1903 *p_lb = start + init;
1904 *p_ub = start + limit;
1905 } else {
1906 *p_lb = start + init * incr;
1907 *p_ub = start + limit * incr;
1908 }
1909
1910 if (pr->flags.ordered) {
1911 pr->u.p.ordered_lower = init;
1912 pr->u.p.ordered_upper = limit;
1913 } // if
1914 } // if
1915 } // case
1916 break;
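  // Editor's note: worked example of the arithmetic-series bookkeeping above.
  // Chunk sizes start at parm2 and shrink by parm4 per chunk, so chunk `index`
  // starts at the prefix sum of the sizes before it:
  //
  //   init(index)  = index * (2 * parm2 - (index - 1) * parm4) / 2
  //   limit(index) = init(index + 1) - 1
  //
  // e.g. with parm2 = 10 and parm4 = 2 the chunk sizes are 10, 8, 6, ... and
  //   index 0 -> [0, 9], index 1 -> [10, 17], index 2 -> [18, 23], ...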
1917 default: {
1918 status = 0; // to avoid complaints on uninitialized variable use
1919 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1920 KMP_HNT(GetNewerLibrary), // Hint
1921 __kmp_msg_null // Variadic argument list terminator
1922 );
1923 } break;
1924 } // switch
1925 if (p_last)
1926 *p_last = last;
1927#ifdef KMP_DEBUG
1928 if (pr->flags.ordered) {
1929 char *buff;
1930 // create format specifiers before the debug output
1931 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1932 "ordered_lower:%%%s ordered_upper:%%%s\n",
1933 traits_t<UT>::spec, traits_t<UT>::spec);
1934 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1935 __kmp_str_free(&buff);
1936 }
1937 {
1938 char *buff;
1939 // create format specifiers before the debug output
1940 buff = __kmp_str_format(
1941 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1942 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1943 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1944 KMP_DEBUG_ASSERT(p_last);
1945 KMP_DEBUG_ASSERT(p_st);
1946 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1947 __kmp_str_free(&buff);
1948 }
1949#endif
1950 return status;
1951}
1952
1953/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1954 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1955 is not called. */
1956#if OMPT_SUPPORT && OMPT_OPTIONAL
1957#define OMPT_LOOP_END \
1958 if (status == 0) { \
1959 if (ompt_enabled.ompt_callback_work) { \
1960 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1961 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1962 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1963 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1964 &(task_info->task_data), 0, codeptr); \
1965 } \
1966 }
1967#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
1968 if (ompt_enabled.ompt_callback_dispatch && status) { \
1969 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1970 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1971 ompt_dispatch_chunk_t chunk; \
1972 ompt_data_t instance = ompt_data_none; \
1973 OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
1974 instance.ptr = &chunk; \
1975 ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
1976 &(team_info->parallel_data), &(task_info->task_data), \
1977 ompt_dispatch_ws_loop_chunk, instance); \
1978 }
1979// TODO: implement count
1980#else
1981#define OMPT_LOOP_END // no-op
1982#define OMPT_LOOP_DISPATCH // no-op
1983#endif
1984
1985#if KMP_STATS_ENABLED
1986#define KMP_STATS_LOOP_END \
1987 { \
1988 kmp_int64 u, l, t, i; \
1989 l = (kmp_int64)(*p_lb); \
1990 u = (kmp_int64)(*p_ub); \
1991 i = (kmp_int64)(pr->u.p.st); \
1992 if (status == 0) { \
1993 t = 0; \
1994 KMP_POP_PARTITIONED_TIMER(); \
1995 } else if (i == 1) { \
1996 if (u >= l) \
1997 t = u - l + 1; \
1998 else \
1999 t = 0; \
2000 } else if (i < 0) { \
2001 if (l >= u) \
2002 t = (l - u) / (-i) + 1; \
2003 else \
2004 t = 0; \
2005 } else { \
2006 if (u >= l) \
2007 t = (u - l) / i + 1; \
2008 else \
2009 t = 0; \
2010 } \
2011 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
2012 }
2013#else
2014#define KMP_STATS_LOOP_END /* Nothing */
2015#endif
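// Editor's sketch: the trip-count arithmetic in KMP_STATS_LOOP_END, restated
// as a hypothetical standalone helper over plain 64-bit integers.
//
//   static inline kmp_int64 example_chunk_trip_count(kmp_int64 l, kmp_int64 u,
//                                                    kmp_int64 i) {
//     if (i == 1)
//       return (u >= l) ? u - l + 1 : 0;
//     if (i < 0)
//       return (l >= u) ? (l - u) / (-i) + 1 : 0;
//     return (u >= l) ? (u - l) / i + 1 : 0;
//   }
//
// e.g. l = 0, u = 9, i = 3 counts iterations 0, 3, 6, 9: (9 - 0) / 3 + 1 == 4.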
2016
2017template <typename T>
2018static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2019 T *p_lb, T *p_ub,
2020 typename traits_t<T>::signed_t *p_st
2021#if OMPT_SUPPORT && OMPT_OPTIONAL
2022 ,
2023 void *codeptr
2024#endif
2025) {
2026
2027 typedef typename traits_t<T>::unsigned_t UT;
2028 typedef typename traits_t<T>::signed_t ST;
2029 // This is potentially slightly misleading: schedule(runtime) will appear here
2030 // even if the actual runtime schedule is static. (This points out a
2031 // disadvantage of schedule(runtime): even when static scheduling is used, it
2032 // costs more than a compile-time choice of static scheduling would.)
2033 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2034
2035 int status;
2036 dispatch_private_info_template<T> *pr;
2037 __kmp_assert_valid_gtid(gtid);
2038 kmp_info_t *th = __kmp_threads[gtid];
2039 kmp_team_t *team = th->th.th_team;
2040
2041 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2042 KD_TRACE(
2043 1000,
2044 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2045 gtid, p_lb, p_ub, p_st, p_last));
2046
2047 if (team->t.t_serialized) {
2048 /* NOTE: serialize this dispatch because we are not at the active level */
2049 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2050 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2051 KMP_DEBUG_ASSERT(pr);
2052
2053 if ((status = (pr->u.p.tc != 0)) == 0) {
2054 *p_lb = 0;
2055 *p_ub = 0;
2056 // if ( p_last != NULL )
2057 // *p_last = 0;
2058 if (p_st != NULL)
2059 *p_st = 0;
2060 if (__kmp_env_consistency_check) {
2061 if (pr->pushed_ws != ct_none) {
2062 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2063 }
2064 }
2065 } else if (pr->flags.nomerge) {
2066 kmp_int32 last;
2067 T start;
2068 UT limit, trip, init;
2069 ST incr;
2070 T chunk = pr->u.p.parm1;
2071
2072 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2073 gtid));
2074
2075 init = chunk * pr->u.p.count++;
2076 trip = pr->u.p.tc - 1;
2077
2078 if ((status = (init <= trip)) == 0) {
2079 *p_lb = 0;
2080 *p_ub = 0;
2081 // if ( p_last != NULL )
2082 // *p_last = 0;
2083 if (p_st != NULL)
2084 *p_st = 0;
2085 if (__kmp_env_consistency_check) {
2086 if (pr->pushed_ws != ct_none) {
2087 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2088 }
2089 }
2090 } else {
2091 start = pr->u.p.lb;
2092 limit = chunk + init - 1;
2093 incr = pr->u.p.st;
2094
2095 if ((last = (limit >= trip)) != 0) {
2096 limit = trip;
2097#if KMP_OS_WINDOWS
2098 pr->u.p.last_upper = pr->u.p.ub;
2099#endif /* KMP_OS_WINDOWS */
2100 }
2101 if (p_last != NULL)
2102 *p_last = last;
2103 if (p_st != NULL)
2104 *p_st = incr;
2105 if (incr == 1) {
2106 *p_lb = start + init;
2107 *p_ub = start + limit;
2108 } else {
2109 *p_lb = start + init * incr;
2110 *p_ub = start + limit * incr;
2111 }
2112
2113 if (pr->flags.ordered) {
2114 pr->u.p.ordered_lower = init;
2115 pr->u.p.ordered_upper = limit;
2116#ifdef KMP_DEBUG
2117 {
2118 char *buff;
2119 // create format specifiers before the debug output
2120 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2121 "ordered_lower:%%%s ordered_upper:%%%s\n",
2122 traits_t<UT>::spec, traits_t<UT>::spec);
2123 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2124 pr->u.p.ordered_upper));
2125 __kmp_str_free(&buff);
2126 }
2127#endif
2128 } // if
2129 } // if
2130 } else {
2131 pr->u.p.tc = 0;
2132 *p_lb = pr->u.p.lb;
2133 *p_ub = pr->u.p.ub;
2134#if KMP_OS_WINDOWS
2135 pr->u.p.last_upper = *p_ub;
2136#endif /* KMP_OS_WINDOWS */
2137 if (p_last != NULL)
2138 *p_last = TRUE;
2139 if (p_st != NULL)
2140 *p_st = pr->u.p.st;
2141 } // if
2142#ifdef KMP_DEBUG
2143 {
2144 char *buff;
2145 // create format specifiers before the debug output
2146 buff = __kmp_str_format(
2147 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2148 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2149 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2150 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2151 (p_last ? *p_last : 0), status));
2152 __kmp_str_free(&buff);
2153 }
2154#endif
2155#if INCLUDE_SSC_MARKS
2156 SSC_MARK_DISPATCH_NEXT();
2157#endif
2158 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2159 OMPT_LOOP_END;
2160 KMP_STATS_LOOP_END;
2161 return status;
2162 } else {
2163 kmp_int32 last = 0;
2164 dispatch_shared_info_template<T> volatile *sh;
2165
2166 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2167 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2168
2169 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2170 th->th.th_dispatch->th_dispatch_pr_current);
2171 KMP_DEBUG_ASSERT(pr);
2172 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2173 th->th.th_dispatch->th_dispatch_sh_current);
2174 KMP_DEBUG_ASSERT(sh);
2175
2176#if KMP_USE_HIER_SCHED
2177 if (pr->flags.use_hier)
2178 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2179 else
2180#endif // KMP_USE_HIER_SCHED
2181 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2182 p_st, th->th.th_team_nproc,
2183 th->th.th_info.ds.ds_tid);
2184 // status == 0: no more iterations to execute
2185 if (status == 0) {
2186 ST num_done;
2187 num_done = test_then_inc<ST>(&sh->u.s.num_done);
2188#ifdef KMP_DEBUG
2189 {
2190 char *buff;
2191 // create format specifiers before the debug output
2192 buff = __kmp_str_format(
2193 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2194 traits_t<ST>::spec);
2195 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2196 __kmp_str_free(&buff);
2197 }
2198#endif
2199
2200#if KMP_USE_HIER_SCHED
2201 pr->flags.use_hier = FALSE;
2202#endif
2203 if (num_done == th->th.th_team_nproc - 1) {
2204#if KMP_STATIC_STEAL_ENABLED
2205 if (pr->schedule == kmp_sch_static_steal) {
2206 int i;
2207 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2208 __kmp_dispatch_num_buffers; // current loop index
2209 // loop complete, safe to destroy locks used for stealing
2210 for (i = 0; i < th->th.th_team_nproc; ++i) {
2211 dispatch_private_info_template<T> *buf =
2212 reinterpret_cast<dispatch_private_info_template<T> *>(
2213 &team->t.t_dispatch[i].th_disp_buffer[idx]);
2214 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2215 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2216 if (traits_t<T>::type_size > 4) {
2217 // destroy locks used for stealing
2218 kmp_lock_t *lck = buf->u.p.steal_lock;
2219 KMP_ASSERT(lck != NULL);
2220 __kmp_destroy_lock(lck);
2221 __kmp_free(lck);
2222 buf->u.p.steal_lock = NULL;
2223 }
2224 }
2225 }
2226#endif
2227 /* NOTE: release shared buffer to be reused */
2228
2229 KMP_MB(); /* Flush all pending memory write invalidates. */
2230
2231 sh->u.s.num_done = 0;
2232 sh->u.s.iteration = 0;
2233
2234 /* TODO replace with general release procedure? */
2235 if (pr->flags.ordered) {
2236 sh->u.s.ordered_iteration = 0;
2237 }
2238
2239 sh->buffer_index += __kmp_dispatch_num_buffers;
2240 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2241 gtid, sh->buffer_index));
2242
2243 KMP_MB(); /* Flush all pending memory write invalidates. */
2244
2245 } // if
2246 if (__kmp_env_consistency_check) {
2247 if (pr->pushed_ws != ct_none) {
2248 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2249 }
2250 }
2251
2252 th->th.th_dispatch->th_deo_fcn = NULL;
2253 th->th.th_dispatch->th_dxo_fcn = NULL;
2254 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2255 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2256 } // if (status == 0)
2257#if KMP_OS_WINDOWS
2258 else if (last) {
2259 pr->u.p.last_upper = pr->u.p.ub;
2260 }
2261#endif /* KMP_OS_WINDOWS */
2262 if (p_last != NULL && status != 0)
2263 *p_last = last;
2264 } // if
2265
2266#ifdef KMP_DEBUG
2267 {
2268 char *buff;
2269 // create format specifiers before the debug output
2270 buff = __kmp_str_format(
2271 "__kmp_dispatch_next: T#%%d normal case: "
2272 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2273 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2274 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2275 (p_last ? *p_last : 0), status));
2276 __kmp_str_free(&buff);
2277 }
2278#endif
2279#if INCLUDE_SSC_MARKS
2280 SSC_MARK_DISPATCH_NEXT();
2281#endif
2282 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2283 OMPT_LOOP_END;
2284 KMP_STATS_LOOP_END;
2285 return status;
2286}
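// Editor's note: how the buffer recycling above composes with the wait in
// __kmp_dispatch_init / __kmpc_sections_init. my_buffer_index is the thread's
// running count of dispatched worksharing constructs, each mapped to shared
// buffer (my_buffer_index % __kmp_dispatch_num_buffers); the last thread to
// drain a construct bumps that buffer's buffer_index by
// __kmp_dispatch_num_buffers, which is precisely the value the next construct
// reusing the same physical buffer spins on before it may start. Illustrative
// numbers: with 7 buffers, construct 9 reuses buffer 2 and waits until
// construct 2 has been fully released (buffer_index 2 -> 9).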
2287
2303kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2304
2305 int active;
2306 kmp_info_t *th;
2307 kmp_team_t *team;
2308 kmp_uint32 my_buffer_index;
2309 dispatch_shared_info_template<kmp_int32> volatile *sh;
2310
2311 KMP_DEBUG_ASSERT(__kmp_init_serial);
2312
2313 if (!TCR_4(__kmp_init_parallel))
2314 __kmp_parallel_initialize();
2315 __kmp_resume_if_soft_paused();
2316
2317 /* setup data */
2318 th = __kmp_threads[gtid];
2319 team = th->th.th_team;
2320 active = !team->t.t_serialized;
2321 th->th.th_ident = loc;
2322
2323 KMP_COUNT_BLOCK(OMP_SECTIONS);
2324 KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2325
2326 if (active) {
2327 // Set up sections in the same way as dynamically scheduled loops.
2328 // We need one piece of shared data: which section is to be executed next.
2329 // (If the parallel region is not active, all sections are executed on the
2330 // same thread.)
2331 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2332 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2333
2334 my_buffer_index = th->th.th_dispatch->th_disp_index++;
2335
2336 // reuse shared data structures from dynamic sched loops:
2337 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2338 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2339 KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2340 my_buffer_index));
2341
2342 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2343 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2344
2345 KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2346 "sh->buffer_index:%d\n",
2347 gtid, my_buffer_index, sh->buffer_index));
2348 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2349 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2350 // Note: KMP_WAIT() cannot be used there: buffer index and
2351 // my_buffer_index are *always* 32-bit integers.
2352 KMP_MB();
2353 KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2354 "sh->buffer_index:%d\n",
2355 gtid, my_buffer_index, sh->buffer_index));
2356
2357 th->th.th_dispatch->th_dispatch_pr_current =
2358 nullptr; // sections construct doesn't need private data
2359 th->th.th_dispatch->th_dispatch_sh_current =
2360 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2361 }
2362
2363#if OMPT_SUPPORT && OMPT_OPTIONAL
2364 if (ompt_enabled.ompt_callback_work) {
2365 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2366 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2367 ompt_callbacks.ompt_callback(ompt_callback_work)(
2368 ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2369 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2370 }
2371#endif
2372 KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2373
2374 return active;
2375}
2376
2387kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2388 kmp_int32 numberOfSections) {
2389
2390 KMP_TIME_PARTITIONED_BLOCK(OMP_sections);
2391
2392 kmp_info_t *th = __kmp_threads[gtid];
2393#ifdef KMP_DEBUG
2394 kmp_team_t *team = th->th.th_team;
2395#endif
2396
2397 KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2398 numberOfSections));
2399
2400 // For the serialized case we should not call this function:
2401 KMP_DEBUG_ASSERT(!team->t.t_serialized);
2402
2403 dispatch_shared_info_template<kmp_int32> volatile *sh;
2404
2405 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2406 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2407
2408 KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2409 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2410 th->th.th_dispatch->th_dispatch_sh_current);
2411 KMP_DEBUG_ASSERT(sh);
2412
2413 kmp_int32 sectionIndex = 0;
2414 bool moreSectionsToExecute = true;
2415
2416 // Find section to execute:
2417 sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2418 if (sectionIndex >= numberOfSections) {
2419 moreSectionsToExecute = false;
2420 }
2421
2422 // status == 0: no more sections to execute;
2423 // OMPTODO: __kmpc_end_sections could be bypassed?
2424 if (!moreSectionsToExecute) {
2425 kmp_int32 num_done;
2426
2427 num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2428
2429 if (num_done == th->th.th_team_nproc - 1) {
2430 /* NOTE: release this buffer to be reused */
2431
2432 KMP_MB(); /* Flush all pending memory write invalidates. */
2433
2434 sh->u.s.num_done = 0;
2435 sh->u.s.iteration = 0;
2436
2437 KMP_MB(); /* Flush all pending memory write invalidates. */
2438
2439 sh->buffer_index += __kmp_dispatch_num_buffers;
2440 KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2441 sh->buffer_index));
2442
2443 KMP_MB(); /* Flush all pending memory write invalidates. */
2444
2445 } // if
2446
2447 th->th.th_dispatch->th_deo_fcn = NULL;
2448 th->th.th_dispatch->th_dxo_fcn = NULL;
2449 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2450 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2451
2452#if OMPT_SUPPORT && OMPT_OPTIONAL
2453 if (ompt_enabled.ompt_callback_dispatch) {
2454 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2455 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2456 ompt_data_t instance = ompt_data_none;
2457 instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2458 ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2459 &(team_info->parallel_data), &(task_info->task_data),
2460 ompt_dispatch_section, instance);
2461 }
2462#endif
2463 KMP_POP_PARTITIONED_TIMER();
2464 }
2465
2466 return sectionIndex;
2467}
2468
2477void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2478
2479 kmp_info_t *th = __kmp_threads[gtid];
2480 int active = !th->th.th_team->t.t_serialized;
2481
2482 KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2483
2484 if (!active) {
2485 // In the active case, finalization is done in __kmpc_next_section
2486#if OMPT_SUPPORT && OMPT_OPTIONAL
2487 if (ompt_enabled.ompt_callback_work) {
2488 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2489 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2490 ompt_callbacks.ompt_callback(ompt_callback_work)(
2491 ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2492 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2493 }
2494#endif
2495 KMP_POP_PARTITIONED_TIMER();
2496 }
2497
2498 KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2499}
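// Editor's sketch (not compiled into the runtime; an assumption about usage,
// not the compiler's actual lowering): one way a caller could drive the three
// sections entry points above in the active (parallel) case. Threads claim
// section indices from the shared counter until it runs past the section
// count; the buffer release happens inside __kmpc_next_section.
#if 0
static void example_sections(ident_t *loc, kmp_int32 gtid, kmp_int32 n,
                             void (*run_section)(kmp_int32)) {
  if (__kmpc_sections_init(loc, gtid)) {
    kmp_int32 idx;
    while ((idx = __kmpc_next_section(loc, gtid, n)) < n)
      run_section(idx); // execute the section this thread just claimed
  }
  __kmpc_end_sections(loc, gtid);
}
#endif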
2500
2501template <typename T>
2502static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2503 kmp_int32 *plastiter, T *plower, T *pupper,
2504 typename traits_t<T>::signed_t incr) {
2505 typedef typename traits_t<T>::unsigned_t UT;
2506 kmp_uint32 team_id;
2507 kmp_uint32 nteams;
2508 UT trip_count;
2509 kmp_team_t *team;
2510 kmp_info_t *th;
2511
2512 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2513 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2514#ifdef KMP_DEBUG
2515 typedef typename traits_t<T>::signed_t ST;
2516 {
2517 char *buff;
2518 // create format specifiers before the debug output
2519 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2520 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2521 traits_t<T>::spec, traits_t<T>::spec,
2522 traits_t<ST>::spec, traits_t<T>::spec);
2523 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2524 __kmp_str_free(&buff);
2525 }
2526#endif
2527
2528 if (__kmp_env_consistency_check) {
2529 if (incr == 0) {
2530 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2531 loc);
2532 }
2533 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2534 // The loop is illegal.
2535 // Some zero-trip loops maintained by compiler, e.g.:
2536 // for(i=10;i<0;++i) // lower >= upper - run-time check
2537 // for(i=0;i>10;--i) // lower <= upper - run-time check
2538 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2539 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2540 // Compiler does not check the following illegal loops:
2541 // for(i=0;i<10;i+=incr) // where incr<0
2542 // for(i=10;i>0;i-=incr) // where incr<0
2543 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2544 }
2545 }
2546 __kmp_assert_valid_gtid(gtid);
2547 th = __kmp_threads[gtid];
2548 team = th->th.th_team;
2549 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2550 nteams = th->th.th_teams_size.nteams;
2551 team_id = team->t.t_master_tid;
2552 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2553
2554 // compute global trip count
2555 if (incr == 1) {
2556 trip_count = *pupper - *plower + 1;
2557 } else if (incr == -1) {
2558 trip_count = *plower - *pupper + 1;
2559 } else if (incr > 0) {
2560 // upper-lower can exceed the limit of signed type
2561 trip_count = (UT)(*pupper - *plower) / incr + 1;
2562 } else {
2563 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2564 }
2565
2566 if (trip_count <= nteams) {
2567 KMP_DEBUG_ASSERT(
2568 __kmp_static == kmp_sch_static_greedy ||
2569 __kmp_static ==
2570 kmp_sch_static_balanced); // Unknown static scheduling type.
2571 // only some teams get single iteration, others get nothing
2572 if (team_id < trip_count) {
2573 *pupper = *plower = *plower + team_id * incr;
2574 } else {
2575 *plower = *pupper + incr; // zero-trip loop
2576 }
2577 if (plastiter != NULL)
2578 *plastiter = (team_id == trip_count - 1);
2579 } else {
2580 if (__kmp_static == kmp_sch_static_balanced) {
2581 UT chunk = trip_count / nteams;
2582 UT extras = trip_count % nteams;
2583 *plower +=
2584 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2585 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2586 if (plastiter != NULL)
2587 *plastiter = (team_id == nteams - 1);
2588 } else {
2589 T chunk_inc_count =
2590 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2591 T upper = *pupper;
2592 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2593 // Unknown static scheduling type.
2594 *plower += team_id * chunk_inc_count;
2595 *pupper = *plower + chunk_inc_count - incr;
2596 // Check/correct bounds if needed
2597 if (incr > 0) {
2598 if (*pupper < *plower)
2599 *pupper = traits_t<T>::max_value;
2600 if (plastiter != NULL)
2601 *plastiter = *plower <= upper && *pupper > upper - incr;
2602 if (*pupper > upper)
2603 *pupper = upper; // tracker C73258
2604 } else {
2605 if (*pupper > *plower)
2606 *pupper = traits_t<T>::min_value;
2607 if (plastiter != NULL)
2608 *plastiter = *plower >= upper && *pupper < upper - incr;
2609 if (*pupper < upper)
2610 *pupper = upper; // tracker C73258
2611 }
2612 }
2613 }
2614}
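// Editor's note: worked example of the kmp_sch_static_balanced branch above
// (illustrative plain integers). With trip_count = 10, nteams = 4, incr = 1
// and *plower = 0: chunk = 2, extras = 2, so the teams receive
//   team 0 -> [0, 2], team 1 -> [3, 5], team 2 -> [6, 7], team 3 -> [8, 9]
// i.e. the first `extras` teams get one extra iteration each, and only the
// last team sets *plastiter.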
2615
2616//-----------------------------------------------------------------------------
2617// Dispatch routines
2618// Transfer call to template< type T >
2619// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2620// T lb, T ub, ST st, ST chunk )
2621extern "C" {
2622
2639void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2640 enum sched_type schedule, kmp_int32 lb,
2641 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2642 KMP_DEBUG_ASSERT(__kmp_init_serial);
2643#if OMPT_SUPPORT && OMPT_OPTIONAL
2644 OMPT_STORE_RETURN_ADDRESS(gtid);
2645#endif
2646 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2647}
2651void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2652 enum sched_type schedule, kmp_uint32 lb,
2653 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2654 KMP_DEBUG_ASSERT(__kmp_init_serial);
2655#if OMPT_SUPPORT && OMPT_OPTIONAL
2656 OMPT_STORE_RETURN_ADDRESS(gtid);
2657#endif
2658 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2659}
2660
2664void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2665 enum sched_type schedule, kmp_int64 lb,
2666 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2667 KMP_DEBUG_ASSERT(__kmp_init_serial);
2668#if OMPT_SUPPORT && OMPT_OPTIONAL
2669 OMPT_STORE_RETURN_ADDRESS(gtid);
2670#endif
2671 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2672}
2673
2677void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2678 enum sched_type schedule, kmp_uint64 lb,
2679 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2680 KMP_DEBUG_ASSERT(__kmp_init_serial);
2681#if OMPT_SUPPORT && OMPT_OPTIONAL
2682 OMPT_STORE_RETURN_ADDRESS(gtid);
2683#endif
2684 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2685}
2686
2696void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2697 enum sched_type schedule, kmp_int32 *p_last,
2698 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2699 kmp_int32 chunk) {
2700 KMP_DEBUG_ASSERT(__kmp_init_serial);
2701#if OMPT_SUPPORT && OMPT_OPTIONAL
2702 OMPT_STORE_RETURN_ADDRESS(gtid);
2703#endif
2704 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2705 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2706}
2707
2708void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2709 enum sched_type schedule, kmp_int32 *p_last,
2710 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2711 kmp_int32 chunk) {
2712 KMP_DEBUG_ASSERT(__kmp_init_serial);
2713#if OMPT_SUPPORT && OMPT_OPTIONAL
2714 OMPT_STORE_RETURN_ADDRESS(gtid);
2715#endif
2716 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2717 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2718}
2719
2720void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2721 enum sched_type schedule, kmp_int32 *p_last,
2722 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2723 kmp_int64 chunk) {
2724 KMP_DEBUG_ASSERT(__kmp_init_serial);
2725#if OMPT_SUPPORT && OMPT_OPTIONAL
2726 OMPT_STORE_RETURN_ADDRESS(gtid);
2727#endif
2728 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2729 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2730}
2731
2732void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2733 enum sched_type schedule, kmp_int32 *p_last,
2734 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2735 kmp_int64 chunk) {
2736 KMP_DEBUG_ASSERT(__kmp_init_serial);
2737#if OMPT_SUPPORT && OMPT_OPTIONAL
2738 OMPT_STORE_RETURN_ADDRESS(gtid);
2739#endif
2740 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2741 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2742}
2743
2757int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2758 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2759#if OMPT_SUPPORT && OMPT_OPTIONAL
2760 OMPT_STORE_RETURN_ADDRESS(gtid);
2761#endif
2762 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2763#if OMPT_SUPPORT && OMPT_OPTIONAL
2764 ,
2765 OMPT_LOAD_RETURN_ADDRESS(gtid)
2766#endif
2767 );
2768}
2769
2773int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2774 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2775 kmp_int32 *p_st) {
2776#if OMPT_SUPPORT && OMPT_OPTIONAL
2777 OMPT_STORE_RETURN_ADDRESS(gtid);
2778#endif
2779 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2780#if OMPT_SUPPORT && OMPT_OPTIONAL
2781 ,
2782 OMPT_LOAD_RETURN_ADDRESS(gtid)
2783#endif
2784 );
2785}
2786
2790int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2791 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2792#if OMPT_SUPPORT && OMPT_OPTIONAL
2793 OMPT_STORE_RETURN_ADDRESS(gtid);
2794#endif
2795 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2796#if OMPT_SUPPORT && OMPT_OPTIONAL
2797 ,
2798 OMPT_LOAD_RETURN_ADDRESS(gtid)
2799#endif
2800 );
2801}
2802
2806int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2807 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2808 kmp_int64 *p_st) {
2809#if OMPT_SUPPORT && OMPT_OPTIONAL
2810 OMPT_STORE_RETURN_ADDRESS(gtid);
2811#endif
2812 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2813#if OMPT_SUPPORT && OMPT_OPTIONAL
2814 ,
2815 OMPT_LOAD_RETURN_ADDRESS(gtid)
2816#endif
2817 );
2818}
2819
2826void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2827 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2828}
2829
2833void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2834 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2835}
2836
2840void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2841 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2842}
2843
2847void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2848 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2849}
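// Editor's sketch (not compiled into the runtime): the calling convention the
// init/next entry points above implement, roughly the shape a compiler lowers
// "#pragma omp for schedule(dynamic, chunk)" into. body() and the chunk size
// of 4 are illustrative assumptions; the __kmpc_dispatch_fini_* family is not
// shown here and comes into play for ordered loops.
#if 0
static void example_dynamic_for(ident_t *loc, kmp_int32 gtid, kmp_int32 n,
                                void (*body)(kmp_int32)) {
  kmp_int32 lb, ub, st, last;
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
                         /*chunk=*/4);
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i); // run the iterations of the chunk just obtained
  }
}
#endif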
2852//-----------------------------------------------------------------------------
2853// Non-template routines from kmp_dispatch.cpp used in other sources
2854
2855kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2856 return value == checker;
2857}
2858
2859kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2860 return value != checker;
2861}
2862
2863kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2864 return value < checker;
2865}
2866
2867kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2868 return value >= checker;
2869}
2870
2871kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2872 return value <= checker;
2873}
2874
2875kmp_uint32
2876__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2877 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2878 void *obj // Higher-level synchronization object, or NULL.
2879) {
2880 // note: we may not belong to a team at this point
2881 volatile kmp_uint32 *spin = spinner;
2882 kmp_uint32 check = checker;
2883 kmp_uint32 spins;
2884 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2885 kmp_uint32 r;
2886 kmp_uint64 time;
2887
2888 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2889 KMP_INIT_YIELD(spins);
2890 KMP_INIT_BACKOFF(time);
2891 // main wait spin loop
2892 while (!f(r = TCR_4(*spin), check)) {
2893 KMP_FSYNC_SPIN_PREPARE(obj);
2894 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2895 split. It causes problems with infinite recursion because of exit lock */
2896 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2897 __kmp_abort_thread(); */
2898 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2899 }
2900 KMP_FSYNC_SPIN_ACQUIRED(obj);
2901 return r;
2902}
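// Editor's sketch (illustrative only, not compiled): minimal use of the
// spin-wait helper above with one of the predicate helpers defined just
// above; blocks until the flag reaches the expected value, yielding or
// backing off per KMP_YIELD_OVERSUB_ELSE_SPIN.
#if 0
static void example_wait_for_flag(volatile kmp_uint32 *flag) {
  __kmp_wait_4(flag, 1, __kmp_ge_4, NULL); // returns once *flag >= 1
}
#endif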
2903
2904void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2905 kmp_uint32 (*pred)(void *, kmp_uint32),
2906 void *obj // Higher-level synchronization object, or NULL.
2907) {
2908 // note: we may not belong to a team at this point
2909 void *spin = spinner;
2910 kmp_uint32 check = checker;
2911 kmp_uint32 spins;
2912 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2913 kmp_uint64 time;
2914
2915 KMP_FSYNC_SPIN_INIT(obj, spin);
2916 KMP_INIT_YIELD(spins);
2917 KMP_INIT_BACKOFF(time);
2918 // main wait spin loop
2919 while (!f(spin, check)) {
2920 KMP_FSYNC_SPIN_PREPARE(obj);
2921 /* if we have waited a bit, or are oversubscribed, yield */
2922 /* pause is in the following code */
2923 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2924 }
2925 KMP_FSYNC_SPIN_ACQUIRED(obj);
2926}
2927
2928} // extern "C"
2929
2930#ifdef KMP_GOMP_COMPAT
2931
2932void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2933 enum sched_type schedule, kmp_int32 lb,
2934 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2935 int push_ws) {
2936 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2937 push_ws);
2938}
2939
2940void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2941 enum sched_type schedule, kmp_uint32 lb,
2942 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2943 int push_ws) {
2944 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2945 push_ws);
2946}
2947
2948void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2949 enum sched_type schedule, kmp_int64 lb,
2950 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2951 int push_ws) {
2952 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2953 push_ws);
2954}
2955
2956void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2957 enum sched_type schedule, kmp_uint64 lb,
2958 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2959 int push_ws) {
2960 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2961 push_ws);
2962}
2963
2964void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2965 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2966}
2967
2968void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2969 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2970}
2971
2972void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2973 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2974}
2975
2976void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2977 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2978}
2979
2980#endif /* KMP_GOMP_COMPAT */
2981
2982/* ------------------------------------------------------------------------ */