LLVM OpenMP* Runtime Library
Loading...
Searching...
No Matches
kmp_barrier.cpp
1/*
2 * kmp_barrier.cpp
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp_wait_release.h"
14#include "kmp_barrier.h"
15#include "kmp_itt.h"
16#include "kmp_os.h"
17#include "kmp_stats.h"
18#include "ompt-specific.h"
19// for distributed barrier
20#include "kmp_affinity.h"
21
22#if KMP_MIC
23#include <immintrin.h>
24#define USE_NGO_STORES 1
25#endif // KMP_MIC
26
27#if KMP_MIC && USE_NGO_STORES
28// ICV copying
// KMP_MIC variant: ngo_load() declares a local __m512d named Vt loaded from
// src. Both store macros write that Vt to dst and ignore their own src
// argument, so ngo_load() must have been expanded earlier in the same scope.
// ngo_sync() issues a locked add of 0 to the top of the stack with a "memory"
// clobber.
29#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33#else
// Fallback variant: ngo_load() and ngo_sync() expand to no-ops; ICVs are
// copied with copy_icvs() and go data with a CACHE_LINE-sized KMP_MEMCPY.
34#define ngo_load(src) ((void)0)
35#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37#define ngo_sync() ((void)0)
38#endif /* KMP_MIC && USE_NGO_STORES */
39
40void __kmp_print_structure(void); // Forward declaration
41
42// ---------------------------- Barrier Algorithms ----------------------------
43// Distributed barrier
44
45// Compute how many threads to have polling each cache-line.
46// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47void distributedBarrier::computeVarsForN(size_t n) {
48 int nsockets = 1;
49 if (__kmp_topology) {
50 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52 int ncores_per_socket =
53 __kmp_topology->calculate_ratio(core_level, socket_level);
54 nsockets = __kmp_topology->get_count(socket_level);
55
56 if (nsockets <= 0)
57 nsockets = 1;
58 if (ncores_per_socket <= 0)
59 ncores_per_socket = 1;
60
61 threads_per_go = ncores_per_socket >> 1;
62 if (!fix_threads_per_go) {
63 // Minimize num_gos
64 if (threads_per_go > 4) {
65 if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66 threads_per_go = threads_per_go >> 1;
67 }
68 if (threads_per_go > 4 && nsockets == 1)
69 threads_per_go = threads_per_go >> 1;
70 }
71 }
72 if (threads_per_go == 0)
73 threads_per_go = 1;
74 fix_threads_per_go = true;
75 num_gos = n / threads_per_go;
76 if (n % threads_per_go)
77 num_gos++;
78 if (nsockets == 1 || num_gos == 1)
79 num_groups = 1;
80 else {
81 num_groups = num_gos / nsockets;
82 if (num_gos % nsockets)
83 num_groups++;
84 }
85 if (num_groups <= 0)
86 num_groups = 1;
87 gos_per_group = num_gos / num_groups;
88 if (num_gos % num_groups)
89 gos_per_group++;
90 threads_per_group = threads_per_go * gos_per_group;
91 } else {
92 num_gos = n / threads_per_go;
93 if (n % threads_per_go)
94 num_gos++;
95 if (num_gos == 1)
96 num_groups = 1;
97 else {
98 num_groups = num_gos / 2;
99 if (num_gos % 2)
100 num_groups++;
101 }
102 gos_per_group = num_gos / num_groups;
103 if (num_gos % num_groups)
104 gos_per_group++;
105 threads_per_group = threads_per_go * gos_per_group;
106 }
107}
108
109void distributedBarrier::computeGo(size_t n) {
110 // Minimize num_gos
111 for (num_gos = 1;; num_gos++)
112 if (IDEAL_CONTENTION * num_gos >= n)
113 break;
114 threads_per_go = n / num_gos;
115 if (n % num_gos)
116 threads_per_go++;
117 while (num_gos > MAX_GOS) {
118 threads_per_go++;
119 num_gos = n / threads_per_go;
120 if (n % threads_per_go)
121 num_gos++;
122 }
123 computeVarsForN(n);
124}
125
126// This function is to resize the barrier arrays when the new number of threads
127// exceeds max_threads, which is the current size of all the arrays
128void distributedBarrier::resize(size_t nthr) {
129 KMP_DEBUG_ASSERT(nthr > max_threads);
130
131 // expand to requested size * 2
132 max_threads = nthr * 2;
133
134 // allocate arrays to new max threads
135 for (int i = 0; i < MAX_ITERS; ++i) {
136 if (flags[i])
137 flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138 max_threads * sizeof(flags_s));
139 else
140 flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141 }
142
143 if (go)
144 go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145 else
146 go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147
148 if (iter)
149 iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150 else
151 iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152
153 if (sleep)
154 sleep =
155 (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156 else
157 sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158}
159
160// This function is to set all the go flags that threads might be waiting
161// on, and when blocktime is not infinite, it should be followed by a wake-up
162// call to each thread
163kmp_uint64 distributedBarrier::go_release() {
164 kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165 for (size_t j = 0; j < num_gos; j++) {
166 go[j].go.store(next_go);
167 }
168 return next_go;
169}
170
171void distributedBarrier::go_reset() {
172 for (size_t j = 0; j < max_threads; ++j) {
173 for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174 flags[i][j].stillNeed = 1;
175 }
176 go[j].go.store(0);
177 iter[j].iter = 0;
178 }
179}
180
181// This function inits/re-inits the distributed barrier for a particular number
182// of threads. If a resize of arrays is needed, it calls the resize function.
183void distributedBarrier::init(size_t nthr) {
184 size_t old_max = max_threads;
185 if (nthr > max_threads) { // need more space in arrays
186 resize(nthr);
187 }
188
189 for (size_t i = 0; i < max_threads; i++) {
190 for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191 flags[j][i].stillNeed = 1;
192 }
193 go[i].go.store(0);
194 iter[i].iter = 0;
195 if (i >= old_max)
196 sleep[i].sleep = false;
197 }
198
199 // Recalculate num_gos, etc. based on new nthr
200 computeVarsForN(nthr);
201
202 num_threads = nthr;
203
204 if (team_icvs == NULL)
205 team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206}
207
208void distributedBarrier::deallocate(distributedBarrier *db) {
209 for (int i = 0; i < MAX_ITERS; ++i) {
210 if (db->flags[i])
211 KMP_INTERNAL_FREE(db->flags[i]);
212 db->flags[i] = NULL;
213 }
214 if (db->go) {
215 KMP_INTERNAL_FREE(db->go);
216 db->go = NULL;
217 }
218 if (db->iter) {
219 KMP_INTERNAL_FREE(db->iter);
220 db->iter = NULL;
221 }
222 if (db->sleep) {
223 KMP_INTERNAL_FREE(db->sleep);
224 db->sleep = NULL;
225 }
226 if (db->team_icvs) {
227 __kmp_free(db->team_icvs);
228 db->team_icvs = NULL;
229 }
230 KMP_ALIGNED_FREE(db);
231}
232
233// This function is used only when KMP_BLOCKTIME is not infinite.
234// static
235void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
236 size_t start, size_t stop, size_t inc,
237 size_t tid) {
238 KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
239 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
240 return;
241
242 kmp_info_t **other_threads = team->t.t_threads;
243 for (size_t thr = start; thr < stop; thr += inc) {
244 KMP_DEBUG_ASSERT(other_threads[thr]);
245 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
246 // Wake up worker regardless of if it appears to be sleeping or not
247 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
248 }
249}
250
// Gather (arrival) phase of the distributed barrier.
//
// Every thread re-arms its stillNeed flag for the next iteration slot and
// clears its flag for the current slot to announce arrival. A group leader
// (tid divisible by b->threads_per_group) does so only after it has polled the
// flags of the other threads in its group; afterwards it polls the flags of
// all group leaders. While polling, a leader executes tasks when tasking is
// enabled, and leaves the loop once __kmp_global.g.g_done is set.
//
// If reduce is non-NULL, each leader folds the reduce_data of its group
// members into its own, and the primary thread (KMP_MASTER_TID) then folds in
// the reduce_data of the other group leaders.
//
// bt is used for tracing only in this function; itt_sync_obj is forwarded to
// the task-execution call.
251static void __kmp_dist_barrier_gather(
252 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
253 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
254 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
255 kmp_team_t *team;
256 distributedBarrier *b;
257 kmp_info_t **other_threads;
258 kmp_uint64 my_current_iter, my_next_iter;
259 kmp_uint32 nproc;
260 bool group_leader;
261
262 team = this_thr->th.th_team;
263 nproc = this_thr->th.th_team_nproc;
264 other_threads = team->t.t_threads;
265 b = team->t.b;
266 my_current_iter = b->iter[tid].iter;
// Iteration slots wrap around at MAX_ITERS
267 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
268 group_leader = ((tid % b->threads_per_group) == 0);
269
270 KA_TRACE(20,
271 ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
272 gtid, team->t.t_id, tid, bt));
273
274#if USE_ITT_BUILD && USE_ITT_NOTIFY
275 // Barrier imbalance - save arrive time to the thread
276 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
277 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
278 __itt_get_timestamp();
279 }
280#endif
281
282 if (group_leader) {
283 // Start from the thread after the group leader
284 size_t group_start = tid + 1;
285 size_t group_end = tid + b->threads_per_group;
286 size_t threads_pending = 0;
287
// The last group may be partial: clamp to the team size
288 if (group_end > nproc)
289 group_end = nproc;
290 do { // wait for threads in my group
291 threads_pending = 0;
292 // Check all the flags every time to avoid branch mispredict
293 for (size_t thr = group_start; thr < group_end; thr++) {
294 // Each thread uses a different cache line
295 threads_pending += b->flags[my_current_iter][thr].stillNeed;
296 }
297 // Execute tasks here
298 if (__kmp_tasking_mode != tskm_immediate_exec) {
299 kmp_task_team_t *task_team = this_thr->th.th_task_team;
300 if (task_team != NULL) {
301 if (TCR_SYNC_4(task_team->tt.tt_active)) {
302 if (KMP_TASKING_ENABLED(task_team)) {
303 int tasks_completed = FALSE;
304 __kmp_atomic_execute_tasks_64(
305 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
306 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
307 } else
308 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
309 }
310 } else {
311 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
312 } // if
313 }
// g_done set: stop polling even if some threads are still pending
314 if (TCR_4(__kmp_global.g.g_done)) {
315 if (__kmp_global.g.g_abort)
316 __kmp_abort_thread();
317 break;
318 } else if (__kmp_tasking_mode != tskm_immediate_exec &&
319 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
320 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
321 }
322 } while (threads_pending > 0);
323
324 if (reduce) { // Perform reduction if needed
325 OMPT_REDUCTION_DECL(this_thr, gtid);
326 OMPT_REDUCTION_BEGIN;
327 // Group leader reduces all threads in group
328 for (size_t thr = group_start; thr < group_end; thr++) {
329 (*reduce)(this_thr->th.th_local.reduce_data,
330 other_threads[thr]->th.th_local.reduce_data);
331 }
332 OMPT_REDUCTION_END;
333 }
334
335 // Set flag for next iteration
336 b->flags[my_next_iter][tid].stillNeed = 1;
337 // Each thread uses a different cache line; resets stillNeed to 0 to
338 // indicate it has reached the barrier
339 b->flags[my_current_iter][tid].stillNeed = 0;
340
341 do { // wait for all group leaders
342 threads_pending = 0;
// Group leaders are the tids 0, threads_per_group, 2 * threads_per_group, ...
343 for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
344 threads_pending += b->flags[my_current_iter][thr].stillNeed;
345 }
346 // Execute tasks here
347 if (__kmp_tasking_mode != tskm_immediate_exec) {
348 kmp_task_team_t *task_team = this_thr->th.th_task_team;
349 if (task_team != NULL) {
350 if (TCR_SYNC_4(task_team->tt.tt_active)) {
351 if (KMP_TASKING_ENABLED(task_team)) {
352 int tasks_completed = FALSE;
353 __kmp_atomic_execute_tasks_64(
354 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
355 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
356 } else
357 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
358 }
359 } else {
360 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
361 } // if
362 }
363 if (TCR_4(__kmp_global.g.g_done)) {
364 if (__kmp_global.g.g_abort)
365 __kmp_abort_thread();
366 break;
367 } else if (__kmp_tasking_mode != tskm_immediate_exec &&
368 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
369 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
370 }
371 } while (threads_pending > 0);
372
373 if (reduce) { // Perform reduction if needed
374 if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
375 OMPT_REDUCTION_DECL(this_thr, gtid);
376 OMPT_REDUCTION_BEGIN;
// Starts at the second leader; the primary thread's own data is the target
377 for (size_t thr = b->threads_per_group; thr < nproc;
378 thr += b->threads_per_group) {
379 (*reduce)(this_thr->th.th_local.reduce_data,
380 other_threads[thr]->th.th_local.reduce_data);
381 }
382 OMPT_REDUCTION_END;
383 }
384 }
385 } else {
386 // Set flag for next iteration
387 b->flags[my_next_iter][tid].stillNeed = 1;
388 // Each thread uses a different cache line; resets stillNeed to 0 to
389 // indicate it has reached the barrier
390 b->flags[my_current_iter][tid].stillNeed = 0;
391 }
392
393 KMP_MFENCE();
394
395 KA_TRACE(20,
396 ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
397 gtid, team->t.t_id, tid, bt));
398}
399
// Release phase of the distributed barrier.
//
// Non-primary threads loop until th_used_in_team reads 1 after their go flag
// has been checked against next_go (current iter + MAX_ITERS). Inside the loop
// they wait on th_used_in_team while not in use by a team, re-read tid and
// team on every pass, and wait on the go flag when it does not yet hold
// next_go. A group leader then stores next_go into the remaining go flags of
// its group and, when blocktime is not KMP_MAX_BLOCKTIME, calls
// __kmp_dist_barrier_wakeup() for the other threads of its group.
//
// The primary thread stores next_go into each group leader's go flag, then
// into the remaining go flags of its own group, with wake-up calls after each
// step when blocktime is not KMP_MAX_BLOCKTIME.
//
// All threads finish by advancing their iter slot modulo MAX_ITERS. The
// function returns early, without advancing iter, whenever bt is
// bs_forkjoin_barrier and __kmp_global.g.g_done is set.
400static void __kmp_dist_barrier_release(
401 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
402 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
403 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
404 kmp_team_t *team;
405 distributedBarrier *b;
406 kmp_bstate_t *thr_bar;
407 kmp_uint64 my_current_iter, next_go;
408 size_t my_go_index;
409 bool group_leader;
410
411 KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
412 gtid, tid, bt));
413
414 thr_bar = &this_thr->th.th_bar[bt].bb;
415
416 if (!KMP_MASTER_TID(tid)) {
417 // workers and non-master group leaders need to check their presence in team
418 do {
419 if (this_thr->th.th_used_in_team.load() != 1 &&
420 this_thr->th.th_used_in_team.load() != 3) {
421 // Thread is not in use in a team. Wait on location in tid's thread
422 // struct. The 0 value tells anyone looking that this thread is spinning
423 // or sleeping until this location becomes 3 again; 3 is the transition
424 // state to get to 1 which is waiting on go and being in the team
425 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
// Wait if the 2 -> 0 transition succeeds here or the value is already 0
426 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
427 0) ||
428 this_thr->th.th_used_in_team.load() == 0) {
429 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
430 }
431#if USE_ITT_BUILD && USE_ITT_NOTIFY
432 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
433 // In fork barrier where we could not get the object reliably
434 itt_sync_obj =
435 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
436 // Cancel wait on previous parallel region...
437 __kmp_itt_task_starting(itt_sync_obj);
438
439 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
440 return;
441
442 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
443 if (itt_sync_obj != NULL)
444 // Call prepare as early as possible for "new" barrier
445 __kmp_itt_task_finished(itt_sync_obj);
446 } else
447#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
// With ITT notify compiled in, this `if` is the body of the `else` above
448 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
449 return;
450 }
451 if (this_thr->th.th_used_in_team.load() != 1 &&
452 this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
453 continue;
454 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
455 return;
456
457 // At this point, the thread thinks it is in use in a team, or in
458 // transition to be used in a team, but it might have reached this barrier
459 // before it was marked unused by the team. Unused threads are awoken and
460 // shifted to wait on local thread struct elsewhere. It also might reach
461 // this point by being picked up for use by a different team. Either way,
462 // we need to update the tid.
463 tid = __kmp_tid_from_gtid(gtid);
464 team = this_thr->th.th_team;
465 KMP_DEBUG_ASSERT(tid >= 0);
466 KMP_DEBUG_ASSERT(team);
467 b = team->t.b;
468 my_current_iter = b->iter[tid].iter;
469 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
470 my_go_index = tid / b->threads_per_go;
// Complete the 3 -> 1 transition described above
471 if (this_thr->th.th_used_in_team.load() == 3) {
472 (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
473 1);
474 }
475 // Check if go flag is set
476 if (b->go[my_go_index].go.load() != next_go) {
477 // Wait on go flag on team
478 kmp_atomic_flag_64<false, true> my_flag(
479 &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
480 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
481 KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
482 b->iter[tid].iter == 0);
483 KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
484 }
485
486 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
487 return;
488 // At this point, the thread's go location was set. This means the primary
489 // thread is safely in the barrier, and so this thread's data is
490 // up-to-date, but we should check again that this thread is really in
491 // use in the team, as it could have been woken up for the purpose of
492 // changing team size, or reaping threads at shutdown.
493 if (this_thr->th.th_used_in_team.load() == 1)
494 break;
495 } while (1);
496
497 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
498 return;
499
500 group_leader = ((tid % b->threads_per_group) == 0);
501 if (group_leader) {
502 // Tell all the threads in my group they can go!
503 for (size_t go_idx = my_go_index + 1;
504 go_idx < my_go_index + b->gos_per_group; go_idx++) {
505 b->go[go_idx].go.store(next_go);
506 }
507 // Fence added so that workers can see changes to go. sfence inadequate.
508 KMP_MFENCE();
509 }
510
511#if KMP_BARRIER_ICV_PUSH
512 if (propagate_icvs) { // copy ICVs to final dest
513 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
514 tid, FALSE);
515 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
516 (kmp_internal_control_t *)team->t.b->team_icvs);
517 copy_icvs(&thr_bar->th_fixed_icvs,
518 &team->t.t_implicit_task_taskdata[tid].td_icvs);
519 }
520#endif
521 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
522 // This thread is now awake and participating in the barrier;
523 // wake up the other threads in the group
524 size_t nproc = this_thr->th.th_team_nproc;
525 size_t group_end = tid + b->threads_per_group;
526 if (nproc < group_end)
527 group_end = nproc;
528 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
529 }
530 } else { // Primary thread
531 team = this_thr->th.th_team;
532 b = team->t.b;
533 my_current_iter = b->iter[tid].iter;
534 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
535#if KMP_BARRIER_ICV_PUSH
536 if (propagate_icvs) {
537 // primary thread has ICVs in final destination; copy
538 copy_icvs(&thr_bar->th_fixed_icvs,
539 &team->t.t_implicit_task_taskdata[tid].td_icvs);
540 }
541#endif
542 // Tell all the group leaders they can go!
543 for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
544 b->go[go_idx].go.store(next_go);
545 }
546
547 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
548 // Wake-up the group leaders
549 size_t nproc = this_thr->th.th_team_nproc;
550 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
551 b->threads_per_group, tid);
552 }
553
554 // Tell all the threads in my group they can go!
555 for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
556 b->go[go_idx].go.store(next_go);
557 }
558
559 // Fence added so that workers can see changes to go. sfence inadequate.
560 KMP_MFENCE();
561
562 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
563 // Wake-up the other threads in my group
564 size_t nproc = this_thr->th.th_team_nproc;
565 size_t group_end = tid + b->threads_per_group;
566 if (nproc < group_end)
567 group_end = nproc;
568 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
569 }
570 }
571 // Update to next iteration
572 KMP_ASSERT(my_current_iter == b->iter[tid].iter);
573 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
574
575 KA_TRACE(
576 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
577 gtid, team->t.t_id, tid, bt));
578}
579
580// Linear Barrier
// Gather phase of the linear barrier.
//
// A worker releases its b_arrived flag toward thread 0 of the team. The
// primary thread waits, in tid order, for every worker's b_arrived to reach
// the team's b_arrived + KMP_BARRIER_STATE_BUMP, applies reduce (if non-NULL)
// to each worker's reduce_data as it arrives, and finally stores the new
// value into the team's b_arrived.
//
// Returns true only in the cancellable instantiation, when a wait on a worker
// returns a true value (presumably signalling cancellation - confirm against
// kmp_flag_64::wait); returns false otherwise.
581template <bool cancellable = false>
582static bool __kmp_linear_barrier_gather_template(
583 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
584 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
585 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
586 kmp_team_t *team = this_thr->th.th_team;
587 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
588 kmp_info_t **other_threads = team->t.t_threads;
589
590 KA_TRACE(
591 20,
592 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
593 gtid, team->t.t_id, tid, bt));
594 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
595
596#if USE_ITT_BUILD && USE_ITT_NOTIFY
597 // Barrier imbalance - save arrive time to the thread
598 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
599 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
600 __itt_get_timestamp();
601 }
602#endif
603 // We now perform a linear reduction to signal that all of the threads have
604 // arrived.
605 if (!KMP_MASTER_TID(tid)) {
606 KA_TRACE(20,
607 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
608 "arrived(%p): %llu => %llu\n",
609 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
610 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
611 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
612 // Mark arrival to primary thread
613 /* After performing this write, a worker thread may not assume that the team
614 is valid any more - it could be deallocated by the primary thread at any
615 time. */
616 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
617 flag.release();
618 } else {
619 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
620 int nproc = this_thr->th.th_team_nproc;
621 int i;
622 // Don't have to worry about sleep bit here or atomic since team setting
623 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
624
625 // Collect all the worker team member threads.
626 for (i = 1; i < nproc; ++i) {
627#if KMP_CACHE_MANAGE
628 // Prefetch next thread's arrived count
629 if (i + 1 < nproc)
630 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
631#endif /* KMP_CACHE_MANAGE */
632 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
633 "arrived(%p) == %llu\n",
634 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635 team->t.t_id, i,
636 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
637
638 // Wait for worker thread to arrive
639 if (cancellable) {
640 kmp_flag_64<true, false> flag(
641 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
// Early exit: the team's b_arrived is not updated on this path
642 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
643 return true;
644 } else {
645 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
646 new_state);
647 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
648 }
649#if USE_ITT_BUILD && USE_ITT_NOTIFY
650 // Barrier imbalance - write min of the thread time and the other thread
651 // time to the thread.
652 if (__kmp_forkjoin_frames_mode == 2) {
653 this_thr->th.th_bar_min_time = KMP_MIN(
654 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
655 }
656#endif
657 if (reduce) {
658 KA_TRACE(100,
659 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
660 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
661 team->t.t_id, i));
662 OMPT_REDUCTION_DECL(this_thr, gtid);
663 OMPT_REDUCTION_BEGIN;
664 (*reduce)(this_thr->th.th_local.reduce_data,
665 other_threads[i]->th.th_local.reduce_data);
666 OMPT_REDUCTION_END;
667 }
668 }
669 // Don't have to worry about sleep bit here or atomic since team setting
670 team_bar->b_arrived = new_state;
671 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
672 "arrived(%p) = %llu\n",
673 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
674 new_state));
675 }
676 KA_TRACE(
677 20,
678 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
679 gtid, team->t.t_id, tid, bt));
680 return false;
681}
682
// Release phase of the linear barrier.
//
// The primary thread, when the team has more than one thread, optionally
// pushes the ICVs of implicit task 0 to every other implicit task
// (KMP_BARRIER_ICV_PUSH with propagate_icvs set) and then releases each
// worker's b_go flag in tid order. A worker waits for its own b_go to reach
// KMP_BARRIER_STATE_BUMP and then resets it to KMP_INIT_BARRIER_STATE.
//
// Returns true only in the cancellable instantiation, when the worker's wait
// returns a true value (presumably signalling cancellation - confirm against
// kmp_flag_64::wait). Returns false otherwise, including the early exit taken
// by a worker when bt is bs_forkjoin_barrier and g_done is set.
//
// NOTE(review): in the worker path `team` is assigned only under KMP_DEBUG;
// its later uses sit inside KMP_DEBUG_ASSERT/KA_TRACE - confirm those macros
// compile away in non-debug builds.
// NOTE(review): b_go is accessed through kmp_flag_64 yet traced with %u below
// - confirm the format specifier matches its declared width.
683template <bool cancellable = false>
684static bool __kmp_linear_barrier_release_template(
685 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
686 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
687 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
688 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
689 kmp_team_t *team;
690
691 if (KMP_MASTER_TID(tid)) {
692 unsigned int i;
693 kmp_uint32 nproc = this_thr->th.th_team_nproc;
694 kmp_info_t **other_threads;
695
696 team = __kmp_threads[gtid]->th.th_team;
697 KMP_DEBUG_ASSERT(team != NULL);
698 other_threads = team->t.t_threads;
699
700 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
701 "barrier type %d\n",
702 gtid, team->t.t_id, tid, bt));
703
704 if (nproc > 1) {
705#if KMP_BARRIER_ICV_PUSH
706 {
707 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
708 if (propagate_icvs) {
// ngo_load must precede the ngo_store_icvs calls in this scope
709 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
710 for (i = 1; i < nproc; ++i) {
711 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
712 team, i, FALSE);
713 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
714 &team->t.t_implicit_task_taskdata[0].td_icvs);
715 }
716 ngo_sync();
717 }
718 }
719#endif // KMP_BARRIER_ICV_PUSH
720
721 // Now, release all of the worker threads
722 for (i = 1; i < nproc; ++i) {
723#if KMP_CACHE_MANAGE
724 // Prefetch next thread's go flag
725 if (i + 1 < nproc)
726 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
727#endif /* KMP_CACHE_MANAGE */
728 KA_TRACE(
729 20,
730 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
731 "go(%p): %u => %u\n",
732 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
733 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
734 other_threads[i]->th.th_bar[bt].bb.b_go,
735 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
736 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
737 other_threads[i]);
738 flag.release();
739 }
740 }
741 } else { // Wait for the PRIMARY thread to release us
742 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
743 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
744 if (cancellable) {
745 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
746 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
747 return true;
748 } else {
749 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
750 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
751 }
752#if USE_ITT_BUILD && USE_ITT_NOTIFY
753 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
754 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
755 // disabled)
756 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
757 // Cancel wait on previous parallel region...
758 __kmp_itt_task_starting(itt_sync_obj);
759
760 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
761 return false;
762
763 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
764 if (itt_sync_obj != NULL)
765 // Call prepare as early as possible for "new" barrier
766 __kmp_itt_task_finished(itt_sync_obj);
767 } else
768#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
// With ITT notify compiled in, the `if` below is the body of the `else` above
769 // Early exit for reaping threads releasing forkjoin barrier
770 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
771 return false;
772// The worker thread may now assume that the team is valid.
773#ifdef KMP_DEBUG
774 tid = __kmp_tid_from_gtid(gtid);
775 team = __kmp_threads[gtid]->th.th_team;
776#endif
777 KMP_DEBUG_ASSERT(team != NULL);
// Reset this thread's go flag to its initial state
778 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
779 KA_TRACE(20,
780 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
781 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
782 KMP_MB(); // Flush all pending memory write invalidates.
783 }
784 KA_TRACE(
785 20,
786 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
787 gtid, team->t.t_id, tid, bt));
788 return false;
789}
790
791static void __kmp_linear_barrier_gather(
792 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
793 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
794 __kmp_linear_barrier_gather_template<false>(
795 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
796}
797
798static bool __kmp_linear_barrier_gather_cancellable(
799 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
800 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
801 return __kmp_linear_barrier_gather_template<true>(
802 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
803}
804
805static void __kmp_linear_barrier_release(
806 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
807 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
808 __kmp_linear_barrier_release_template<false>(
809 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
810}
811
812static bool __kmp_linear_barrier_release_cancellable(
813 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
814 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
815 return __kmp_linear_barrier_release_template<true>(
816 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
817}
818
819// Tree barrier
// Gather phase of the tree barrier.
//
// With branch_bits taken from __kmp_barrier_gather_branch_bits[bt], the
// children of thread tid are the tids starting at (tid << branch_bits) + 1,
// at most branch_factor of them and all below nproc; the parent of a worker
// is (tid - 1) >> branch_bits. A thread with children waits for each child's
// b_arrived to reach the team's b_arrived + KMP_BARRIER_STATE_BUMP, applying
// reduce (if non-NULL) to the child's reduce_data as it arrives. A worker
// then releases its own b_arrived flag toward its parent; the primary thread
// instead updates the team's b_arrived.
820static void __kmp_tree_barrier_gather(
821 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
822 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
823 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
824 kmp_team_t *team = this_thr->th.th_team;
825 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
826 kmp_info_t **other_threads = team->t.t_threads;
827 kmp_uint32 nproc = this_thr->th.th_team_nproc;
828 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
829 kmp_uint32 branch_factor = 1 << branch_bits;
830 kmp_uint32 child;
831 kmp_uint32 child_tid;
832 kmp_uint64 new_state = 0;
833
834 KA_TRACE(
835 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
836 gtid, team->t.t_id, tid, bt));
837 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
838
839#if USE_ITT_BUILD && USE_ITT_NOTIFY
840 // Barrier imbalance - save arrive time to the thread
841 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
842 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
843 __itt_get_timestamp();
844 }
845#endif
846 // Perform tree gather to wait until all threads have arrived; reduce any
847 // required data as we go
// tid of this thread's first child
848 child_tid = (tid << branch_bits) + 1;
849 if (child_tid < nproc) {
850 // Parent threads wait for all their children to arrive
851 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
852 child = 1;
853 do {
854 kmp_info_t *child_thr = other_threads[child_tid];
855 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
856#if KMP_CACHE_MANAGE
857 // Prefetch next thread's arrived count
858 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
859 KMP_CACHE_PREFETCH(
860 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
861#endif /* KMP_CACHE_MANAGE */
862 KA_TRACE(20,
863 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
864 "arrived(%p) == %llu\n",
865 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
866 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
867 // Wait for child to arrive
868 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
869 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870#if USE_ITT_BUILD && USE_ITT_NOTIFY
871 // Barrier imbalance - write min of the thread time and a child time to
872 // the thread.
873 if (__kmp_forkjoin_frames_mode == 2) {
874 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
875 child_thr->th.th_bar_min_time);
876 }
877#endif
878 if (reduce) {
879 KA_TRACE(100,
880 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
881 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
882 team->t.t_id, child_tid));
883 OMPT_REDUCTION_DECL(this_thr, gtid);
884 OMPT_REDUCTION_BEGIN;
885 (*reduce)(this_thr->th.th_local.reduce_data,
886 child_thr->th.th_local.reduce_data);
887 OMPT_REDUCTION_END;
888 }
889 child++;
890 child_tid++;
891 } while (child <= branch_factor && child_tid < nproc);
892 }
893
894 if (!KMP_MASTER_TID(tid)) { // Worker threads
895 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
896
897 KA_TRACE(20,
898 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
899 "arrived(%p): %llu => %llu\n",
900 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
901 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
902 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
903
904 // Mark arrival to parent thread
905 /* After performing this write, a worker thread may not assume that the team
906 is valid any more - it could be deallocated by the primary thread at any
907 time. */
908 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
909 flag.release();
910 } else {
911 // Need to update the team arrived pointer if we are the primary thread
912 if (nproc > 1) // New value was already computed above
913 team->t.t_bar[bt].b_arrived = new_state;
914 else
// Single-thread team: new_state was never computed, so bump in place
915 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
916 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
917 "arrived(%p) = %llu\n",
918 gtid, team->t.t_id, tid, team->t.t_id,
919 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
920 }
921 KA_TRACE(20,
922 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
923 gtid, team->t.t_id, tid, bt));
924}
925
// Release phase of the tree barrier for barrier type `bt`.
//
// A thread for which KMP_MASTER_TID(tid) is false first waits until its own
// b_go flag reaches KMP_BARRIER_STATE_BUMP. It then returns early if this is
// the fork/join barrier and __kmp_global.g.g_done is set; otherwise it
// re-reads its team and tid and resets b_go to KMP_INIT_BARRIER_STATE.
// Every thread that gets past that point releases its children: the tids
// starting at (tid << branch_bits) + 1, at most branch_factor of them, and
// only those below nproc.
//
// Parameters:
//   bt             barrier type; selects th_bar[bt] and the release branch
//                  bits
//   this_thr       descriptor of the calling thread
//   gtid           global thread id of the caller
//   tid            team-local thread id on entry; workers recompute it from
//                  gtid once they have been released
//   propagate_icvs when nonzero and KMP_BARRIER_ICV_PUSH is enabled, each
//                  child's implicit task is initialised and the ICVs of
//                  implicit task 0 are copied to it before it is released
//   itt_sync_obj   ITT sync object, passed through USE_ITT_BUILD_ARG
926static void __kmp_tree_barrier_release(
927 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
928 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
929 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
930 kmp_team_t *team;
931 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
932 kmp_uint32 nproc;
933 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
934 kmp_uint32 branch_factor = 1 << branch_bits;
935 kmp_uint32 child;
936 kmp_uint32 child_tid;
937
938 // Perform a tree release for all of the threads that have been gathered
939 if (!KMP_MASTER_TID(
940 tid)) { // Handle fork barrier workers who aren't part of a team yet
941 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
942 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
943 // Wait for parent thread to release us
944 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
945 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
946#if USE_ITT_BUILD && USE_ITT_NOTIFY
947 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
948 // In fork barrier where we could not get the object reliably (or
949 // ITTNOTIFY is disabled)
950 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
951 // Cancel wait on previous parallel region...
952 __kmp_itt_task_starting(itt_sync_obj);
953
954 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
955 return;
956
957 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
958 if (itt_sync_obj != NULL)
959 // Call prepare as early as possible for "new" barrier
960 __kmp_itt_task_finished(itt_sync_obj);
961 } else
962#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
963 // Early exit for reaping threads releasing forkjoin barrier
 // NOTE: in ITT builds the `else` above binds to this `if`, so this check is
 // reached only when the ITT branch (which carries its own identical check)
 // was not taken; in non-ITT builds it is unconditional.
964 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
965 return;
966
967 // The worker thread may now assume that the team is valid.
968 team = __kmp_threads[gtid]->th.th_team;
969 KMP_DEBUG_ASSERT(team != NULL);
970 tid = __kmp_tid_from_gtid(gtid);
971
 // Re-arm this thread's go flag so the next release can bump it again.
972 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
973 KA_TRACE(20,
974 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
975 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
976 KMP_MB(); // Flush all pending memory write invalidates.
977 } else {
978 team = __kmp_threads[gtid]->th.th_team;
979 KMP_DEBUG_ASSERT(team != NULL);
980 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
981 "barrier type %d\n",
982 gtid, team->t.t_id, tid, bt));
983 }
984 nproc = this_thr->th.th_team_nproc;
 // tid of this thread's first child in the release tree.
985 child_tid = (tid << branch_bits) + 1;
986
987 if (child_tid < nproc) {
988 kmp_info_t **other_threads = team->t.t_threads;
989 child = 1;
990 // Parent threads release all their children
991 do {
992 kmp_info_t *child_thr = other_threads[child_tid];
993 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
994#if KMP_CACHE_MANAGE
995 // Prefetch next thread's go count
996 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
997 KMP_CACHE_PREFETCH(
998 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
999#endif /* KMP_CACHE_MANAGE */
1000
1001#if KMP_BARRIER_ICV_PUSH
1002 {
1003 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
 // The ICVs are written to the child's implicit task before the child's
 // go flag is bumped below.
1004 if (propagate_icvs) {
1005 __kmp_init_implicit_task(team->t.t_ident,
1006 team->t.t_threads[child_tid], team,
1007 child_tid, FALSE);
1008 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
1009 &team->t.t_implicit_task_taskdata[0].td_icvs);
1010 }
1011 }
1012#endif // KMP_BARRIER_ICV_PUSH
1013 KA_TRACE(20,
1014 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1015 "go(%p): %u => %u\n",
1016 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1017 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1018 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1019 // Release child from barrier
1020 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1021 flag.release();
1022 child++;
1023 child_tid++;
1024 } while (child <= branch_factor && child_tid < nproc);
1025 }
1026 KA_TRACE(
1027 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1028 gtid, team->t.t_id, tid, bt));
1029}
1030
1031// Hyper Barrier
// Gather phase of the hypercube-embedded tree barrier for barrier type `bt`.
//
// The thread walks the levels of the tree from the bottom (level grows by
// branch_bits per step). At each level it looks at the branch_bits-wide digit
// of tid at that level:
//   - nonzero digit: the thread signals its parent (tid with the low
//     level + branch_bits bits cleared) through its own b_arrived flag and
//     leaves the loop;
//   - zero digit: the thread acts as a parent and waits for each existing
//     child tid + k * (1 << level), k = 1 .. branch_factor - 1, to reach
//     new_state, folding in the child's reduce_data when `reduce` is non-null.
// When the loop is done, the primary thread stores the new arrived value in
// the team's barrier state.
//
// Parameters:
//   bt           barrier type; selects th_bar[bt] / t_bar[bt] and the gather
//                branch bits
//   this_thr     descriptor of the calling thread
//   gtid         global thread id of the caller (used for tracing and OMPT)
//   tid          team-local thread id of the caller
//   reduce       optional reduction callback, called as
//                reduce(this thread's reduce_data, child's reduce_data)
//   itt_sync_obj ITT sync object, passed through USE_ITT_BUILD_ARG
1032static void __kmp_hyper_barrier_gather(
1033 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1034 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1035 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1036 kmp_team_t *team = this_thr->th.th_team;
1037 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1038 kmp_info_t **other_threads = team->t.t_threads;
1039 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1040 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1041 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1042 kmp_uint32 branch_factor = 1 << branch_bits;
1043 kmp_uint32 offset;
1044 kmp_uint32 level;
1045
1046 KA_TRACE(
1047 20,
1048 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1049 gtid, team->t.t_id, tid, bt));
1050 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1051
1052#if USE_ITT_BUILD && USE_ITT_NOTIFY
1053 // Barrier imbalance - save arrive time to the thread
1054 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1055 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1056 __itt_get_timestamp();
1057 }
1058#endif
1059 /* Perform a hypercube-embedded tree gather to wait until all of the threads
1060 have arrived, and reduce any required data as we go. */
 // p_flag wraps this thread's own b_arrived; the parent is attached as the
 // waiter just before the flag is released.
1061 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1062 for (level = 0, offset = 1; offset < num_threads;
1063 level += branch_bits, offset <<= branch_bits) {
1064 kmp_uint32 child;
1065 kmp_uint32 child_tid;
1066
 // Nonzero digit at this level: this thread is a child here, so it reports
 // to its parent and stops climbing.
1067 if (((tid >> level) & (branch_factor - 1)) != 0) {
1068 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1069
1070 KMP_MB(); // Synchronize parent and child threads.
1071 KA_TRACE(20,
1072 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1073 "arrived(%p): %llu => %llu\n",
1074 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1075 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1076 thr_bar->b_arrived,
1077 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1078 // Mark arrival to parent thread
1079 /* After performing this write (in the last iteration of the enclosing for
1080 loop), a worker thread may not assume that the team is valid any more
1081 - it could be deallocated by the primary thread at any time. */
1082 p_flag.set_waiter(other_threads[parent_tid]);
1083 p_flag.release();
1084 break;
1085 }
1086
1087 // Parent threads wait for children to arrive
 // new_state is computed once, the first time this thread acts as a parent.
1088 if (new_state == KMP_BARRIER_UNUSED_STATE)
1089 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1090 for (child = 1, child_tid = tid + (1 << level);
1091 child < branch_factor && child_tid < num_threads;
1092 child++, child_tid += (1 << level)) {
1093 kmp_info_t *child_thr = other_threads[child_tid];
1094 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1095#if KMP_CACHE_MANAGE
1096 kmp_uint32 next_child_tid = child_tid + (1 << level);
1097 // Prefetch next thread's arrived count
1098 if (child + 1 < branch_factor && next_child_tid < num_threads)
1099 KMP_CACHE_PREFETCH(
1100 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1101#endif /* KMP_CACHE_MANAGE */
1102 KA_TRACE(20,
1103 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1104 "arrived(%p) == %llu\n",
1105 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1106 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1107 // Wait for child to arrive
1108 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1109 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1110 KMP_MB(); // Synchronize parent and child threads.
1111#if USE_ITT_BUILD && USE_ITT_NOTIFY
1112 // Barrier imbalance - write min of the thread time and a child time to
1113 // the thread.
1114 if (__kmp_forkjoin_frames_mode == 2) {
1115 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1116 child_thr->th.th_bar_min_time);
1117 }
1118#endif
 // The child's reduce_data is combined only after the wait above returned.
1119 if (reduce) {
1120 KA_TRACE(100,
1121 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1122 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1123 team->t.t_id, child_tid));
1124 OMPT_REDUCTION_DECL(this_thr, gtid);
1125 OMPT_REDUCTION_BEGIN;
1126 (*reduce)(this_thr->th.th_local.reduce_data,
1127 child_thr->th.th_local.reduce_data);
1128 OMPT_REDUCTION_END;
1129 }
1130 }
1131 }
1132
1133 if (KMP_MASTER_TID(tid)) {
1134 // Need to update the team arrived pointer if we are the primary thread
 // new_state is still unused only when the level loop above never ran
 // (offset < num_threads was false from the start); bump in place then.
1135 if (new_state == KMP_BARRIER_UNUSED_STATE)
1136 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1137 else
1138 team->t.t_bar[bt].b_arrived = new_state;
1139 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1140 "arrived(%p) = %llu\n",
1141 gtid, team->t.t_id, tid, team->t.t_id,
1142 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1143 }
1144 KA_TRACE(
1145 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1146 gtid, team->t.t_id, tid, bt));
1147}
1148
1149// The reverse versions seem to beat the forward versions overall
1150#define KMP_REVERSE_HYPER_BAR
// Release phase of the hypercube-embedded tree barrier for barrier type `bt`.
//
// The primary thread (KMP_MASTER_TID(tid)) starts releasing immediately; when
// ICV push is enabled and propagate_icvs is nonzero it first copies its
// implicit-task ICVs into its th_fixed_icvs. Any other thread first waits
// until its own b_go reaches KMP_BARRIER_STATE_BUMP, returns early if this is
// the fork/join barrier and __kmp_global.g.g_done is set, and otherwise
// re-reads team and tid and resets b_go to KMP_INIT_BARRIER_STATE.
//
// Each thread then releases its children level by level. With
// KMP_REVERSE_HYPER_BAR defined (as it is just above) the levels and the
// children inside a level are visited from highest to lowest; otherwise in
// the same order as the gather. With ICV push, th_fixed_icvs is copied from
// parent to child before the child is released, and each non-primary thread
// finally copies its th_fixed_icvs into its own implicit task.
//
// Parameters:
//   bt             barrier type; selects th_bar[bt] and the release branch
//                  bits
//   this_thr       descriptor of the calling thread
//   gtid           global thread id of the caller
//   tid            team-local thread id on entry; workers recompute it from
//                  gtid once they have been released
//   propagate_icvs nonzero to push ICVs down the tree (only has an effect
//                  when KMP_BARRIER_ICV_PUSH is enabled)
//   itt_sync_obj   ITT sync object, passed through USE_ITT_BUILD_ARG
1151static void __kmp_hyper_barrier_release(
1152 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1153 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1154 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1155 kmp_team_t *team;
1156 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1157 kmp_info_t **other_threads;
1158 kmp_uint32 num_threads;
1159 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1160 kmp_uint32 branch_factor = 1 << branch_bits;
1161 kmp_uint32 child;
1162 kmp_uint32 child_tid;
1163 kmp_uint32 offset;
1164 kmp_uint32 level;
1165
1166 /* Perform a hypercube-embedded tree release for all of the threads that have
1167 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1168 are released in the reverse order of the corresponding gather, otherwise
1169 threads are released in the same order. */
1170 if (KMP_MASTER_TID(tid)) { // primary thread
1171 team = __kmp_threads[gtid]->th.th_team;
1172 KMP_DEBUG_ASSERT(team != NULL);
1173 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1174 "barrier type %d\n",
1175 gtid, team->t.t_id, tid, bt));
1176#if KMP_BARRIER_ICV_PUSH
1177 if (propagate_icvs) { // primary already has ICVs in final destination; copy
1178 copy_icvs(&thr_bar->th_fixed_icvs,
1179 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1180 }
1181#endif
1182 } else { // Handle fork barrier workers who aren't part of a team yet
1183 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1184 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1185 // Wait for parent thread to release us
1186 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1187 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1188#if USE_ITT_BUILD && USE_ITT_NOTIFY
1189 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1190 // In fork barrier where we could not get the object reliably
1191 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1192 // Cancel wait on previous parallel region...
1193 __kmp_itt_task_starting(itt_sync_obj);
1194
1195 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1196 return;
1197
1198 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1199 if (itt_sync_obj != NULL)
1200 // Call prepare as early as possible for "new" barrier
1201 __kmp_itt_task_finished(itt_sync_obj);
1202 } else
1203#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1204 // Early exit for reaping threads releasing forkjoin barrier
 // NOTE: in ITT builds the `else` above binds to this `if`, so this check is
 // reached only when the ITT branch (which carries its own identical check)
 // was not taken; in non-ITT builds it is unconditional.
1205 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1206 return;
1207
1208 // The worker thread may now assume that the team is valid.
1209 team = __kmp_threads[gtid]->th.th_team;
1210 KMP_DEBUG_ASSERT(team != NULL);
1211 tid = __kmp_tid_from_gtid(gtid);
1212
 // Re-arm this thread's go flag so the next release can bump it again.
1213 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1214 KA_TRACE(20,
1215 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1216 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1217 KMP_MB(); // Flush all pending memory write invalidates.
1218 }
1219 num_threads = this_thr->th.th_team_nproc;
1220 other_threads = team->t.t_threads;
1221
1222#ifdef KMP_REVERSE_HYPER_BAR
1223 // Count up to correct level for parent
 // The empty-bodied loop stops at the first level where this thread's digit
 // is nonzero, or once offset has reached num_threads.
1224 for (level = 0, offset = 1;
1225 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1226 level += branch_bits, offset <<= branch_bits)
1227 ;
1228
1229 // Now go down from there
 // Termination is driven by offset reaching 0; level is decremented in
 // lockstep and is not read again after the loop ends (level and offset are
 // kmp_uint32, presumably unsigned, so the last decrement of level wraps).
1230 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1231 level -= branch_bits, offset >>= branch_bits)
1232#else
1233 // Go down the tree, level by level
1234 for (level = 0, offset = 1; offset < num_threads;
1235 level += branch_bits, offset <<= branch_bits)
1236#endif // KMP_REVERSE_HYPER_BAR
1237 {
1238#ifdef KMP_REVERSE_HYPER_BAR
1239 /* Now go in reverse order through the children, highest to lowest.
1240 Initial setting of child is conservative here. */
 // child is capped at branch_factor - 1; candidates whose tid falls outside
 // the team are skipped by the child_tid >= num_threads test in the body.
1241 child = num_threads >> ((level == 0) ? level : level - 1);
1242 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1243 child_tid = tid + (child << level);
1244 child >= 1; child--, child_tid -= (1 << level))
1245#else
1246 if (((tid >> level) & (branch_factor - 1)) != 0)
1247 // No need to go lower than this, since this is the level parent would be
1248 // notified
1249 break;
1250 // Iterate through children on this level of the tree
1251 for (child = 1, child_tid = tid + (1 << level);
1252 child < branch_factor && child_tid < num_threads;
1253 child++, child_tid += (1 << level))
1254#endif // KMP_REVERSE_HYPER_BAR
1255 {
1256 if (child_tid >= num_threads)
1257 continue; // Child doesn't exist so keep going
1258 else {
1259 kmp_info_t *child_thr = other_threads[child_tid];
1260 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1261#if KMP_CACHE_MANAGE
 // NOTE(review): next_child_tid steps downwards, which matches the reverse
 // iteration; the forward (#else) loop steps child_tid upwards, so in that
 // configuration this looks like it prefetches the previous child - confirm.
1262 kmp_uint32 next_child_tid = child_tid - (1 << level);
1263// Prefetch next thread's go count
1264#ifdef KMP_REVERSE_HYPER_BAR
1265 if (child - 1 >= 1 && next_child_tid < num_threads)
1266#else
1267 if (child + 1 < branch_factor && next_child_tid < num_threads)
1268#endif // KMP_REVERSE_HYPER_BAR
1269 KMP_CACHE_PREFETCH(
1270 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1271#endif /* KMP_CACHE_MANAGE */
1272
1273#if KMP_BARRIER_ICV_PUSH
1274 if (propagate_icvs) // push my fixed ICVs to my child
1275 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1276#endif // KMP_BARRIER_ICV_PUSH
1277
1278 KA_TRACE(
1279 20,
1280 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1281 "go(%p): %u => %u\n",
1282 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1283 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1284 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1285 // Release child from barrier
1286 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1287 flag.release();
1288 }
1289 }
1290 }
1291#if KMP_BARRIER_ICV_PUSH
 // Workers move the ICVs they received in th_fixed_icvs into their own
 // implicit task only after all of their children have been released.
1292 if (propagate_icvs &&
1293 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1294 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1295 FALSE);
1296 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1297 &thr_bar->th_fixed_icvs);
1298 }
1299#endif
1300 KA_TRACE(
1301 20,
1302 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1303 gtid, team->t.t_id, tid, bt));
1304}
1305
1306// Hierarchical Barrier
1307
1308// Initialize thread barrier data
1309/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1310 Performs the minimum amount of initialization required based on how the team
1311 has changed. Returns true if leaf children will require both on-core and
1312 traditional wake-up mechanisms. For example, if the team size increases,
1313 threads already in the team will respond to on-core wakeup on their parent
1314 thread, but threads newly added to the team will only be listening on
1315 their local b_go. */
// (Re-)initializes the hierarchical-barrier fields of `thr_bar` for the
// calling thread, doing only the work implied by what changed since the
// previous call (team pointer, team size, or tid).
//
// Parameters:
//   bt      barrier type; used to index th_bar[] when locating the parent's
//           barrier state
//   thr_bar this thread's barrier state, updated in place
//   nproc   current team size
//   gtid    global thread id (not referenced in the body)
//   tid     team-local thread id
//   team    team the thread currently belongs to
//
// Returns true when thr_bar was uninitialized (team == NULL) or the team
// pointer or tid differs from the cached value; false otherwise.
1316static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1317 kmp_bstate_t *thr_bar,
1318 kmp_uint32 nproc, int gtid,
1319 int tid, kmp_team_t *team) {
1320 // Checks to determine if (re-)initialization is needed
1321 bool uninitialized = thr_bar->team == NULL;
1322 bool team_changed = team != thr_bar->team;
1323 bool team_sz_changed = nproc != thr_bar->nproc;
1324 bool tid_changed = tid != thr_bar->old_tid;
1325 bool retval = false;
1326
 // The hierarchy description is recomputed only when the team size changed.
1327 if (uninitialized || team_sz_changed) {
1328 __kmp_get_hierarchy(nproc, thr_bar);
1329 }
1330
 // Position in the hierarchy: my_level, parent_tid and the byte offset used
 // for on-core signalling.
1331 if (uninitialized || team_sz_changed || tid_changed) {
1332 thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1333 thr_bar->parent_tid = -1; // default for primary thread
1334 if (!KMP_MASTER_TID(tid)) {
1335 // if not primary thread, find parent thread in hierarchy
1336 kmp_uint32 d = 0;
1337 while (d < thr_bar->depth) { // find parent based on level of thread in
1338 // hierarchy, and note level
1339 kmp_uint32 rem;
1340 if (d == thr_bar->depth - 2) { // reached level right below the primary
1341 thr_bar->parent_tid = 0;
1342 thr_bar->my_level = d;
1343 break;
1344 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1345 // TODO: can we make the above op faster?
1346 // thread is not a subtree root at next level, so this is max
1347 thr_bar->parent_tid = tid - rem;
1348 thr_bar->my_level = d;
1349 break;
1350 }
1351 ++d;
1352 }
1353 }
 // offset = 7 - (index of this thread among its parent's children at
 // my_level); it mirrors the 7 - i byte indexing used for leaf_state below.
1354 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1355 (thr_bar->skip_per_level[thr_bar->my_level])),
1356 &(thr_bar->offset));
1357 thr_bar->old_tid = tid;
1358 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1359 thr_bar->team = team;
 // NOTE(review): for the primary thread parent_tid is still -1 here, so this
 // (and the identical lookup in the next block) reads t_threads[-1] - confirm
 // that this is intended / harmless.
1360 thr_bar->parent_bar =
1361 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1362 }
 // This block also covers the team-pointer-only change (same size, same tid),
 // which the block above does not handle; it alone determines the return
 // value.
1363 if (uninitialized || team_changed || tid_changed) {
1364 thr_bar->team = team;
1365 thr_bar->parent_bar =
1366 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1367 retval = true;
1368 }
 // Leaf bookkeeping: the number of leaf children (clamped so that
 // tid + leaf_kids stays below nproc) and the leaf_state mask with one byte
 // set to 1 per leaf child, filled from byte 7 downwards.
1369 if (uninitialized || team_sz_changed || tid_changed) {
1370 thr_bar->nproc = nproc;
1371 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1372 if (thr_bar->my_level == 0)
1373 thr_bar->leaf_kids = 0;
1374 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1375 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1376 thr_bar->leaf_state = 0;
1377 for (int i = 0; i < thr_bar->leaf_kids; ++i)
1378 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1379 }
1380 return retval;
1381}
1382
// Gather phase of the hierarchical barrier for barrier type `bt`.
//
// The thread first decides whether the on-core variant may be used (only when
// the computed nesting level is 1) and refreshes its hierarchical barrier
// state. A non-leaf thread (my_level != 0) then waits for its children:
//   - infinite blocktime and on-core barrier enabled: leaf children check in
//     on bytes of this thread's own b_arrived (mask leaf_state), which are
//     cleared again afterwards; children at levels 1 .. my_level - 1 are
//     waited for on their own b_arrived flags;
//   - otherwise: all children at levels 0 .. my_level - 1 are waited for on
//     their own b_arrived flags.
// `reduce`, when non-null, is applied to each child's reduce_data after that
// child has arrived. Finally a worker signals its parent (through its own
// b_arrived, or through its byte in the parent's b_arrived for an on-core
// leaf), and the primary thread stores new_state in the team barrier state.
//
// Parameters:
//   bt           barrier type; selects th_bar[bt] / t_bar[bt]
//   this_thr     descriptor of the calling thread
//   gtid         global thread id of the caller
//   tid          team-local thread id of the caller
//   reduce       optional reduction callback, called as
//                reduce(this thread's reduce_data, child's reduce_data)
//   itt_sync_obj ITT sync object, passed through USE_ITT_BUILD_ARG
1383static void __kmp_hierarchical_barrier_gather(
1384 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1385 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1386 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1387 kmp_team_t *team = this_thr->th.th_team;
1388 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1389 kmp_uint32 nproc = this_thr->th.th_team_nproc;
1390 kmp_info_t **other_threads = team->t.t_threads;
1391 kmp_uint64 new_state = 0;
1392
 // Effective nesting level: the team's t_level, plus one inside a teams
 // construct with more than one team.
1393 int level = team->t.t_level;
1394 if (other_threads[0]
1395 ->th.th_teams_microtask) // are we inside the teams construct?
1396 if (this_thr->th.th_teams_size.nteams > 1)
1397 ++level; // level was not increased in teams construct for team_of_masters
1398 if (level == 1)
1399 thr_bar->use_oncore_barrier = 1;
1400 else
1401 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1402
1403 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1404 "barrier type %d\n",
1405 gtid, team->t.t_id, tid, bt));
1406 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1407
1408#if USE_ITT_BUILD && USE_ITT_NOTIFY
1409 // Barrier imbalance - save arrive time to the thread
1410 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1411 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1412 }
1413#endif
1414
 // Return value deliberately ignored here; only the refreshed thr_bar fields
 // are needed for the gather.
1415 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1416 team);
1417
1418 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1419 kmp_int32 child_tid;
1420 new_state =
1421 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1422 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1423 thr_bar->use_oncore_barrier) {
1424 if (thr_bar->leaf_kids) {
1425 // First, wait for leaf children to check-in on my b_arrived flag
 // The expected value is the current arrived value with every leaf byte of
 // leaf_state set; the primary takes the base from its own b_arrived,
 // workers from the team's.
1426 kmp_uint64 leaf_state =
1427 KMP_MASTER_TID(tid)
1428 ? thr_bar->b_arrived | thr_bar->leaf_state
1429 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1430 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1431 "for leaf kids\n",
1432 gtid, team->t.t_id, tid));
1433 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1434 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
 // NOTE(review): the OMPT_REDUCTION_* markers are used only in this leaf
 // path; the per-child reduce calls further down are not bracketed by them -
 // confirm whether that is intentional.
1435 if (reduce) {
1436 OMPT_REDUCTION_DECL(this_thr, gtid);
1437 OMPT_REDUCTION_BEGIN;
1438 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1439 ++child_tid) {
1440 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1441 "T#%d(%d:%d)\n",
1442 gtid, team->t.t_id, tid,
1443 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1444 child_tid));
1445 (*reduce)(this_thr->th.th_local.reduce_data,
1446 other_threads[child_tid]->th.th_local.reduce_data);
1447 }
1448 OMPT_REDUCTION_END;
1449 }
1450 // clear leaf_state bits
1451 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1452 }
1453 // Next, wait for higher level children on each child's b_arrived flag
1454 for (kmp_uint32 d = 1; d < thr_bar->my_level;
1455 ++d) { // gather lowest level threads first, but skip 0
 // Children at level d are tid + skip, tid + 2 * skip, ... below `last`,
 // where `last` is the end of this thread's subtree at level d + 1, clamped
 // to nproc.
1456 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1457 skip = thr_bar->skip_per_level[d];
1458 if (last > nproc)
1459 last = nproc;
1460 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1461 kmp_info_t *child_thr = other_threads[child_tid];
1462 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1463 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1464 "T#%d(%d:%d) "
1465 "arrived(%p) == %llu\n",
1466 gtid, team->t.t_id, tid,
1467 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1468 child_tid, &child_bar->b_arrived, new_state));
1469 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1470 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1471 if (reduce) {
1472 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1473 "T#%d(%d:%d)\n",
1474 gtid, team->t.t_id, tid,
1475 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1476 child_tid));
1477 (*reduce)(this_thr->th.th_local.reduce_data,
1478 child_thr->th.th_local.reduce_data);
1479 }
1480 }
1481 }
1482 } else { // Blocktime is not infinite
 // This branch is also taken when blocktime is infinite but the on-core
 // barrier is disabled (see the condition above); level 0 children are
 // included here.
1483 for (kmp_uint32 d = 0; d < thr_bar->my_level;
1484 ++d) { // Gather lowest level threads first
1485 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1486 skip = thr_bar->skip_per_level[d];
1487 if (last > nproc)
1488 last = nproc;
1489 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1490 kmp_info_t *child_thr = other_threads[child_tid];
1491 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1492 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1493 "T#%d(%d:%d) "
1494 "arrived(%p) == %llu\n",
1495 gtid, team->t.t_id, tid,
1496 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1497 child_tid, &child_bar->b_arrived, new_state));
1498 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1499 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1500 if (reduce) {
1501 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1502 "T#%d(%d:%d)\n",
1503 gtid, team->t.t_id, tid,
1504 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1505 child_tid));
1506 (*reduce)(this_thr->th.th_local.reduce_data,
1507 child_thr->th.th_local.reduce_data);
1508 }
1509 }
1510 }
1511 }
1512 }
1513 // All subordinates are gathered; now release parent if not primary thread
1514
1515 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1516 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1517 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1518 gtid, team->t.t_id, tid,
1519 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1520 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1521 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1522 /* Mark arrival to parent: After performing this write, a worker thread may
1523 not assume that the team is valid any more - it could be deallocated by
1524 the primary thread at any time. */
1525 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1526 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1527 // flag; release it
1528 kmp_flag_64<> flag(&thr_bar->b_arrived,
1529 other_threads[thr_bar->parent_tid]);
1530 flag.release();
1531 } else {
1532 // Leaf does special release on "offset" bits of parent's b_arrived flag
 // The leaf's own b_arrived is written directly here, since no kmp_flag_64
 // release is performed on it in this path.
1533 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1534 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1535 thr_bar->offset + 1);
1536 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1537 flag.release();
1538 }
1539 } else { // Primary thread needs to update the team's b_arrived value
 // NOTE(review): new_state is only computed when my_level != 0; if the
 // primary could ever be a leaf (my_level == 0) this would store 0 - confirm
 // that this case cannot occur or is harmless.
1540 team->t.t_bar[bt].b_arrived = new_state;
1541 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1542 "arrived(%p) = %llu\n",
1543 gtid, team->t.t_id, tid, team->t.t_id,
1544 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1545 }
1546 // Is the team access below unsafe or just technically invalid?
1547 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1548 "barrier type %d\n",
1549 gtid, team->t.t_id, tid, bt));
1550}
1551
1552static void __kmp_hierarchical_barrier_release(
1553 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1554 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1555 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1556 kmp_team_t *team;
1557 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1558 kmp_uint32 nproc;
1559 bool team_change = false; // indicates on-core barrier shouldn't be used
1560
1561 if (KMP_MASTER_TID(tid)) {
1562 team = __kmp_threads[gtid]->th.th_team;
1563 KMP_DEBUG_ASSERT(team != NULL);
1564 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1565 "entered barrier type %d\n",
1566 gtid, team->t.t_id, tid, bt));
1567 } else { // Worker threads
1568 // Wait for parent thread to release me
1569 if (!thr_bar->use_oncore_barrier ||
1570 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1571 thr_bar->team == NULL) {
1572 // Use traditional method of waiting on my own b_go flag
1573 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1574 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1575 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1576 TCW_8(thr_bar->b_go,
1577 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1578 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1579 // infinite, not nested
1580 // Wait on my "offset" bits on parent's b_go flag
1581 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1582 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1583 thr_bar->offset + 1, bt,
1584 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1585 flag.wait(this_thr, TRUE);
1586 if (thr_bar->wait_flag ==
1587 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1588 TCW_8(thr_bar->b_go,
1589 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1590 } else { // Reset my bits on parent's b_go flag
1591 (RCAST(volatile char *,
1592 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1593 }
1594 }
1595 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1596 // Early exit for reaping threads releasing forkjoin barrier
1597 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1598 return;
1599 // The worker thread may now assume that the team is valid.
1600 team = __kmp_threads[gtid]->th.th_team;
1601 KMP_DEBUG_ASSERT(team != NULL);
1602 tid = __kmp_tid_from_gtid(gtid);
1603
1604 KA_TRACE(
1605 20,
1606 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1607 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1608 KMP_MB(); // Flush all pending memory write invalidates.
1609 }
1610
1611 nproc = this_thr->th.th_team_nproc;
1612 int level = team->t.t_level;
1613 if (team->t.t_threads[0]
1614 ->th.th_teams_microtask) { // are we inside the teams construct?
1615 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1616 this_thr->th.th_teams_level == level)
1617 ++level; // level was not increased in teams construct for team_of_workers
1618 if (this_thr->th.th_teams_size.nteams > 1)
1619 ++level; // level was not increased in teams construct for team_of_masters
1620 }
1621 if (level == 1)
1622 thr_bar->use_oncore_barrier = 1;
1623 else
1624 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1625
1626 // If the team size has increased, we still communicate with old leaves via
1627 // oncore barrier.
1628 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1629 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1630 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1631 tid, team);
1632 // But if the entire team changes, we won't use oncore barrier at all
1633 if (team_change)
1634 old_leaf_kids = 0;
1635
1636#if KMP_BARRIER_ICV_PUSH
1637 if (propagate_icvs) {
1638 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1639 FALSE);
1640 if (KMP_MASTER_TID(
1641 tid)) { // primary already has copy in final destination; copy
1642 copy_icvs(&thr_bar->th_fixed_icvs,
1643 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1644 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1645 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1646 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1647 // leaves (on-core children) pull parent's fixed ICVs directly to local
1648 // ICV store
1649 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1650 &thr_bar->parent_bar->th_fixed_icvs);
1651 // non-leaves will get ICVs piggybacked with b_go via NGO store
1652 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1653 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1654 // access
1655 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1656 else // leaves copy parent's fixed ICVs directly to local ICV store
1657 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1658 &thr_bar->parent_bar->th_fixed_icvs);
1659 }
1660 }
1661#endif // KMP_BARRIER_ICV_PUSH
1662
1663 // Now, release my children
1664 if (thr_bar->my_level) { // not a leaf
1665 kmp_int32 child_tid;
1666 kmp_uint32 last;
1667 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1668 thr_bar->use_oncore_barrier) {
1669 if (KMP_MASTER_TID(tid)) { // do a flat release
1670 // Set local b_go to bump children via NGO store of the cache line
1671 // containing IVCs and b_go.
1672 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1673 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1674 // the cache line
1675 ngo_load(&thr_bar->th_fixed_icvs);
1676 // This loops over all the threads skipping only the leaf nodes in the
1677 // hierarchy
1678 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1679 child_tid += thr_bar->skip_per_level[1]) {
1680 kmp_bstate_t *child_bar =
1681 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1682 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1683 "releasing T#%d(%d:%d)"
1684 " go(%p): %u => %u\n",
1685 gtid, team->t.t_id, tid,
1686 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1687 child_tid, &child_bar->b_go, child_bar->b_go,
1688 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1689 // Use ngo store (if available) to both store ICVs and release child
1690 // via child's b_go
1691 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1692 }
1693 ngo_sync();
1694 }
1695 TCW_8(thr_bar->b_go,
1696 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1697 // Now, release leaf children
1698 if (thr_bar->leaf_kids) { // if there are any
1699 // We test team_change on the off-chance that the level 1 team changed.
1700 if (team_change ||
1701 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1702 if (old_leaf_kids) { // release old leaf kids
1703 thr_bar->b_go |= old_leaf_state;
1704 }
1705 // Release new leaf kids
1706 last = tid + thr_bar->skip_per_level[1];
1707 if (last > nproc)
1708 last = nproc;
1709 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1710 ++child_tid) { // skip_per_level[0]=1
1711 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713 KA_TRACE(
1714 20,
1715 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1716 " T#%d(%d:%d) go(%p): %u => %u\n",
1717 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1718 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1719 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720 // Release child using child's b_go flag
1721 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722 flag.release();
1723 }
1724 } else { // Release all children at once with leaf_state bits on my own
1725 // b_go flag
1726 thr_bar->b_go |= thr_bar->leaf_state;
1727 }
1728 }
1729 } else { // Blocktime is not infinite; do a simple hierarchical release
1730 for (int d = thr_bar->my_level - 1; d >= 0;
1731 --d) { // Release highest level threads first
1732 last = tid + thr_bar->skip_per_level[d + 1];
1733 kmp_uint32 skip = thr_bar->skip_per_level[d];
1734 if (last > nproc)
1735 last = nproc;
1736 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1737 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1738 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1739 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1740 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1741 gtid, team->t.t_id, tid,
1742 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1743 child_tid, &child_bar->b_go, child_bar->b_go,
1744 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1745 // Release child using child's b_go flag
1746 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1747 flag.release();
1748 }
1749 }
1750 }
1751#if KMP_BARRIER_ICV_PUSH
1752 if (propagate_icvs && !KMP_MASTER_TID(tid))
1753 // non-leaves copy ICVs from fixed ICVs to local dest
1754 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1755 &thr_bar->th_fixed_icvs);
1756#endif // KMP_BARRIER_ICV_PUSH
1757 }
1758 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1759 "barrier type %d\n",
1760 gtid, team->t.t_id, tid, bt));
1761}
1762
1763// End of Barrier Algorithms
1764
// Compile-time/run-time switch for the "was this barrier cancelled?" flag.
//  - is_cancellable<true>  carries a real bool that can be assigned and read.
//  - is_cancellable<false> carries no state: assignments are discarded and
//    the conversion to bool is a constant false.
template <bool cancellable> struct is_cancellable {};

// Cancellable flavour: a thin wrapper around a mutable bool.
template <> struct is_cancellable<true> {
  bool value = false; // starts out "not cancelled"
  is_cancellable() = default;
  is_cancellable(bool initial) : value(initial) {}
  is_cancellable &operator=(bool updated) {
    value = updated;
    return *this;
  }
  operator bool() const { return value; }
};

// Non-cancellable flavour: stateless, always reads as false.
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1783
// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
   barrier
   When cancellable = false,
     Returns 0 if primary thread, 1 if worker thread.
   When cancellable = true
     Returns 0 if not cancelled, 1 if cancelled.

   Parameters:
     bt          barrier type; selects the gather/release pattern and the
                 per-thread barrier state that is used.
     gtid        global thread id of the calling thread (index into
                 __kmp_threads).
     is_split    when non-zero, the primary thread (tid 0) skips the release
                 phase here; workers always run it.
     reduce_size not referenced by this function.
     reduce_data stored into th_local.reduce_data when reduce is non-NULL.
     reduce      reduction callback forwarded to the gather routine.

   Outline for a non-serialized team: gather, primary-thread bookkeeping
   (task team wait, cancellation reset, ITT reporting), then release.
   For a serialized team only the task-team handling is performed and 0 is
   returned (when cancellable is false). */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  // Real flag when cancellable == true, constant false otherwise.
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Captured once at entry and reused for the scope_end callbacks at exit.
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    // NOTE(review): barrier_kind is declared and assigned only under
    // OMPT_OPTIONAL, but is read here under plain OMPT_SUPPORT - confirm that
    // OMPT_SUPPORT without OMPT_OPTIONAL is not a supported configuration.
    auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
    switch (barrier_kind) {
    case ompt_sync_region_barrier_explicit:
      ompt_thr_info->state = ompt_state_wait_barrier_explicit;
      break;
    case ompt_sync_region_barrier_implicit_workshare:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
      break;
    case ompt_sync_region_barrier_implicit_parallel:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
      break;
    case ompt_sync_region_barrier_teams:
      ompt_thr_info->state = ompt_state_wait_barrier_teams;
      break;
    case ompt_sync_region_barrier_implementation:
      [[fallthrough]];
    default:
      ompt_thr_info->state = ompt_state_wait_barrier_implementation;
    }
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and waiting.
    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
      // Publish this thread's reduction buffer in its thread-local data.
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    // Only the primary thread sets up the task team, and only when tasking is
    // not in immediate-exec mode.
    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    // ---- Gather phase ----
    // The cancellable variant always uses the linear cancellable gather;
    // otherwise the algorithm is chosen by the configured gather pattern.
    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hyper_bar: {
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        // Any other pattern value falls back to the linear gather.
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    // ---- Between gather and release ----
    if (KMP_MASTER_TID(tid)) {
      status = 0;
      // Skip the task-team wait when the barrier was cancelled.
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads are arrived and starting leaving the
      // barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions).  */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          // Submit a frame from the last recorded frame time up to now, then
          // start the next frame at the current timestamp.
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            // Accumulate every worker's wait time (now minus arrival time).
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    // ---- Release phase ----
    // Workers (status == 1) always take part; the primary thread only does so
    // for a non-split barrier. Nothing is released if the barrier was
    // cancelled during the gather.
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_dist_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          // Any other pattern value falls back to the linear release.
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      // The release itself may report a cancellation, hence the re-check.
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the actual
       release time of the threads for split barriers.  */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
        // NOTE(review): itt_sync_obj is declared here under USE_ITT_NOTIFY but
        // is referenced below under USE_ITT_BUILD - confirm that
        // USE_ITT_BUILD without USE_ITT_NOTIFY is not a supported
        // configuration.
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        // A serialized team is only expected to have a task team here when
        // proxy tasks or hidden-helper tasks were encountered.
        KMP_DEBUG_ASSERT(
            this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
            this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    // Emit the scope_end callbacks in the reverse order of the scope_begin
    // callbacks issued at entry.
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  // Cancellable callers get the cancellation flag; everyone else gets the
  // primary(0)/worker(1) status.
  if (cancellable)
    return (int)cancelled;
  return status;
}
2119
2120// Returns 0 if primary thread, 1 if worker thread.
2121int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2122 size_t reduce_size, void *reduce_data,
2123 void (*reduce)(void *, void *)) {
2124 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2125 reduce);
2126}
2127
2128#if defined(KMP_GOMP_COMPAT)
2129// Returns 1 if cancelled, 0 otherwise
2130int __kmp_barrier_gomp_cancel(int gtid) {
2131 if (__kmp_omp_cancellation) {
2132 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2133 0, NULL, NULL);
2134 if (cancelled) {
2135 int tid = __kmp_tid_from_gtid(gtid);
2136 kmp_info_t *this_thr = __kmp_threads[gtid];
2137 if (KMP_MASTER_TID(tid)) {
2138 // Primary thread does not need to revert anything
2139 } else {
2140 // Workers need to revert their private b_arrived flag
2141 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2142 KMP_BARRIER_STATE_BUMP;
2143 }
2144 }
2145 return cancelled;
2146 }
2147 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2148 return FALSE;
2149}
2150#endif
2151
2152void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2153 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2154 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2155 KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2156 int tid = __kmp_tid_from_gtid(gtid);
2157 kmp_info_t *this_thr = __kmp_threads[gtid];
2158 kmp_team_t *team = this_thr->th.th_team;
2159
2160 if (!team->t.t_serialized) {
2161 if (KMP_MASTER_GTID(gtid)) {
2162 switch (__kmp_barrier_release_pattern[bt]) {
2163 case bp_dist_bar: {
2164 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2165 FALSE USE_ITT_BUILD_ARG(NULL));
2166 break;
2167 }
2168 case bp_hyper_bar: {
2169 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2170 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2171 FALSE USE_ITT_BUILD_ARG(NULL));
2172 break;
2173 }
2174 case bp_hierarchical_bar: {
2175 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2176 FALSE USE_ITT_BUILD_ARG(NULL));
2177 break;
2178 }
2179 case bp_tree_bar: {
2180 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2181 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2182 FALSE USE_ITT_BUILD_ARG(NULL));
2183 break;
2184 }
2185 default: {
2186 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2187 FALSE USE_ITT_BUILD_ARG(NULL));
2188 }
2189 }
2190 if (__kmp_tasking_mode != tskm_immediate_exec) {
2191 __kmp_task_team_sync(this_thr, team);
2192 } // if
2193 }
2194 }
2195}
2196
// Gather half of the fork/join barrier (bs_forkjoin_barrier) for the thread
// identified by gtid.
//
// Every thread runs the gather routine selected by
// __kmp_barrier_gather_pattern[bs_forkjoin_barrier]. Afterwards the primary
// thread (tid 0) waits on the task team, clears the team's display-affinity
// flag, optionally flags workers idle for stats, and reports ITT frame data.
// No release routine is called from this function.
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  int tid;
#ifdef KMP_DEBUG
  int team_id; // only referenced from trace macros below
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  // nproc is only needed by the ITT frame reporting and by debug
  // assertions/traces, so it is only declared in those configurations.
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  int nproc = this_thr->th.th_team_nproc;
#endif
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
  kmp_info_t *master_thread = this_thr->th.th_team_master;
  // Dump the runtime structures before the assertion below fires, so the
  // inconsistent state is visible.
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Only the primary thread reports a code pointer, and only when one of
    // the sync-region callbacks is registered.
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    // Default to an implicit-parallel barrier; switch to the teams variants
    // when the thread's parallel flags carry ompt_parallel_league.
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    // Workers keep a private copy of their current task data in
    // ompt_thread_info.
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    // NOTE(review): ompt_state is declared inside the OMPT_OPTIONAL region
    // above but read here under plain OMPT_SUPPORT - confirm that
    // OMPT_SUPPORT without OMPT_OPTIONAL is not a supported configuration.
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  // Gather phase: pick the algorithm configured for the fork/join barrier.
  // No reduction callback is passed (NULL).
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    // Any other pattern value falls back to the linear gather.
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs.  */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Clear the team's display-affinity flag (written only if it differs).
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have primary thread flag the workers to indicate they are now waiting for
    // next parallel region, Also wake them up so they switch their timers to
    // idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      // Only threads that have a sleep location set are resumed, and only
      // when blocktime is finite.
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          // Accumulate every worker's wait time (now minus arrival time).
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    // Workers only emit the ITT "middle" notification.
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

}
2434
2435// TODO release worker threads' fork barriers as we are ready instead of all at
2436// once
2437void __kmp_fork_barrier(int gtid, int tid) {
2438 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2439 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2440 kmp_info_t *this_thr = __kmp_threads[gtid];
2441 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2442#if USE_ITT_BUILD
2443 void *itt_sync_obj = NULL;
2444#endif /* USE_ITT_BUILD */
2445#ifdef KMP_DEBUG
2446 if (team)
2447 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2448 (team != NULL) ? team->t.t_id : -1, tid));
2449#endif
2450 // th_team pointer only valid for primary thread here
2451 if (KMP_MASTER_TID(tid)) {
2452#if USE_ITT_BUILD && USE_ITT_NOTIFY
2453 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2454 // Create itt barrier object
2455 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2456 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2457 }
2458#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2459
2460#ifdef KMP_DEBUG
2461 KMP_DEBUG_ASSERT(team);
2462 kmp_info_t **other_threads = team->t.t_threads;
2463 int i;
2464
2465 // Verify state
2466 KMP_MB();
2467
2468 for (i = 1; i < team->t.t_nproc; ++i) {
2469 KA_TRACE(500,
2470 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2471 "== %u.\n",
2472 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2473 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2474 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2475 KMP_DEBUG_ASSERT(
2476 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2477 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2478 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2479 }
2480#endif
2481
2482 if (__kmp_tasking_mode != tskm_immediate_exec)
2483 __kmp_task_team_setup(this_thr, team);
2484
2485 /* The primary thread may have changed its blocktime between join barrier
2486 and fork barrier. Copy the blocktime info to the thread, where
2487 __kmp_wait_template() can access it when the team struct is not
2488 guaranteed to exist. */
2489 // See note about the corresponding code in __kmp_join_barrier() being
2490 // performance-critical
2491 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2492#if KMP_USE_MONITOR
2493 this_thr->th.th_team_bt_intervals =
2494 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2495 this_thr->th.th_team_bt_set =
2496 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2497#else
2498 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2499#endif
2500 }
2501 } // primary thread
2502
2503 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2504 case bp_dist_bar: {
2505 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2506 TRUE USE_ITT_BUILD_ARG(NULL));
2507 break;
2508 }
2509 case bp_hyper_bar: {
2510 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2511 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2512 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2513 break;
2514 }
2515 case bp_hierarchical_bar: {
2516 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2517 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2518 break;
2519 }
2520 case bp_tree_bar: {
2521 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2522 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2523 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2524 break;
2525 }
2526 default: {
2527 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2528 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2529 }
2530 }
2531
2532#if OMPT_SUPPORT
2533 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2534 if (ompt_enabled.enabled &&
2535 (ompt_state == ompt_state_wait_barrier_teams ||
2536 ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2537 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2538 ompt_data_t *task_data = (team)
2539 ? OMPT_CUR_TASK_DATA(this_thr)
2540 : &(this_thr->th.ompt_thread_info.task_data);
2541 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2542#if OMPT_OPTIONAL
2543 void *codeptr = NULL;
2544 if (KMP_MASTER_TID(ds_tid) &&
2545 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2546 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2547 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2548 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2549 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2550 sync_kind = ompt_sync_region_barrier_teams;
2551 if (ompt_enabled.ompt_callback_sync_region_wait) {
2552 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2553 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2554 }
2555 if (ompt_enabled.ompt_callback_sync_region) {
2556 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2557 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2558 }
2559#endif
2560 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2561 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2562 ompt_scope_end, NULL, task_data, 0, ds_tid,
2563 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2564 }
2565 }
2566#endif
2567
2568 // Early exit for reaping threads releasing forkjoin barrier
2569 if (TCR_4(__kmp_global.g.g_done)) {
2570 this_thr->th.th_task_team = NULL;
2571
2572#if USE_ITT_BUILD && USE_ITT_NOTIFY
2573 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2574 if (!KMP_MASTER_TID(tid)) {
2575 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2576 if (itt_sync_obj)
2577 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2578 }
2579 }
2580#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2581 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2582 return;
2583 }
2584
2585 /* We can now assume that a valid team structure has been allocated by the
2586 primary thread and propagated to all worker threads. The current thread,
2587 however, may not be part of the team, so we can't blindly assume that the
2588 team pointer is non-null. */
2589 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2590 KMP_DEBUG_ASSERT(team != NULL);
2591 tid = __kmp_tid_from_gtid(gtid);
2592
2593#if KMP_BARRIER_ICV_PULL
2594 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2595 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2596 implicit task has this data before this function is called. We cannot
2597 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2598 thread struct, because it is not always the case that the threads arrays
2599 have been allocated when __kmp_fork_call() is executed. */
2600 {
2601 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2602 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2603 // Copy the initial ICVs from the primary thread's thread struct to the
2604 // implicit task for this tid.
2605 KA_TRACE(10,
2606 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2607 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2608 tid, FALSE);
2609 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2610 &team->t.t_threads[0]
2611 ->th.th_bar[bs_forkjoin_barrier]
2612 .bb.th_fixed_icvs);
2613 }
2614 }
2615#endif // KMP_BARRIER_ICV_PULL
2616
2617 if (__kmp_tasking_mode != tskm_immediate_exec) {
2618 __kmp_task_team_sync(this_thr, team);
2619 }
2620
2621#if KMP_AFFINITY_SUPPORTED
2622 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2623 if (proc_bind == proc_bind_intel) {
2624 // Call dynamic affinity settings
2625 if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2626 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2627 }
2628 } else if (proc_bind != proc_bind_false) {
2629 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2630 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2631 __kmp_gtid_from_thread(this_thr),
2632 this_thr->th.th_current_place));
2633 } else {
2634 __kmp_affinity_bind_place(gtid);
2635 }
2636 }
2637#endif // KMP_AFFINITY_SUPPORTED
2638 // Perform the display affinity functionality
2639 if (__kmp_display_affinity) {
2640 if (team->t.t_display_affinity
2641#if KMP_AFFINITY_SUPPORTED
2642 || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2643#endif
2644 ) {
2645 // NULL means use the affinity-format-var ICV
2646 __kmp_aux_display_affinity(gtid, NULL);
2647 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2648 this_thr->th.th_prev_level = team->t.t_level;
2649 }
2650 }
2651 if (!KMP_MASTER_TID(tid))
2652 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2653
2654#if USE_ITT_BUILD && USE_ITT_NOTIFY
2655 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2656 if (!KMP_MASTER_TID(tid)) {
2657 // Get correct barrier object
2658 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2659 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2660 } // (prepare called inside barrier_release)
2661 }
2662#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2663 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2664 team->t.t_id, tid));
2665}
2666
2667void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2668 kmp_internal_control_t *new_icvs, ident_t *loc) {
2669 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2670
2671 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2672 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2673
2674/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2675 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2676 implicit task has this data before this function is called. */
2677#if KMP_BARRIER_ICV_PULL
2678 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2679 remains untouched), where all of the worker threads can access them and
2680 make their own copies after the barrier. */
2681 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2682 // allocated at this point
2683 copy_icvs(
2684 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2685 new_icvs);
2686 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2687 team->t.t_threads[0], team));
2688#elif KMP_BARRIER_ICV_PUSH
2689 // The ICVs will be propagated in the fork barrier, so nothing needs to be
2690 // done here.
2691 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2692 team->t.t_threads[0], team));
2693#else
2694 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2695 // time.
2696 ngo_load(new_icvs);
2697 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2698 // allocated at this point
2699 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2700 // TODO: GEH - pass in better source location info since usually NULL here
2701 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2702 f, team->t.t_threads[f], team));
2703 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2704 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2705 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2706 f, team->t.t_threads[f], team));
2707 }
2708 ngo_sync();
2709#endif // KMP_BARRIER_ICV_PULL
2710}
Definition kmp.h:227