LLVM OpenMP* Runtime Library
Loading...
Searching...
No Matches
kmp_runtime.cpp
1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include "kmp_utils.h"
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#ifndef KMP_USE_SHM
52// Windows and WASI do not need these include files as they don't use shared
53// memory.
54#else
55#include <sys/mman.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#define SHM_SIZE 1024
59#endif
60
61#if defined(KMP_GOMP_COMPAT)
62char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64#endif /* defined(KMP_GOMP_COMPAT) */
65
66char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69#ifdef KMP_DEBUG
70char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72#endif /* KMP_DEBUG */
73
74#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76/* ------------------------------------------------------------------------ */
77
78#if KMP_USE_MONITOR
79kmp_info_t __kmp_monitor;
80#endif
81
82/* Forward declarations */
83
84void __kmp_cleanup(void);
85
86static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
91#if KMP_AFFINITY_SUPPORTED
92static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94#endif
95static void __kmp_do_serial_initialize(void);
96void __kmp_fork_barrier(int gtid, int tid);
97void __kmp_join_barrier(int gtid);
98void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
101#ifdef USE_LOAD_BALANCE
102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103#endif
104
105static int __kmp_expand_threads(int nNeed);
106#if KMP_OS_WINDOWS
107static int __kmp_unregister_root_other_thread(int gtid);
108#endif
109static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
116static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117 int level) {
118 kmp_nested_nthreads_t *new_nested_nth =
119 (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120 sizeof(kmp_nested_nthreads_t));
121 int new_size = level + thr->th.th_set_nested_nth_sz;
122 new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123 for (int i = 0; i < level + 1; ++i)
124 new_nested_nth->nth[i] = 0;
125 for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126 new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127 new_nested_nth->size = new_nested_nth->used = new_size;
128 return new_nested_nth;
129}
130
131/* Calculate the identifier of the current thread */
132/* fast (and somewhat portable) way to get unique identifier of executing
133 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134int __kmp_get_global_thread_id() {
135 int i;
136 kmp_info_t **other_threads;
137 size_t stack_data;
138 char *stack_addr;
139 size_t stack_size;
140 char *stack_base;
141
142 KA_TRACE(
143 1000,
144 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145 __kmp_nth, __kmp_all_nth));
146
147 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
148 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
149 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
150 __kmp_init_gtid for this to work. */
151
152 if (!TCR_4(__kmp_init_gtid))
153 return KMP_GTID_DNE;
154
155#ifdef KMP_TDATA_GTID
156 if (TCR_4(__kmp_gtid_mode) >= 3) {
157 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158 return __kmp_gtid;
159 }
160#endif
161 if (TCR_4(__kmp_gtid_mode) >= 2) {
162 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163 return __kmp_gtid_get_specific();
164 }
165 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166
167 stack_addr = (char *)&stack_data;
168 other_threads = __kmp_threads;
169
170 /* ATT: The code below is a source of potential bugs due to unsynchronized
171 access to __kmp_threads array. For example:
172 1. Current thread loads other_threads[i] to thr and checks it, it is
173 non-NULL.
174 2. Current thread is suspended by OS.
175 3. Another thread unregisters and finishes (debug versions of free()
176 may fill memory with something like 0xEF).
177 4. Current thread is resumed.
178 5. Current thread reads junk from *thr.
179 TODO: Fix it. --ln */
180
181 for (i = 0; i < __kmp_threads_capacity; i++) {
182
183 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184 if (!thr)
185 continue;
186
187 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189
190 /* stack grows down -- search through all of the active threads */
191
192 if (stack_addr <= stack_base) {
193 size_t stack_diff = stack_base - stack_addr;
194
195 if (stack_diff <= stack_size) {
196 /* The only way we can be closer than the allocated */
197 /* stack size is if we are running on this thread. */
198 // __kmp_gtid_get_specific can return negative value because this
199 // function can be called by thread destructor. However, before the
200 // thread destructor is called, the value of the corresponding
201 // thread-specific data will be reset to NULL.
202 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203 __kmp_gtid_get_specific() == i);
204 return i;
205 }
206 }
207 }
208
209 /* get specific to try and determine our gtid */
210 KA_TRACE(1000,
211 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212 "thread, using TLS\n"));
213 i = __kmp_gtid_get_specific();
214
215 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216
217 /* if we havn't been assigned a gtid, then return code */
218 if (i < 0)
219 return i;
220
221 // other_threads[i] can be nullptr at this point because the corresponding
222 // thread could have already been destructed. It can happen when this function
223 // is called in end library routine.
224 if (!TCR_SYNC_PTR(other_threads[i]))
225 return i;
226
227 /* dynamically updated stack window for uber threads to avoid get_specific
228 call */
229 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230 KMP_FATAL(StackOverflow, i);
231 }
232
233 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234 if (stack_addr > stack_base) {
235 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238 stack_base);
239 } else {
240 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241 stack_base - stack_addr);
242 }
243
244 /* Reprint stack bounds for ubermaster since they have been refined */
245 if (__kmp_storage_map) {
246 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249 other_threads[i]->th.th_info.ds.ds_stacksize,
250 "th_%d stack (refinement)", i);
251 }
252 return i;
253}
254
255int __kmp_get_global_thread_id_reg() {
256 int gtid;
257
258 if (!__kmp_init_serial) {
259 gtid = KMP_GTID_DNE;
260 } else
261#ifdef KMP_TDATA_GTID
262 if (TCR_4(__kmp_gtid_mode) >= 3) {
263 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264 gtid = __kmp_gtid;
265 } else
266#endif
267 if (TCR_4(__kmp_gtid_mode) >= 2) {
268 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269 gtid = __kmp_gtid_get_specific();
270 } else {
271 KA_TRACE(1000,
272 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273 gtid = __kmp_get_global_thread_id();
274 }
275
276 /* we must be a new uber master sibling thread */
277 if (gtid == KMP_GTID_DNE) {
278 KA_TRACE(10,
279 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280 "Registering a new gtid.\n"));
281 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282 if (!__kmp_init_serial) {
283 __kmp_do_serial_initialize();
284 gtid = __kmp_gtid_get_specific();
285 } else {
286 gtid = __kmp_register_root(FALSE);
287 }
288 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290 }
291
292 KMP_DEBUG_ASSERT(gtid >= 0);
293
294 return gtid;
295}
296
297/* caller must hold forkjoin_lock */
298void __kmp_check_stack_overlap(kmp_info_t *th) {
299 int f;
300 char *stack_beg = NULL;
301 char *stack_end = NULL;
302 int gtid;
303
304 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305 if (__kmp_storage_map) {
306 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308
309 gtid = __kmp_gtid_from_thread(th);
310
311 if (gtid == KMP_GTID_MONITOR) {
312 __kmp_print_storage_map_gtid(
313 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314 "th_%s stack (%s)", "mon",
315 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316 } else {
317 __kmp_print_storage_map_gtid(
318 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319 "th_%d stack (%s)", gtid,
320 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321 }
322 }
323
324 /* No point in checking ubermaster threads since they use refinement and
325 * cannot overlap */
326 gtid = __kmp_gtid_from_thread(th);
327 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328 KA_TRACE(10,
329 ("__kmp_check_stack_overlap: performing extensive checking\n"));
330 if (stack_beg == NULL) {
331 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333 }
334
335 for (f = 0; f < __kmp_threads_capacity; f++) {
336 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337
338 if (f_th && f_th != th) {
339 char *other_stack_end =
340 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341 char *other_stack_beg =
342 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345
346 /* Print the other stack values before the abort */
347 if (__kmp_storage_map)
348 __kmp_print_storage_map_gtid(
349 -1, other_stack_beg, other_stack_end,
350 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352
353 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354 __kmp_msg_null);
355 }
356 }
357 }
358 }
359 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360}
361
362/* ------------------------------------------------------------------------ */
363
364void __kmp_infinite_loop(void) {
365 static int done = FALSE;
366
367 while (!done) {
368 KMP_YIELD(TRUE);
369 }
370}
371
372#define MAX_MESSAGE 512
373
374void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375 char const *format, ...) {
376 char buffer[MAX_MESSAGE];
377 va_list ap;
378
379 va_start(ap, format);
380 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381 p2, (unsigned long)size, format);
382 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383 __kmp_vprintf(kmp_err, buffer, ap);
384#if KMP_PRINT_DATA_PLACEMENT
385 int node;
386 if (gtid >= 0) {
387 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388 if (__kmp_storage_map_verbose) {
389 node = __kmp_get_host_node(p1);
390 if (node < 0) /* doesn't work, so don't try this next time */
391 __kmp_storage_map_verbose = FALSE;
392 else {
393 char *last;
394 int lastNode;
395 int localProc = __kmp_get_cpu_from_gtid(gtid);
396
397 const int page_size = KMP_GET_PAGE_SIZE();
398
399 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401 if (localProc >= 0)
402 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403 localProc >> 1);
404 else
405 __kmp_printf_no_lock(" GTID %d\n", gtid);
406#if KMP_USE_PRCTL
407 /* The more elaborate format is disabled for now because of the prctl
408 * hanging bug. */
409 do {
410 last = p1;
411 lastNode = node;
412 /* This loop collates adjacent pages with the same host node. */
413 do {
414 (char *)p1 += page_size;
415 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417 lastNode);
418 } while (p1 <= p2);
419#else
420 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421 (char *)p1 + (page_size - 1),
422 __kmp_get_host_node(p1));
423 if (p1 < p2) {
424 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425 (char *)p2 + (page_size - 1),
426 __kmp_get_host_node(p2));
427 }
428#endif
429 }
430 }
431 } else
432 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433 }
434#endif /* KMP_PRINT_DATA_PLACEMENT */
435 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436
437 va_end(ap);
438}
439
440void __kmp_warn(char const *format, ...) {
441 char buffer[MAX_MESSAGE];
442 va_list ap;
443
444 if (__kmp_generate_warnings == kmp_warnings_off) {
445 return;
446 }
447
448 va_start(ap, format);
449
450 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452 __kmp_vprintf(kmp_err, buffer, ap);
453 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454
455 va_end(ap);
456}
457
458void __kmp_abort_process() {
459 // Later threads may stall here, but that's ok because abort() will kill them.
460 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461
462 if (__kmp_debug_buf) {
463 __kmp_dump_debug_buffer();
464 }
465
466#if KMP_OS_WINDOWS
467 // Let other threads know of abnormal termination and prevent deadlock
468 // if abort happened during library initialization or shutdown
469 __kmp_global.g.g_abort = SIGABRT;
470
471 /* On Windows* OS by default abort() causes pop-up error box, which stalls
472 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
473 boxes. _set_abort_behavior() works well, but this function is not
474 available in VS7 (this is not problem for DLL, but it is a problem for
475 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
476 help, at least in some versions of MS C RTL.
477
478 It seems following sequence is the only way to simulate abort() and
479 avoid pop-up error box. */
480 raise(SIGABRT);
481 _exit(3); // Just in case, if signal ignored, exit anyway.
482#else
483 __kmp_unregister_library();
484 abort();
485#endif
486
487 __kmp_infinite_loop();
488 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489
490} // __kmp_abort_process
491
492void __kmp_abort_thread(void) {
493 // TODO: Eliminate g_abort global variable and this function.
494 // In case of abort just call abort(), it will kill all the threads.
495 __kmp_infinite_loop();
496} // __kmp_abort_thread
497
498/* Print out the storage map for the major kmp_info_t thread data structures
499 that are allocated together. */
500
501static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503 gtid);
504
505 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507
508 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509 sizeof(kmp_local_t), "th_%d.th_local", gtid);
510
511 __kmp_print_storage_map_gtid(
512 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514
515 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516 &thr->th.th_bar[bs_plain_barrier + 1],
517 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518 gtid);
519
520 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521 &thr->th.th_bar[bs_forkjoin_barrier + 1],
522 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523 gtid);
524
525#if KMP_FAST_REDUCTION_BARRIER
526 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527 &thr->th.th_bar[bs_reduction_barrier + 1],
528 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529 gtid);
530#endif // KMP_FAST_REDUCTION_BARRIER
531}
532
533/* Print out the storage map for the major kmp_team_t team data structures
534 that are allocated together. */
535
536static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537 int team_id, int num_thr) {
538 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540 header, team_id);
541
542 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543 &team->t.t_bar[bs_last_barrier],
544 sizeof(kmp_balign_team_t) * bs_last_barrier,
545 "%s_%d.t_bar", header, team_id);
546
547 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548 &team->t.t_bar[bs_plain_barrier + 1],
549 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550 header, team_id);
551
552 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553 &team->t.t_bar[bs_forkjoin_barrier + 1],
554 sizeof(kmp_balign_team_t),
555 "%s_%d.t_bar[forkjoin]", header, team_id);
556
557#if KMP_FAST_REDUCTION_BARRIER
558 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559 &team->t.t_bar[bs_reduction_barrier + 1],
560 sizeof(kmp_balign_team_t),
561 "%s_%d.t_bar[reduction]", header, team_id);
562#endif // KMP_FAST_REDUCTION_BARRIER
563
564 __kmp_print_storage_map_gtid(
565 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567
568 __kmp_print_storage_map_gtid(
569 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571
572 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573 &team->t.t_disp_buffer[num_disp_buff],
574 sizeof(dispatch_shared_info_t) * num_disp_buff,
575 "%s_%d.t_disp_buffer", header, team_id);
576}
577
578static void __kmp_init_allocator() {
579 __kmp_init_memkind();
580 __kmp_init_target_mem();
581}
582static void __kmp_fini_allocator() {
583 __kmp_fini_target_mem();
584 __kmp_fini_memkind();
585}
586
587/* ------------------------------------------------------------------------ */
588
#if ENABLE_LIBOMPTARGET
/* Initialization hook for ENABLE_LIBOMPTARGET builds; it only forwards to
   __kmp_init_target_task(). */
static void __kmp_init_omptarget() { __kmp_init_target_task(); }
#endif
594
595/* ------------------------------------------------------------------------ */
596
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

/* DLL entry point for the dynamic Windows build.

   Process/thread attach notifications are only traced. A thread detach ends
   the runtime state of the detaching thread. A process detach shuts the
   library down only when lpReserved is NULL. Always returns TRUE. */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
    break;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // Per the Windows* documentation of the DllMain entry point, for
    // DLL_PROCESS_DETACH the lpReserved argument distinguishes two cases:
    //   lpReserved == NULL: FreeLibrary() was called,
    //   lpReserved != NULL: the process is terminating.
    // After FreeLibrary() the worker threads are still alive, so the
    // runtime's state is consistent and a proper shutdown is fine.
    // On process termination the worker threads have already exited or were
    // forcefully terminated by the OS and only the shutdown thread remains,
    // which can leave the runtime in an inconsistent state.
    // Hence a proper cleanup is attempted only for FreeLibrary(); otherwise
    // the OS is relied upon to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());
    break;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
    /* To register new siblings all the time, call __kmp_get_gtid() here. */
    break;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    break;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */
648
649/* __kmp_parallel_deo -- Wait until it's our turn. */
650void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
651 int gtid = *gtid_ref;
652#ifdef BUILD_PARALLEL_ORDERED
653 kmp_team_t *team = __kmp_team_from_gtid(gtid);
654#endif /* BUILD_PARALLEL_ORDERED */
655
656 if (__kmp_env_consistency_check) {
657 if (__kmp_threads[gtid]->th.th_root->r.r_active)
658#if KMP_USE_DYNAMIC_LOCK
659 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
660#else
661 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
662#endif
663 }
664#ifdef BUILD_PARALLEL_ORDERED
665 if (!team->t.t_serialized) {
666 KMP_MB();
667 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
668 NULL);
669 KMP_MB();
670 }
671#endif /* BUILD_PARALLEL_ORDERED */
672}
673
674/* __kmp_parallel_dxo -- Signal the next task. */
675void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
676 int gtid = *gtid_ref;
677#ifdef BUILD_PARALLEL_ORDERED
678 int tid = __kmp_tid_from_gtid(gtid);
679 kmp_team_t *team = __kmp_team_from_gtid(gtid);
680#endif /* BUILD_PARALLEL_ORDERED */
681
682 if (__kmp_env_consistency_check) {
683 if (__kmp_threads[gtid]->th.th_root->r.r_active)
684 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
685 }
686#ifdef BUILD_PARALLEL_ORDERED
687 if (!team->t.t_serialized) {
688 KMP_MB(); /* Flush all pending memory write invalidates. */
689
690 /* use the tid of the next thread in this team */
691 /* TODO replace with general release procedure */
692 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
693
694 KMP_MB(); /* Flush all pending memory write invalidates. */
695 }
696#endif /* BUILD_PARALLEL_ORDERED */
697}
698
699/* ------------------------------------------------------------------------ */
700/* The BARRIER for a SINGLE process section is always explicit */
701
702int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
703 int status;
704 kmp_info_t *th;
705 kmp_team_t *team;
706
707 if (!TCR_4(__kmp_init_parallel))
708 __kmp_parallel_initialize();
709 __kmp_resume_if_soft_paused();
710
711 th = __kmp_threads[gtid];
712 team = th->th.th_team;
713 status = 0;
714
715 th->th.th_ident = id_ref;
716
717 if (team->t.t_serialized) {
718 status = 1;
719 } else {
720 kmp_int32 old_this = th->th.th_local.this_construct;
721
722 ++th->th.th_local.this_construct;
723 /* try to set team count to thread count--success means thread got the
724 single block */
725 /* TODO: Should this be acquire or release? */
726 if (team->t.t_construct == old_this) {
727 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
728 th->th.th_local.this_construct);
729 }
730#if USE_ITT_BUILD
731 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
732 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
733 team->t.t_active_level == 1) {
734 // Only report metadata by primary thread of active team at level 1
735 __kmp_itt_metadata_single(id_ref);
736 }
737#endif /* USE_ITT_BUILD */
738 }
739
740 if (__kmp_env_consistency_check) {
741 if (status && push_ws) {
742 __kmp_push_workshare(gtid, ct_psingle, id_ref);
743 } else {
744 __kmp_check_workshare(gtid, ct_psingle, id_ref);
745 }
746 }
747#if USE_ITT_BUILD
748 if (status) {
749 __kmp_itt_single_start(gtid);
750 }
751#endif /* USE_ITT_BUILD */
752 return status;
753}
754
755void __kmp_exit_single(int gtid) {
756#if USE_ITT_BUILD
757 __kmp_itt_single_end(gtid);
758#endif /* USE_ITT_BUILD */
759 if (__kmp_env_consistency_check)
760 __kmp_pop_workshare(gtid, ct_psingle, NULL);
761}
762
763/* determine if we can go parallel or must use a serialized parallel region and
764 * how many threads we can use
765 * set_nproc is the number of threads requested for the team
766 * returns 0 if we should serialize or only use one thread,
767 * otherwise the number of threads to use
768 * The forkjoin lock is held by the caller. */
769static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
770 int master_tid, int set_nthreads,
771 int enter_teams) {
772 int capacity;
773 int new_nthreads;
774 KMP_DEBUG_ASSERT(__kmp_init_serial);
775 KMP_DEBUG_ASSERT(root && parent_team);
776 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
777
778 // If dyn-var is set, dynamically adjust the number of desired threads,
779 // according to the method specified by dynamic_mode.
780 new_nthreads = set_nthreads;
781 if (!get__dynamic_2(parent_team, master_tid)) {
782 ;
783 }
784#ifdef USE_LOAD_BALANCE
785 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
786 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
787 if (new_nthreads == 1) {
788 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
789 "reservation to 1 thread\n",
790 master_tid));
791 return 1;
792 }
793 if (new_nthreads < set_nthreads) {
794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795 "reservation to %d threads\n",
796 master_tid, new_nthreads));
797 }
798 }
799#endif /* USE_LOAD_BALANCE */
800 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
801 new_nthreads = __kmp_avail_proc - __kmp_nth +
802 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803 if (new_nthreads <= 1) {
804 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
805 "reservation to 1 thread\n",
806 master_tid));
807 return 1;
808 }
809 if (new_nthreads < set_nthreads) {
810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811 "reservation to %d threads\n",
812 master_tid, new_nthreads));
813 } else {
814 new_nthreads = set_nthreads;
815 }
816 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
817 if (set_nthreads > 2) {
818 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
819 new_nthreads = (new_nthreads % set_nthreads) + 1;
820 if (new_nthreads == 1) {
821 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
822 "reservation to 1 thread\n",
823 master_tid));
824 return 1;
825 }
826 if (new_nthreads < set_nthreads) {
827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828 "reservation to %d threads\n",
829 master_tid, new_nthreads));
830 }
831 }
832 } else {
833 KMP_ASSERT(0);
834 }
835
836 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
837 if (__kmp_nth + new_nthreads -
838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839 __kmp_max_nth) {
840 int tl_nthreads = __kmp_max_nth - __kmp_nth +
841 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842 if (tl_nthreads <= 0) {
843 tl_nthreads = 1;
844 }
845
846 // If dyn-var is false, emit a 1-time warning.
847 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848 __kmp_reserve_warn = 1;
849 __kmp_msg(kmp_ms_warning,
850 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852 }
853 if (tl_nthreads == 1) {
854 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
855 "reduced reservation to 1 thread\n",
856 master_tid));
857 return 1;
858 }
859 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
860 "reservation to %d threads\n",
861 master_tid, tl_nthreads));
862 new_nthreads = tl_nthreads;
863 }
864
865 // Respect OMP_THREAD_LIMIT
866 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
867 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
868 if (cg_nthreads + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870 max_cg_threads) {
871 int tl_nthreads = max_cg_threads - cg_nthreads +
872 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
873 if (tl_nthreads <= 0) {
874 tl_nthreads = 1;
875 }
876
877 // If dyn-var is false, emit a 1-time warning.
878 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879 __kmp_reserve_warn = 1;
880 __kmp_msg(kmp_ms_warning,
881 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
882 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
883 }
884 if (tl_nthreads == 1) {
885 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
886 "reduced reservation to 1 thread\n",
887 master_tid));
888 return 1;
889 }
890 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
891 "reservation to %d threads\n",
892 master_tid, tl_nthreads));
893 new_nthreads = tl_nthreads;
894 }
895
896 // Check if the threads array is large enough, or needs expanding.
897 // See comment in __kmp_register_root() about the adjustment if
898 // __kmp_threads[0] == NULL.
899 capacity = __kmp_threads_capacity;
900 if (TCR_PTR(__kmp_threads[0]) == NULL) {
901 --capacity;
902 }
903 // If it is not for initializing the hidden helper team, we need to take
904 // __kmp_hidden_helper_threads_num out of the capacity because it is included
905 // in __kmp_threads_capacity.
906 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
907 capacity -= __kmp_hidden_helper_threads_num;
908 }
909 if (__kmp_nth + new_nthreads -
910 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911 capacity) {
912 // Expand the threads array.
913 int slotsRequired = __kmp_nth + new_nthreads -
914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915 capacity;
916 int slotsAdded = __kmp_expand_threads(slotsRequired);
917 if (slotsAdded < slotsRequired) {
918 // The threads array was not expanded enough.
919 new_nthreads -= (slotsRequired - slotsAdded);
920 KMP_ASSERT(new_nthreads >= 1);
921
922 // If dyn-var is false, emit a 1-time warning.
923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924 __kmp_reserve_warn = 1;
925 if (__kmp_tp_cached) {
926 __kmp_msg(kmp_ms_warning,
927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930 } else {
931 __kmp_msg(kmp_ms_warning,
932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934 }
935 }
936 }
937 }
938
939#ifdef KMP_DEBUG
940 if (new_nthreads == 1) {
941 KC_TRACE(10,
942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943 "dead roots and rechecking; requested %d threads\n",
944 __kmp_get_gtid(), set_nthreads));
945 } else {
946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947 " %d threads\n",
948 __kmp_get_gtid(), new_nthreads, set_nthreads));
949 }
950#endif // KMP_DEBUG
951
952 if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
953 __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
954 this_thr->th.th_nt_msg);
955 }
956 return new_nthreads;
957}
958
959/* Allocate threads from the thread pool and assign them to the new team. We are
960 assured that there are enough threads available, because we checked on that
961 earlier within critical section forkjoin */
// Visible steps, in order:
//   1. Point the primary thread (master_th) at `team` as tid 0.
//   2. Decide whether `team` is the already-populated hot team for its nesting
//      level (use_hot_team); if no hot team is recorded for that level yet,
//      record `team` as the hot team.
//   3. If the team is not a ready hot team: install the primary thread, obtain
//      each worker from __kmp_allocate_thread(), copy the teams-construct
//      fields and the team's barrier arrival counters into each worker.
//   4. Save the primary thread's task state on the team and reset/restore it.
//   5. Flag the team for affinity display when any member's previous
//      nthreads/level differs from the team's.
// Parameters:
//   root               - passed through to __kmp_allocate_thread().
//   team               - team being populated; team->t.t_nproc is the size.
//   master_th          - descriptor of the forking (primary) thread.
//   master_gtid        - gtid of master_th; debug-asserted to equal
//                        __kmp_get_gtid().
//   fork_teams_workers - when non-zero, __kmp_partition_places() is skipped.
962static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
963 kmp_info_t *master_th, int master_gtid,
964 int fork_teams_workers) {
965 int i;
966 int use_hot_team;
967
968 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
969 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
970 KMP_MB();
971
972 /* first, let's setup the primary thread */
973 master_th->th.th_info.ds.ds_tid = 0;
974 master_th->th.th_team = team;
975 master_th->th.th_team_nproc = team->t.t_nproc;
976 master_th->th.th_team_master = master_th;
977 master_th->th.th_team_serialized = FALSE;
978 master_th->th.th_dispatch = &team->t.t_dispatch[0];
979
980 /* make sure we are not the optimized hot team */
981 use_hot_team = 0;
982 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
983 if (hot_teams) { // hot teams array is not allocated if
984 // KMP_HOT_TEAMS_MAX_LEVEL=0
985 int level = team->t.t_active_level - 1; // index in array of hot teams
986 if (master_th->th.th_teams_microtask) { // are we inside the teams?
987 if (master_th->th.th_teams_size.nteams > 1) {
988 ++level; // level was not increased in teams construct for
989 // team_of_masters
990 }
991 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
992 master_th->th.th_teams_level == team->t.t_level) {
993 ++level; // level was not increased in teams construct for
994 // team_of_workers before the parallel
995 } // team->t.t_level will be increased inside parallel
996 }
// Only levels below __kmp_hot_teams_max_level have a hot_teams[] slot.
997 if (level < __kmp_hot_teams_max_level) {
998 if (hot_teams[level].hot_team) {
999 // hot team has already been allocated for given level
1000 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1001 use_hot_team = 1; // the team is ready to use
1002 } else {
1003 use_hot_team = 0; // AC: threads are not allocated yet
1004 hot_teams[level].hot_team = team; // remember new hot team
1005 hot_teams[level].hot_team_nth = team->t.t_nproc;
1006 }
1007 } else {
1008 use_hot_team = 0;
1009 }
1010 }
// Populate the team from scratch unless it is a ready-to-use hot team.
1011 if (!use_hot_team) {
1012
1013 /* install the primary thread */
1014 team->t.t_threads[0] = master_th;
1015 __kmp_initialize_info(master_th, team, 0, master_gtid);
1016
1017 /* now, install the worker threads */
1018 for (i = 1; i < team->t.t_nproc; i++) {
1019
1020 /* fork or reallocate a new thread and install it in team */
1021 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1022 team->t.t_threads[i] = thr;
1023 KMP_DEBUG_ASSERT(thr);
1024 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1025 /* align team and thread arrived states */
1026 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1027 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1028 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1029 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1030 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1031 team->t.t_bar[bs_plain_barrier].b_arrived));
// Workers inherit the primary thread's teams-construct bookkeeping.
1032 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1033 thr->th.th_teams_level = master_th->th.th_teams_level;
1034 thr->th.th_teams_size = master_th->th.th_teams_size;
1035 { // Initialize threads' barrier data.
1036 int b;
1037 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1038 for (b = 0; b < bs_last_barrier; ++b) {
1039 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1040 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1041#if USE_DEBUGGER
1042 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1043#endif
1044 }
1045 }
1046 }
1047
1048#if KMP_AFFINITY_SUPPORTED
1049 // Do not partition the places list for teams construct workers who
1050 // haven't actually been forked to do real work yet. This partitioning
1051 // will take place in the parallel region nested within the teams construct.
1052 if (!fork_teams_workers) {
1053 __kmp_partition_places(team);
1054 }
1055#endif
1056
// With the distributed fork/join barrier pattern and more than one thread,
// the barrier object is told the new thread count and
// __kmp_add_threads_to_team() is called for the whole team.
1057 if (team->t.t_nproc > 1 &&
1058 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1059 team->t.b->update_num_threads(team->t.t_nproc);
1060 __kmp_add_threads_to_team(team, team->t.t_nproc);
1061 }
1062 }
1063
1064 // Take care of primary thread's task state
1065 if (__kmp_tasking_mode != tskm_immediate_exec) {
1066 if (use_hot_team) {
1067 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1068 KA_TRACE(
1069 20,
1070 ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1071 "%p, new task_team %p / team %p\n",
1072 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1073 team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1074 team));
1075
1076 // Store primary thread's current task state on new team
1077 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1078 master_th->th.th_task_state);
1079
1080 // Restore primary thread's task state to hot team's state
1081 // by using thread 1's task state
1082 if (team->t.t_nproc > 1) {
1083 KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1084 team->t.t_threads[1]->th.th_task_state == 1);
1085 KMP_CHECK_UPDATE(master_th->th.th_task_state,
1086 team->t.t_threads[1]->th.th_task_state);
1087 } else {
// Single-thread hot team: there is no thread 1 to copy from.
1088 master_th->th.th_task_state = 0;
1089 }
1090 } else {
1091 // Store primary thread's current task_state on new team
1092 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1093 master_th->th.th_task_state);
1094 // Are not using hot team, so set task state to 0.
1095 master_th->th.th_task_state = 0;
1096 }
1097 }
1098
// Request an affinity display for this team as soon as one member's recorded
// previous thread count or level differs from the team's current values.
1099 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1100 for (i = 0; i < team->t.t_nproc; i++) {
1101 kmp_info_t *thr = team->t.t_threads[i];
1102 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1103 thr->th.th_prev_level != team->t.t_level) {
1104 team->t.t_display_affinity = 1;
1105 break;
1106 }
1107 }
1108 }
1109
1110 KMP_MB();
1111}
1112
1113#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1114// Propagate any changes to the floating point control registers out to the team
1115// We try to avoid unnecessary writes to the relevant cache line in the team
1116// structure, so we don't make changes unless they are needed.
1117inline static void propagateFPControl(kmp_team_t *team) {
1118 if (__kmp_inherit_fp_control) {
1119 kmp_int16 x87_fpu_control_word;
1120 kmp_uint32 mxcsr;
1121
1122 // Get primary thread's values of FPU control flags (both X87 and vector)
1123 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1124 __kmp_store_mxcsr(&mxcsr);
1125 mxcsr &= KMP_X86_MXCSR_MASK;
1126
1127 // There is no point looking at t_fp_control_saved here.
1128 // If it is TRUE, we still have to update the values if they are different
1129 // from those we now have. If it is FALSE we didn't save anything yet, but
1130 // our objective is the same. We have to ensure that the values in the team
1131 // are the same as those we have.
1132 // So, this code achieves what we need whether or not t_fp_control_saved is
1133 // true. By checking whether the value needs updating we avoid unnecessary
1134 // writes that would put the cache-line into a written state, causing all
1135 // threads in the team to have to read it again.
1136 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1137 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1138 // Although we don't use this value, other code in the runtime wants to know
1139 // whether it should restore them. So we must ensure it is correct.
1140 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1141 } else {
1142 // Similarly here. Don't write to this cache-line in the team structure
1143 // unless we have to.
1144 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1145 }
1146}
1147
1148// Do the opposite, setting the hardware registers to the updated values from
1149// the team.
1150inline static void updateHWFPControl(kmp_team_t *team) {
1151 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1152 // Only reset the fp control regs if they have been changed in the team.
1153 // the parallel region that we are exiting.
1154 kmp_int16 x87_fpu_control_word;
1155 kmp_uint32 mxcsr;
1156 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1157 __kmp_store_mxcsr(&mxcsr);
1158 mxcsr &= KMP_X86_MXCSR_MASK;
1159
1160 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1161 __kmp_clear_x87_fpu_status_word();
1162 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1163 }
1164
1165 if (team->t.t_mxcsr != mxcsr) {
1166 __kmp_load_mxcsr(&team->t.t_mxcsr);
1167 }
1168 }
1169}
1170#else
1171#define propagateFPControl(x) ((void)0)
1172#define updateHWFPControl(x) ((void)0)
1173#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1174
// Declared ahead of its definition because __kmp_fork_in_teams() below calls
// it (with realloc == TRUE) to size the team's argv storage for argc entries.
1175static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1176 int realloc); // forward declaration
1177
1178/* Run a parallel region that has been serialized, so runs only in a team of the
1179 single primary thread. */
// Parameters:
//   loc        - source location descriptor; may be NULL. When its flags
//                contain KMP_IDENT_AUTOPAR the function returns immediately.
//   global_tid - global thread id of the caller; indexes __kmp_threads.
// Two paths are taken below:
//   * the thread is not currently executing on its serial team: the cached
//     serial team (or a newly allocated one when the cached team is already
//     marked serialized) is initialized and installed as the current team;
//   * the thread is already on its serial team: the nesting counters are
//     incremented and a new dispatch buffer and task team node are pushed.
// Afterwards the cancel request is reset, affinity is optionally displayed,
// and (with OMPT_SUPPORT) the lightweight task team and implicit-task-begin
// callback are set up.
1180void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1181 kmp_info_t *this_thr;
1182 kmp_team_t *serial_team;
1183
1184 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1185
1186 /* Skip all this code for autopar serialized loops since it results in
1187 unacceptable overhead */
1188 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1189 return;
1190
1191 if (!TCR_4(__kmp_init_parallel))
1192 __kmp_parallel_initialize();
1193 __kmp_resume_if_soft_paused();
1194
1195 this_thr = __kmp_threads[global_tid];
1196 serial_team = this_thr->th.th_serial_team;
1197
1198 /* utilize the serialized team held by this thread */
1199 KMP_DEBUG_ASSERT(serial_team);
1200 KMP_MB();
1201
// Resolve the effective proc_bind: an ICV of proc_bind_false wins over any
// clause value; otherwise a missing clause falls back to the ICV.
1202 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1203 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1204 proc_bind = proc_bind_false;
1205 } else if (proc_bind == proc_bind_default) {
1206 // No proc_bind clause was specified, so use the current value
1207 // of proc-bind-var for this parallel region.
1208 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1209 }
1210 // Reset for next parallel region
1211 this_thr->th.th_set_proc_bind = proc_bind_default;
1212
1213 // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have
1214 // effect when parallel execution is disabled by a corresponding if clause
1215 // attached to the parallel directive.
1216 if (this_thr->th.th_nt_strict && this_thr->th.th_set_nproc > 1)
1217 __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
1218 this_thr->th.th_nt_msg);
1219 // Reset num_threads for next parallel region
1220 this_thr->th.th_set_nproc = 0;
1221
1222#if OMPT_SUPPORT
1223 ompt_data_t ompt_parallel_data = ompt_data_none;
1224 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1225 if (ompt_enabled.enabled &&
1226 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1227
1228 ompt_task_info_t *parent_task_info;
1229 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1230
1231 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1232 if (ompt_enabled.ompt_callback_parallel_begin) {
1233 int team_size = 1;
1234
1235 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1236 &(parent_task_info->task_data), &(parent_task_info->frame),
1237 &ompt_parallel_data, team_size,
1238 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1239 }
1240 }
1241#endif // OMPT_SUPPORT
1242
// Case 1: the thread is not yet executing on its serial team.
1243 if (this_thr->th.th_team != serial_team) {
1244 // Nested level will be an index in the nested nthreads array
// Captured here, before th_team is switched to serial_team further down.
1245 int level = this_thr->th.th_team->t.t_level;
1246
1247 if (serial_team->t.t_serialized) {
1248 /* this serial team was already used
1249 TODO increase performance by making these locks more specific */
1250 kmp_team_t *new_team;
1251
1252 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1253
1254 new_team = __kmp_allocate_team(
1255 this_thr->th.th_root, 1, 1,
1256#if OMPT_SUPPORT
1257 ompt_parallel_data,
1258#endif
1259 proc_bind, &this_thr->th.th_current_task->td_icvs, 0, NULL);
1260 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1261 KMP_ASSERT(new_team);
1262
1263 /* setup new serialized team and install it */
1264 new_team->t.t_threads[0] = this_thr;
1265 new_team->t.t_parent = this_thr->th.th_team;
1266 serial_team = new_team;
1267 this_thr->th.th_serial_team = serial_team;
1268
1269 KF_TRACE(
1270 10,
1271 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1272 global_tid, serial_team));
1273
1274 /* TODO the above breaks the requirement that if we run out of resources,
1275 then we can still guarantee that serialized teams are ok, since we may
1276 need to allocate a new one */
1277 } else {
1278 KF_TRACE(
1279 10,
1280 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1281 global_tid, serial_team));
1282 }
1283
1284 /* we have to initialize this serial team */
1285 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1286 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1287 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1288 serial_team->t.t_ident = loc;
1289 serial_team->t.t_serialized = 1;
1290 serial_team->t.t_nproc = 1;
1291 serial_team->t.t_parent = this_thr->th.th_team;
// Use the parent team's nested-nthreads list when it has one, otherwise the
// global __kmp_nested_nth.
1292 if (this_thr->th.th_team->t.t_nested_nth)
1293 serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1294 else
1295 serial_team->t.t_nested_nth = &__kmp_nested_nth;
1296 // Save previous team's task state on serial team structure
1297 serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1298 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
// From here on th_team refers to the serial team, not the parent.
1299 this_thr->th.th_team = serial_team;
1300 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1301
1302 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1303 this_thr->th.th_current_task));
// The current task is marked as not executing before
// __kmp_push_current_task_to_thread() is called for the serial team.
1304 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1305 this_thr->th.th_current_task->td_flags.executing = 0;
1306
1307 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1308
1309 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1310 implicit task for each serialized task represented by
1311 team->t.t_serialized? */
1312 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1313 &this_thr->th.th_current_task->td_parent->td_icvs);
1314
1315 // Thread value exists in the nested nthreads array for the next nested
1316 // level
1317 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1318 if (this_thr->th.th_team->t.t_nested_nth)
1319 nested_nth = this_thr->th.th_team->t.t_nested_nth;
1320 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1321 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1322 }
1323
// Likewise pick up the proc_bind entry for the next nesting level, if any.
1324 if (__kmp_nested_proc_bind.used &&
1325 (level + 1 < __kmp_nested_proc_bind.used)) {
1326 this_thr->th.th_current_task->td_icvs.proc_bind =
1327 __kmp_nested_proc_bind.bind_types[level + 1];
1328 }
1329
1330#if USE_DEBUGGER
1331 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1332#endif
1333 this_thr->th.th_info.ds.ds_tid = 0;
1334
1335 /* set thread cache values */
1336 this_thr->th.th_team_nproc = 1;
1337 this_thr->th.th_team_master = this_thr;
1338 this_thr->th.th_team_serialized = 1;
1339 this_thr->th.th_task_team = NULL;
1340 this_thr->th.th_task_state = 0;
1341
// A serialized region adds a nesting level but not an active level.
1342 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1343 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1344 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1345
1346 propagateFPControl(serial_team);
1347
1348 /* check if we need to allocate dispatch buffers stack */
1349 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1351 serial_team->t.t_dispatch->th_disp_buffer =
1352 (dispatch_private_info_t *)__kmp_allocate(
1353 sizeof(dispatch_private_info_t));
1354 }
1355 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1356
1357 KMP_MB();
1358
1359 } else {
1360 /* this serialized team is already being used,
1361 * that's fine, just add another nested level */
1362 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1363 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1364 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1365 ++serial_team->t.t_serialized;
1366 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1367
1368 // Nested level will be an index in the nested nthreads array
// Read before t_level is incremented below.
1369 int level = this_thr->th.th_team->t.t_level;
1370 // Thread value exists in the nested nthreads array for the next nested
1371 // level
1372
1373 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1374 if (serial_team->t.t_nested_nth)
1375 nested_nth = serial_team->t.t_nested_nth;
1376 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1377 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1378 }
1379
1380 serial_team->t.t_level++;
1381 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1382 "of serial team %p to %d\n",
1383 global_tid, serial_team, serial_team->t.t_level));
1384
1385 /* allocate/push dispatch buffers stack */
1386 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1387 {
// The new buffer becomes the head of the list; the previous head is kept
// reachable through ->next.
1388 dispatch_private_info_t *disp_buffer =
1389 (dispatch_private_info_t *)__kmp_allocate(
1390 sizeof(dispatch_private_info_t));
1391 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1392 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1393 }
1394 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1395
1396 /* allocate/push task team stack */
1397 __kmp_push_task_team_node(this_thr, serial_team);
1398
1399 KMP_MB();
1400 }
// Both paths: clear any cancellation request left on the serial team.
1401 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1402
1403 // Perform the display affinity functionality for
1404 // serialized parallel regions
1405 if (__kmp_display_affinity) {
1406 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1407 this_thr->th.th_prev_num_threads != 1) {
1408 // NULL means use the affinity-format-var ICV
1409 __kmp_aux_display_affinity(global_tid, NULL);
1410 this_thr->th.th_prev_level = serial_team->t.t_level;
1411 this_thr->th.th_prev_num_threads = 1;
1412 }
1413 }
1414
1415 if (__kmp_env_consistency_check)
1416 __kmp_push_parallel(global_tid, NULL);
1417#if OMPT_SUPPORT
1418 serial_team->t.ompt_team_info.master_return_address = codeptr;
1419 if (ompt_enabled.enabled &&
1420 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
// NOTE(review): exit_frame.ptr is stored here and again at the end of this
// block. __ompt_lw_taskteam_link() runs in between and, per the comment
// below, swaps content, so OMPT_CUR_TASK_INFO() may refer to different task
// info at the two stores - confirm against ompt-specific.
1421 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1422 OMPT_GET_FRAME_ADDRESS(0);
1423
1424 ompt_lw_taskteam_t lw_taskteam;
1425 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1426 &ompt_parallel_data, codeptr);
1427
1428 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1429 // don't use lw_taskteam after linking. content was swapped
1430
1431 /* OMPT implicit task begin */
1432 if (ompt_enabled.ompt_callback_implicit_task) {
1433 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1434 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1435 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1436 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1437 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1438 __kmp_tid_from_gtid(global_tid);
1439 }
1440
1441 /* OMPT state */
1442 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1443 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1444 OMPT_GET_FRAME_ADDRESS(0);
1445 }
1446#endif
1447}
1448
1449// Test if this fork is for a team closely nested in a teams construct
1450static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1451 microtask_t microtask, int level,
1452 int teams_level, kmp_va_list ap) {
1453 return (master_th->th.th_teams_microtask && ap &&
1454 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1455}
1456
1457// Test if this fork is for the teams construct, i.e. to form the outer league
1458// of teams
1459static inline bool __kmp_is_entering_teams(int active_level, int level,
1460 int teams_level, kmp_va_list ap) {
1461 return ((ap == NULL && active_level == 0) ||
1462 (ap && teams_level > 0 && teams_level == level));
1463}
1464
1465// AC: This is start of parallel that is nested inside teams construct.
1466// The team is actual (hot), all workers are ready at the fork barrier.
1467// No lock needed to initialize the team a bit, then free workers.
// Parameters (as used in the visible body):
//   loc, gtid             - source location and caller's global thread id.
//   parent_team           - team the parallel runs on; it is updated in place
//                           instead of a new team being allocated.
//   argc, ap              - number of microtask arguments and the va_list
//                           they are read from into parent_team->t.t_argv.
//   master_th, root       - caller's thread descriptor and its root.
//   call_context          - for fork_context_gnu the function returns without
//                           invoking the microtask itself.
//   microtask, invoker    - outlined function and launcher stored on the team.
//   master_set_numthreads - num_threads request; 0 means no clause.
//   level                 - nesting level, used to index
//                           __kmp_nested_proc_bind.bind_types.
//   ompt_parallel_data, return_address - OMPT builds only.
// Returns TRUE on every path.
1468static inline int
1469__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1470 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1471 enum fork_context_e call_context, microtask_t microtask,
1472 launch_t invoker, int master_set_numthreads, int level,
1473#if OMPT_SUPPORT
1474 ompt_data_t ompt_parallel_data, void *return_address,
1475#endif
1476 kmp_va_list ap) {
1477 void **argv;
1478 int i;
1479
1480 parent_team->t.t_ident = loc;
1481 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1482 parent_team->t.t_argc = argc;
1483 argv = (void **)parent_team->t.t_argv;
// The index counts down, but argv advances forward, so the argc arguments are
// stored in the order they are read from ap.
1484 for (i = argc - 1; i >= 0; --i) {
1485 *argv++ = va_arg(kmp_va_deref(ap), void *);
1486 }
1487 // Increment our nested depth levels, but not increase the serialization
1488 if (parent_team == master_th->th.th_serial_team) {
1489 // AC: we are in serialized parallel
1490 __kmpc_serialized_parallel(loc, gtid);
1491 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1492
1493 if (call_context == fork_context_gnu) {
1494 // AC: need to decrement t_serialized for enquiry functions to work
1495 // correctly, will restore at join time
1496 parent_team->t.t_serialized--;
1497 return TRUE;
1498 }
1499
1500#if OMPD_SUPPORT
1501 parent_team->t.t_pkfn = microtask;
1502#endif
1503
1504#if OMPT_SUPPORT
1505 void *dummy;
1506 void **exit_frame_p;
1507 ompt_data_t *implicit_task_data;
1508 ompt_lw_taskteam_t lw_taskteam;
1509
1510 if (ompt_enabled.enabled) {
1511 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1512 &ompt_parallel_data, return_address);
1513 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1514
1515 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1516 // Don't use lw_taskteam after linking. Content was swapped.
1517
1518 /* OMPT implicit task begin */
1519 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1520 if (ompt_enabled.ompt_callback_implicit_task) {
1521 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1522 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1523 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1524 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1525 }
1526
1527 /* OMPT state */
1528 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1529 } else {
// OMPT disabled: give the microtask invocation a harmless local slot to use
// as its exit-frame pointer.
1530 exit_frame_p = &dummy;
1531 }
1532#endif
1533
1534 // AC: need to decrement t_serialized for enquiry functions to work
1535 // correctly, will restore at join time
1536 parent_team->t.t_serialized--;
1537
// Serialized case: the microtask is run directly on the calling thread.
1538 {
1539 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1540 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1541 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1542#if OMPT_SUPPORT
1543 ,
1544 exit_frame_p
1545#endif
1546 );
1547 }
1548
1549#if OMPT_SUPPORT
1550 if (ompt_enabled.enabled) {
1551 *exit_frame_p = NULL;
1552 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1553 if (ompt_enabled.ompt_callback_implicit_task) {
1554 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1555 ompt_scope_end, NULL, implicit_task_data, 1,
1556 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1557 }
// Copy the team data out before unlinking; it is passed to the parallel_end
// callback afterwards.
1558 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1559 __ompt_lw_taskteam_unlink(master_th);
1560 if (ompt_enabled.ompt_callback_parallel_end) {
1561 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1562 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1563 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1564 }
1565 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1566 }
1567#endif
1568 return TRUE;
1569 }
1570
// Non-serialized case: reuse parent_team for the nested parallel region.
1571 parent_team->t.t_pkfn = microtask;
1572 parent_team->t.t_invoke = invoker;
1573 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1574 parent_team->t.t_active_level++;
1575 parent_team->t.t_level++;
1576 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1577
1578 // If the threads allocated to the team are less than the thread limit, update
1579 // the thread limit here. th_teams_size.nth is specific to this team nested
1580 // in a teams construct, the team is fully created, and we're about to do
1581 // the actual fork. Best to do this here so that the subsequent uses below
1582 // and in the join have the correct value.
1583 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1584
1585#if OMPT_SUPPORT
1586 if (ompt_enabled.enabled) {
1587 ompt_lw_taskteam_t lw_taskteam;
1588 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1589 return_address);
1590 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1591 }
1592#endif
1593
1594 /* Change number of threads in the team if requested */
1595 if (master_set_numthreads) { // The parallel has num_threads clause
1596 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1597 // AC: only can reduce number of threads dynamically, can't increase
1598 kmp_info_t **other_threads = parent_team->t.t_threads;
1599 // NOTE: if using distributed barrier, we need to run this code block
1600 // even when the team size appears not to have changed from the max.
1601 int old_proc = master_th->th.th_teams_size.nth;
1602 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1603 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1604 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1605 }
1606 parent_team->t.t_nproc = master_set_numthreads;
// Only the threads that remain in the (possibly smaller) team get their
// cached team size updated.
1607 for (i = 0; i < master_set_numthreads; ++i) {
1608 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1609 }
1610 }
1611 // Keep extra threads hot in the team for possible next parallels
1612 master_th->th.th_set_nproc = 0;
1613 }
1614
1615#if USE_DEBUGGER
// NOTE(review): master_set_numthreads is a by-value parameter and is not read
// again after this point in this function, so this override appears to have
// no effect here - confirm whether it should precede the resize above.
1616 if (__kmp_debugging) { // Let debugger override number of threads.
1617 int nth = __kmp_omp_num_threads(loc);
1618 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1619 master_set_numthreads = nth;
1620 }
1621 }
1622#endif
1623
1624 // Figure out the proc_bind policy for the nested parallel within teams
1625 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1626 // proc_bind_default means don't update
1627 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1628 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1629 proc_bind = proc_bind_false;
1630 } else {
1631 // No proc_bind clause specified; use current proc-bind-var
1632 if (proc_bind == proc_bind_default) {
1633 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1634 }
1635 /* else: The proc_bind policy was specified explicitly on parallel clause.
1636 This overrides proc-bind-var for this parallel region, but does not
1637 change proc-bind-var. */
1638 // Figure the value of proc-bind-var for the child threads.
1639 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1640 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1641 master_th->th.th_current_task->td_icvs.proc_bind)) {
1642 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1643 }
1644 }
1645 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1646 // Need to change the bind-var ICV to correct value for each implicit task
1647 if (proc_bind_icv != proc_bind_default &&
1648 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1649 kmp_info_t **other_threads = parent_team->t.t_threads;
1650 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1651 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1652 }
1653 }
1654 // Reset for next parallel region
1655 master_th->th.th_set_proc_bind = proc_bind_default;
1656
1657#if USE_ITT_BUILD && USE_ITT_NOTIFY
1658 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1659 KMP_ITT_DEBUG) &&
1660 __kmp_forkjoin_frames_mode == 3 &&
1661 parent_team->t.t_active_level == 1 // only report frames at level 1
1662 && master_th->th.th_teams_size.nteams == 1) {
1663 kmp_uint64 tmp_time = __itt_get_timestamp();
1664 master_th->th.th_frame_time = tmp_time;
1665 parent_team->t.t_region_time = tmp_time;
1666 }
1667 if (__itt_stack_caller_create_ptr) {
1668 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1669 // create new stack stitching id before entering fork barrier
1670 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1671 }
1672#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1673#if KMP_AFFINITY_SUPPORTED
1674 __kmp_partition_places(parent_team);
1675#endif
1676
1677 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1678 "master_th=%p, gtid=%d\n",
1679 root, parent_team, master_th, gtid));
1680 __kmp_internal_fork(loc, gtid, parent_team);
1681 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1682 "master_th=%p, gtid=%d\n",
1683 root, parent_team, master_th, gtid));
1684
// For the GNU call context, return without invoking the microtask here
// (presumably the GOMP-side caller invokes it itself - confirm).
1685 if (call_context == fork_context_gnu)
1686 return TRUE;
1687
1688 /* Invoke microtask for PRIMARY thread */
1689 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1690 parent_team->t.t_id, parent_team->t.t_pkfn));
1691
1692 if (!parent_team->t.t_invoke(gtid)) {
1693 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1694 }
1695 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1696 parent_team->t.t_id, parent_team->t.t_pkfn));
1697 KMP_MB(); /* Flush all pending memory write invalidates. */
1698
1699 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1700
1701 return TRUE;
1702}
1703
1704// Create a serialized parallel region
// Calls __kmpc_serialized_parallel(loc, gtid) and then dispatches on
// call_context:
//  - fork_context_intel: the region body is run from this function, in one of
//    three ways:
//      * ap is null: the microtask is invoked with parent_team->t.t_argv;
//      * microtask is __kmp_teams_master: the varargs are copied into the
//        serial team's t_argv and invoker(gtid) is called;
//      * otherwise: the varargs are copied into the local `args` array and the
//        microtask is invoked with it.
//  - fork_context_gnu: the microtask is not invoked here; when OMPT is enabled
//    a lightweight task team is initialized and linked, then the function
//    returns.
//  - any other value: asserts that call_context < fork_context_last.
// With OMPT_SUPPORT, ompt_parallel_data, return_address and parent_task_data
// are dereferenced to feed the implicit-task and parallel-end callbacks.
// Every path returns FALSE.
1705static inline int
1706__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1707 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1708 kmp_info_t *master_th, kmp_team_t *parent_team,
1709#if OMPT_SUPPORT
1710 ompt_data_t *ompt_parallel_data, void **return_address,
1711 ompt_data_t **parent_task_data,
1712#endif
1713 kmp_va_list ap) {
1714 kmp_team_t *team;
1715 int i;
1716 void **argv;
1717
     // `args` is the scratch array that receives the varargs in the generic
     // Intel path further down (argv = args); it holds argc pointers.
1718/* josh todo: hypothetical question: what do we do for OS X*? */
1719#if KMP_OS_LINUX && \
1720 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1721 SimpleVLA<void *> args(argc);
1722#else
1723 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1724#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1725 KMP_ARCH_AARCH64) */
1726
1727 KA_TRACE(
1728 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1729
1730 __kmpc_serialized_parallel(loc, gtid);
1731
1732#if OMPD_SUPPORT
1733 master_th->th.th_serial_team->t.t_pkfn = microtask;
1734#endif
1735
     // Intel entry point: the region body is executed from inside this function.
1736 if (call_context == fork_context_intel) {
1737 /* TODO this sucks, use the compiler itself to pass args! :) */
1738 master_th->th.th_serial_team->t.t_ident = loc;
     // Case 1: no varargs were supplied.
1739 if (!ap) {
1740 // revert change made in __kmpc_serialized_parallel()
1741 master_th->th.th_serial_team->t.t_level--;
1742// Get args from parent team for teams construct
1743
1744#if OMPT_SUPPORT
1745 void *dummy;
1746 void **exit_frame_p;
1747 ompt_task_info_t *task_info;
1748 ompt_lw_taskteam_t lw_taskteam;
1749
1750 if (ompt_enabled.enabled) {
1751 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1752 ompt_parallel_data, *return_address);
1753
1754 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1755 // don't use lw_taskteam after linking. content was swapped
1756 task_info = OMPT_CUR_TASK_INFO(master_th);
1757 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1758 if (ompt_enabled.ompt_callback_implicit_task) {
1759 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1760 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1761 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1762 &(task_info->task_data), 1,
1763 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1764 }
1765
1766 /* OMPT state */
1767 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1768 } else {
     // exit_frame_p is still passed to __kmp_invoke_microtask below, so point
     // it at a local placeholder when OMPT is not enabled.
1769 exit_frame_p = &dummy;
1770 }
1771#endif
1772
1773 {
1774 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1775 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1776 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1777#if OMPT_SUPPORT
1778 ,
1779 exit_frame_p
1780#endif
1781 );
1782 }
1783
1784#if OMPT_SUPPORT
     // Region body is done: clear the exit frame, report implicit-task end,
     // copy the team data out to the caller, unlink the lightweight task team
     // and report parallel end.
1785 if (ompt_enabled.enabled) {
1786 *exit_frame_p = NULL;
1787 if (ompt_enabled.ompt_callback_implicit_task) {
1788 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1789 ompt_scope_end, NULL, &(task_info->task_data), 1,
1790 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1791 }
1792 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1793 __ompt_lw_taskteam_unlink(master_th);
1794 if (ompt_enabled.ompt_callback_parallel_end) {
1795 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1796 ompt_parallel_data, *parent_task_data,
1797 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1798 }
1799 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1800 }
1801#endif
     // Case 2: the microtask is __kmp_teams_master; the varargs are stored in
     // the serial team's own t_argv and the region is run through invoker().
1802 } else if (microtask == (microtask_t)__kmp_teams_master) {
1803 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1804 team = master_th->th.th_team;
1805 // team->t.t_pkfn = microtask;
1806 team->t.t_invoke = invoker;
1807 __kmp_alloc_argv_entries(argc, team, TRUE);
1808 team->t.t_argc = argc;
1809 argv = (void **)team->t.t_argv;
1810 for (i = argc - 1; i >= 0; --i)
1811 *argv++ = va_arg(kmp_va_deref(ap), void *);
1812 // AC: revert change made in __kmpc_serialized_parallel()
1813 // because initial code in teams should have level=0
1814 team->t.t_level--;
1815 // AC: call special invoker for outer "parallel" of teams construct
1816 invoker(gtid);
1817#if OMPT_SUPPORT
1818 if (ompt_enabled.enabled) {
1819 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1820 if (ompt_enabled.ompt_callback_implicit_task) {
1821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822 ompt_scope_end, NULL, &(task_info->task_data), 0,
1823 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1824 }
1825 if (ompt_enabled.ompt_callback_parallel_end) {
1826 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1827 ompt_parallel_data, *parent_task_data,
1828 OMPT_INVOKER(call_context) | ompt_parallel_league,
1829 *return_address);
1830 }
1831 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1832 }
1833#endif
     // Case 3: any other microtask with varargs; they are copied into the
     // local `args` array, which is then handed to the microtask.
1834 } else {
1835 argv = args;
1836 for (i = argc - 1; i >= 0; --i)
1837 *argv++ = va_arg(kmp_va_deref(ap), void *);
1838 KMP_MB();
1839
1840#if OMPT_SUPPORT
1841 void *dummy;
1842 void **exit_frame_p;
1843 ompt_task_info_t *task_info;
1844 ompt_lw_taskteam_t lw_taskteam;
1845 ompt_data_t *implicit_task_data;
1846
1847 if (ompt_enabled.enabled) {
1848 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1849 ompt_parallel_data, *return_address);
1850 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1851 // don't use lw_taskteam after linking. content was swapped
1852 task_info = OMPT_CUR_TASK_INFO(master_th);
1853 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1854
1855 /* OMPT implicit task begin */
1856 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1857 if (ompt_enabled.ompt_callback_implicit_task) {
1858 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1859 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1860 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1861 ompt_task_implicit);
1862 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1863 }
1864
1865 /* OMPT state */
1866 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1867 } else {
     // Same placeholder as in case 1: exit_frame_p is passed to
     // __kmp_invoke_microtask regardless of whether OMPT is enabled.
1868 exit_frame_p = &dummy;
1869 }
1870#endif
1871
1872 {
1873 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1874 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1875 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1876#if OMPT_SUPPORT
1877 ,
1878 exit_frame_p
1879#endif
1880 );
1881 }
1882
1883#if OMPT_SUPPORT
1884 if (ompt_enabled.enabled) {
1885 *exit_frame_p = NULL;
1886 if (ompt_enabled.ompt_callback_implicit_task) {
1887 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1888 ompt_scope_end, NULL, &(task_info->task_data), 1,
1889 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1890 }
1891
1892 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1893 __ompt_lw_taskteam_unlink(master_th);
1894 if (ompt_enabled.ompt_callback_parallel_end) {
1895 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1896 ompt_parallel_data, *parent_task_data,
1897 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1898 }
1899 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1900 }
1901#endif
1902 }
     // GNU entry point: the microtask is not invoked in this function; only
     // the OMPT lightweight task team is set up (when OMPT is enabled).
1903 } else if (call_context == fork_context_gnu) {
1904#if OMPT_SUPPORT
1905 if (ompt_enabled.enabled) {
1906 ompt_lw_taskteam_t lwt;
1907 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1908 *return_address);
1909
1910 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1911 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1912 }
1913// don't use lw_taskteam after linking. content was swapped
1914#endif
1915
1916 // we were called from GNU native code
1917 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1918 return FALSE;
1919 } else {
1920 KMP_ASSERT2(call_context < fork_context_last,
1921 "__kmp_serial_fork_call: unknown fork_context parameter");
1922 }
1923
1924 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1925 KMP_MB();
1926 return FALSE;
1927}
1928
1929/* most of the work for a fork */
1930/* return true if we really went parallel, false if serialized */
// Parameters:
//   loc          - stored in master_th->th.th_ident and in the new team's
//                  t_ident
//   gtid         - index into __kmp_threads that selects master_th
//   call_context - which entry point called us (Intel, GNU, ...)
//   argc         - number of pointer arguments for the microtask
//   microtask    - region body; stored in the new team's t_pkfn
//   invoker      - stored in the new team's t_invoke and used at the end of
//                  this function to run the primary thread's part
//   ap           - varargs supplying argc pointers; when ap is null the
//                  arguments are copied from the parent team's t_argv and
//                  __kmp_internal_fork() is skipped
// Outline of the visible control flow:
//   1. make sure the parallel runtime is initialized and snapshot the state of
//      master_th (parent team, tid, root, requested thread count, ...);
//   2. if __kmp_is_fork_in_teams() holds, return __kmp_fork_in_teams();
//   3. determine nthreads; if it is 1, return __kmp_serial_fork_call();
//   4. otherwise allocate and set up a team, copy the arguments, fork the team
//      threads, release __kmp_forkjoin_lock, and (except for fork_context_gnu,
//      which returns TRUE right after the fork) run team->t.t_invoke(gtid)
//      before returning TRUE.
1931int __kmp_fork_call(ident_t *loc, int gtid,
1932 enum fork_context_e call_context, // Intel, GNU, ...
1933 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1934 kmp_va_list ap) {
1935 void **argv;
1936 int i;
1937 int master_tid;
1938 int master_this_cons;
1939 kmp_team_t *team;
1940 kmp_team_t *parent_team;
1941 kmp_info_t *master_th;
1942 kmp_root_t *root;
1943 int nthreads;
1944 int master_active;
1945 int master_set_numthreads;
1946 int task_thread_limit = 0;
1947 int level;
1948 int active_level;
1949 int teams_level;
1950 kmp_hot_team_ptr_t **p_hot_teams;
1951 { // KMP_TIME_BLOCK
1952 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1953 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1954
1955 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1956 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1957 /* Some systems prefer the stack for the root thread(s) to start with */
1958 /* some gap from the parent stack to prevent false sharing. */
1959 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1960 /* These 2 lines below are so this does not get optimized out */
1961 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1962 __kmp_stkpadding += (short)((kmp_int64)dummy);
1963 }
1964
1965 /* initialize if needed */
1966 KMP_DEBUG_ASSERT(
1967 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1968 if (!TCR_4(__kmp_init_parallel))
1969 __kmp_parallel_initialize();
1970 __kmp_resume_if_soft_paused();
1971
1972 /* setup current data */
1973 // AC: potentially unsafe, not in sync with library shutdown,
1974 // __kmp_threads can be freed
1975 master_th = __kmp_threads[gtid];
1976
1977 parent_team = master_th->th.th_team;
1978 master_tid = master_th->th.th_info.ds.ds_tid;
1979 master_this_cons = master_th->th.th_local.this_construct;
1980 root = master_th->th.th_root;
1981 master_active = root->r.r_active;
1982 master_set_numthreads = master_th->th.th_set_nproc;
1983 task_thread_limit =
1984 master_th->th.th_current_task->td_icvs.task_thread_limit;
1985
1986#if OMPT_SUPPORT
1987 ompt_data_t ompt_parallel_data = ompt_data_none;
1988 ompt_data_t *parent_task_data = NULL;
1989 ompt_frame_t *ompt_frame = NULL;
1990 void *return_address = NULL;
1991
1992 if (ompt_enabled.enabled) {
1993 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1994 NULL, NULL);
1995 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1996 }
1997#endif
1998
1999 // Assign affinity to root thread if it hasn't happened yet
2000 __kmp_assign_root_init_mask();
2001
2002 // Nested level will be an index in the nested nthreads array
2003 level = parent_team->t.t_level;
2004 // used to launch non-serial teams even if nested is not allowed
2005 active_level = parent_team->t.t_active_level;
2006 // needed to check nesting inside the teams
2007 teams_level = master_th->th.th_teams_level;
     // Allocate this thread's hot-teams array on first use and seed slot 0
     // with the root's hot team.
2008 p_hot_teams = &master_th->th.th_hot_teams;
2009 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2010 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2011 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2012 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2013 // it is either actual or not needed (when active_level > 0)
2014 (*p_hot_teams)[0].hot_team_nth = 1;
2015 }
2016
2017#if OMPT_SUPPORT
     // Report parallel-begin; the flags mark the region as a league when the
     // microtask is __kmp_teams_master and as a team otherwise.
2018 if (ompt_enabled.enabled) {
2019 if (ompt_enabled.ompt_callback_parallel_begin) {
2020 int team_size = master_set_numthreads
2021 ? master_set_numthreads
2022 : get__nproc_2(parent_team, master_tid);
2023 int flags = OMPT_INVOKER(call_context) |
2024 ((microtask == (microtask_t)__kmp_teams_master)
2025 ? ompt_parallel_league
2026 : ompt_parallel_team);
2027 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029 return_address);
2030 }
2031 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032 }
2033#endif
2034
2035 master_th->th.th_ident = loc;
2036
2037 // Parallel closely nested in teams construct:
2038 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040 call_context, microtask, invoker,
2041 master_set_numthreads, level,
2042#if OMPT_SUPPORT
2043 ompt_parallel_data, return_address,
2044#endif
2045 ap);
2046 } // End parallel closely nested in teams construct
2047
2048 // Need this to happen before we determine the number of threads, not while
2049 // we are allocating the team
2050 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051
2052 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053
2054 // Determine the number of threads
2055 int enter_teams =
2056 __kmp_is_entering_teams(active_level, level, teams_level, ap);
     // Serialize when the max-active-levels ICV is already reached (unless a
     // teams construct is being entered) or when the library mode is serial.
2057 if ((!enter_teams &&
2058 (parent_team->t.t_active_level >=
2059 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060 (__kmp_library == library_serial)) {
2061 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062 nthreads = 1;
2063 } else {
2064 nthreads = master_set_numthreads
2065 ? master_set_numthreads
2066 // TODO: get nproc directly from current task
2067 : get__nproc_2(parent_team, master_tid);
2068 // Use the thread_limit set for the current target task if exists, else go
2069 // with the deduced nthreads
2070 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071 ? task_thread_limit
2072 : nthreads;
2073 // Check if we need to take forkjoin lock? (no need for serialized
2074 // parallel out of teams construct).
2075 if (nthreads > 1) {
2076 /* determine how many new threads we can use */
2077 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078 /* AC: If we execute teams from parallel region (on host), then teams
2079 should be created but each can only have 1 thread if nesting is
2080 disabled. If teams called from serial region, then teams and their
2081 threads should be created regardless of the nesting setting. */
2082 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083 nthreads, enter_teams);
2084 if (nthreads == 1) {
2085 // Free lock for single thread execution here; for multi-thread
2086 // execution it will be freed later after team of threads created
2087 // and initialized
2088 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089 }
2090 }
2091 }
2092 KMP_DEBUG_ASSERT(nthreads > 0);
2093
2094 // If we temporarily changed the set number of threads then restore it now
2095 master_th->th.th_set_nproc = 0;
2096
     // Serialized case: hand off to __kmp_serial_fork_call() and return its
     // result; the forkjoin lock is not held on this path.
2097 if (nthreads == 1) {
2098 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099 invoker, master_th, parent_team,
2100#if OMPT_SUPPORT
2101 &ompt_parallel_data, &return_address,
2102 &parent_task_data,
2103#endif
2104 ap);
2105 } // if (nthreads == 1)
2106
2107 // GEH: only modify the executing flag in the case when not serialized
2108 // serialized case is handled in kmpc_serialized_parallel
2109 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110 "curtask=%p, curtask_max_aclevel=%d\n",
2111 parent_team->t.t_active_level, master_th,
2112 master_th->th.th_current_task,
2113 master_th->th.th_current_task->td_icvs.max_active_levels));
2114 // TODO: GEH - cannot do this assertion because root thread not set up as
2115 // executing
2116 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117 master_th->th.th_current_task->td_flags.executing = 0;
2118
2119 if (!master_th->th.th_teams_microtask || level > teams_level) {
2120 /* Increment our nested depth level */
2121 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122 }
2123
2124 // See if we need to make a copy of the ICVs.
     // nthreads_icv ends up > 0 only when the nested-nth list supplies a value
     // for level + 1 that differs from the current nproc ICV.
2125 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126 kmp_nested_nthreads_t *nested_nth = NULL;
2127 if (!master_th->th.th_set_nested_nth &&
2128 (level + 1 < parent_team->t.t_nested_nth->used) &&
2129 (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130 nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131 } else if (master_th->th.th_set_nested_nth) {
2132 nested_nth = __kmp_override_nested_nth(master_th, level);
2133 if ((level + 1 < nested_nth->used) &&
2134 (nested_nth->nth[level + 1] != nthreads_icv))
2135 nthreads_icv = nested_nth->nth[level + 1];
2136 else
2137 nthreads_icv = 0; // don't update
2138 } else {
2139 nthreads_icv = 0; // don't update
2140 }
2141
2142 // Figure out the proc_bind_policy for the new team.
2143 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144 // proc_bind_default means don't update
2145 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147 proc_bind = proc_bind_false;
2148 } else {
2149 // No proc_bind clause specified; use current proc-bind-var for this
2150 // parallel region
2151 if (proc_bind == proc_bind_default) {
2152 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153 }
2154 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155 if (master_th->th.th_teams_microtask &&
2156 microtask == (microtask_t)__kmp_teams_master) {
2157 proc_bind = __kmp_teams_proc_bind;
2158 }
2159 /* else: The proc_bind policy was specified explicitly on parallel clause.
2160 This overrides proc-bind-var for this parallel region, but does not
2161 change proc-bind-var. */
2162 // Figure the value of proc-bind-var for the child threads.
2163 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2165 master_th->th.th_current_task->td_icvs.proc_bind)) {
2166 // Do not modify the proc bind icv for the two teams construct forks
2167 // They just let the proc bind icv pass through
2168 if (!master_th->th.th_teams_microtask ||
2169 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171 }
2172 }
2173
2174 // Reset for next parallel region
2175 master_th->th.th_set_proc_bind = proc_bind_default;
2176
     // Build a modified ICV copy only when nproc and/or proc_bind must change
     // for the new team; otherwise the team is allocated straight from the
     // current task's ICVs.
2177 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178 kmp_internal_control_t new_icvs;
2179 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180 new_icvs.next = NULL;
2181 if (nthreads_icv > 0) {
2182 new_icvs.nproc = nthreads_icv;
2183 }
2184 if (proc_bind_icv != proc_bind_default) {
2185 new_icvs.proc_bind = proc_bind_icv;
2186 }
2187
2188 /* allocate a new parallel team */
2189 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190 team = __kmp_allocate_team(root, nthreads, nthreads,
2191#if OMPT_SUPPORT
2192 ompt_parallel_data,
2193#endif
2194 proc_bind, &new_icvs, argc, master_th);
2195 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2196 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2197 } else {
2198 /* allocate a new parallel team */
2199 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2200 team = __kmp_allocate_team(
2201 root, nthreads, nthreads,
2202#if OMPT_SUPPORT
2203 ompt_parallel_data,
2204#endif
2205 proc_bind, &master_th->th.th_current_task->td_icvs, argc, master_th);
2206 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2207 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2208 &master_th->th.th_current_task->td_icvs);
2209 }
2210 KF_TRACE(
2211 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2212
2213 /* setup the new team */
2214 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2215 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2216 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2217 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2218 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2219#if OMPT_SUPPORT
2220 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2221 return_address);
2222#endif
2223 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2224 // TODO: parent_team->t.t_level == INT_MAX ???
2225 if (!master_th->th.th_teams_microtask || level > teams_level) {
2226 int new_level = parent_team->t.t_level + 1;
2227 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2228 new_level = parent_team->t.t_active_level + 1;
2229 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2230 } else {
2231 // AC: Do not increase parallel level at start of the teams construct
2232 int new_level = parent_team->t.t_level;
2233 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2234 new_level = parent_team->t.t_active_level;
2235 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2236 }
2237 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2238 // set primary thread's schedule as new run-time schedule
2239 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2240
2241 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2242 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2243
2244 // Check if hot team has potentially outdated list, and if so, free it
2245 if (team->t.t_nested_nth &&
2246 team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2247 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2248 KMP_INTERNAL_FREE(team->t.t_nested_nth);
2249 team->t.t_nested_nth = NULL;
2250 }
2251 team->t.t_nested_nth = parent_team->t.t_nested_nth;
     // A per-thread nested-nth override replaces the list taken from the
     // parent; the thread's override state is consumed (freed and reset) here.
2252 if (master_th->th.th_set_nested_nth) {
2253 if (!nested_nth)
2254 nested_nth = __kmp_override_nested_nth(master_th, level);
2255 team->t.t_nested_nth = nested_nth;
2256 KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2257 master_th->th.th_set_nested_nth = NULL;
2258 master_th->th.th_set_nested_nth_sz = 0;
2259 master_th->th.th_nt_strict = false;
2260 }
2261
2262 // Update the floating point rounding in the team if required.
2263 propagateFPControl(team);
2264#if OMPD_SUPPORT
2265 if (ompd_state & OMPD_ENABLE_BP)
2266 ompd_bp_parallel_begin();
2267#endif
2268
2269 KA_TRACE(
2270 20,
2271 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2272 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2273 team->t.t_nproc));
2274 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2275 (team->t.t_master_tid == 0 &&
2276 (team->t.t_parent == root->r.r_root_team ||
2277 team->t.t_parent->t.t_serialized)));
2278 KMP_MB();
2279
2280 /* now, setup the arguments */
2281 argv = (void **)team->t.t_argv;
2282 if (ap) {
2283 for (i = argc - 1; i >= 0; --i) {
2284 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2285 KMP_CHECK_UPDATE(*argv, new_argv);
2286 argv++;
2287 }
2288 } else {
2289 for (i = 0; i < argc; ++i) {
2290 // Get args from parent team for teams construct
2291 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2292 }
2293 }
2294
2295 /* now actually fork the threads */
2296 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2297 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2298 root->r.r_active = TRUE;
2299
2300 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2301 __kmp_setup_icv_copy(team, nthreads,
2302 &master_th->th.th_current_task->td_icvs, loc);
2303
2304#if OMPT_SUPPORT
2305 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2306#endif
2307
     // Pairs with the acquire taken while reserving threads: this point is
     // only reached when nthreads != 1, i.e. the lock was not released there.
2308 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2309
2310#if USE_ITT_BUILD
2311 if (team->t.t_active_level == 1 // only report frames at level 1
2312 && !master_th->th.th_teams_microtask) { // not in teams construct
2313#if USE_ITT_NOTIFY
2314 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2315 (__kmp_forkjoin_frames_mode == 3 ||
2316 __kmp_forkjoin_frames_mode == 1)) {
2317 kmp_uint64 tmp_time = 0;
2318 if (__itt_get_timestamp_ptr)
2319 tmp_time = __itt_get_timestamp();
2320 // Internal fork - report frame begin
2321 master_th->th.th_frame_time = tmp_time;
2322 if (__kmp_forkjoin_frames_mode == 3)
2323 team->t.t_region_time = tmp_time;
2324 } else
2325// only one notification scheme (either "submit" or "forking/joined", not both)
2326#endif /* USE_ITT_NOTIFY */
2327 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2328 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2329 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2330 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2331 }
2332 }
2333#endif /* USE_ITT_BUILD */
2334
2335 /* now go on and do the work */
2336 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2337 KMP_MB();
2338 KF_TRACE(10,
2339 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2340 root, team, master_th, gtid));
2341
2342#if USE_ITT_BUILD
2343 if (__itt_stack_caller_create_ptr) {
2344 // create new stack stitching id before entering fork barrier
2345 if (!enter_teams) {
2346 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2347 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2348 } else if (parent_team->t.t_serialized) {
2349 // keep stack stitching id in the serialized parent_team;
2350 // current team will be used for parallel inside the teams;
2351 // if parent_team is active, then it already keeps stack stitching id
2352 // for the league of teams
2353 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2354 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2355 }
2356 }
2357#endif /* USE_ITT_BUILD */
2358
2359 // AC: skip __kmp_internal_fork at teams construct, let only primary
2360 // threads execute
2361 if (ap) {
2362 __kmp_internal_fork(loc, gtid, team);
2363 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2364 "master_th=%p, gtid=%d\n",
2365 root, team, master_th, gtid));
2366 }
2367
     // For fork_context_gnu the primary thread's microtask is not invoked
     // here; return as soon as the team has been forked.
2368 if (call_context == fork_context_gnu) {
2369 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2370 return TRUE;
2371 }
2372
2373 /* Invoke microtask for PRIMARY thread */
2374 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2375 team->t.t_id, team->t.t_pkfn));
2376 } // END of timer KMP_fork_call block
2377
2378#if KMP_STATS_ENABLED
2379 // If beginning a teams construct, then change thread state
2380 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2381 if (!ap) {
2382 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2383 }
2384#endif
2385
     // Run the primary thread's part through the team's invoker; a zero
     // return value trips the assertion below.
2386 if (!team->t.t_invoke(gtid)) {
2387 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2388 }
2389
2390#if KMP_STATS_ENABLED
2391 // If was beginning of a teams construct, then reset thread state
2392 if (!ap) {
2393 KMP_SET_THREAD_STATE(previous_state);
2394 }
2395#endif
2396
2397 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2398 team->t.t_id, team->t.t_pkfn));
2399 KMP_MB(); /* Flush all pending memory write invalidates. */
2400
2401 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2402#if OMPT_SUPPORT
2403 if (ompt_enabled.enabled) {
2404 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2405 }
2406#endif
2407
2408 return TRUE;
2409}
2410
2411#if OMPT_SUPPORT
// Set the OMPT state of `thread` according to `team`:
// ompt_state_work_serial when team->t.t_serialized is non-zero,
// ompt_state_work_parallel otherwise.
2412static inline void __kmp_join_restore_state(kmp_info_t *thread,
2413 kmp_team_t *team) {
2414 // restore state outside the region
2415 thread->th.ompt_thread_info.state =
2416 ((team->t.t_serialized) ? ompt_state_work_serial
2417 : ompt_state_work_parallel);
2418}
2419
// OMPT bookkeeping for a join: fetch the task info object via
// __ompt_get_task_info_object(0), invoke the parallel_end callback (when one
// is registered) with parallel_data, that task's task_data, flags and codeptr,
// reset the task's enter_frame to ompt_data_none, and then set the thread's
// OMPT state through __kmp_join_restore_state(thread, team).
// NOTE(review): gtid is not referenced in this body.
2420static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2421 kmp_team_t *team, ompt_data_t *parallel_data,
2422 int flags, void *codeptr) {
2423 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2424 if (ompt_enabled.ompt_callback_parallel_end) {
2425 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2426 parallel_data, &(task_info->task_data), flags, codeptr);
2427 }
2428
     // Clear the enter frame before the thread state is restored.
2429 task_info->frame.enter_frame = ompt_data_none;
2430 __kmp_join_restore_state(thread, team);
2431}
2432#endif
2433
2434void __kmp_join_call(ident_t *loc, int gtid
2435#if OMPT_SUPPORT
2436 ,
2437 enum fork_context_e fork_context
2438#endif
2439 ,
2440 int exit_teams) {
2441 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2442 kmp_team_t *team;
2443 kmp_team_t *parent_team;
2444 kmp_info_t *master_th;
2445 kmp_root_t *root;
2446 int master_active;
2447
2448 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2449
2450 /* setup current data */
2451 master_th = __kmp_threads[gtid];
2452 root = master_th->th.th_root;
2453 team = master_th->th.th_team;
2454 parent_team = team->t.t_parent;
2455
2456 master_th->th.th_ident = loc;
2457
2458#if OMPT_SUPPORT
2459 void *team_microtask = (void *)team->t.t_pkfn;
2460 // For GOMP interface with serialized parallel, need the
2461 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2462 // and end-parallel events.
2463 if (ompt_enabled.enabled &&
2464 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2465 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2466 }
2467#endif
2468
2469#if KMP_DEBUG
2470 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2471 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2472 "th_task_team = %p\n",
2473 __kmp_gtid_from_thread(master_th), team,
2474 team->t.t_task_team[master_th->th.th_task_state],
2475 master_th->th.th_task_team));
2476 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2477 }
2478#endif
2479
2480 if (team->t.t_serialized) {
2481 if (master_th->th.th_teams_microtask) {
2482 // We are in teams construct
2483 int level = team->t.t_level;
2484 int tlevel = master_th->th.th_teams_level;
2485 if (level == tlevel) {
2486 // AC: we haven't incremented it earlier at start of teams construct,
2487 // so do it here - at the end of teams construct
2488 team->t.t_level++;
2489 } else if (level == tlevel + 1) {
2490 // AC: we are exiting parallel inside teams, need to increment
2491 // serialization in order to restore it in the next call to
2492 // __kmpc_end_serialized_parallel
2493 team->t.t_serialized++;
2494 }
2495 }
2497
2498#if OMPT_SUPPORT
2499 if (ompt_enabled.enabled) {
2500 if (fork_context == fork_context_gnu) {
2501 __ompt_lw_taskteam_unlink(master_th);
2502 }
2503 __kmp_join_restore_state(master_th, parent_team);
2504 }
2505#endif
2506
2507 return;
2508 }
2509
2510 master_active = team->t.t_master_active;
2511
2512 if (!exit_teams) {
2513 // AC: No barrier for internal teams at exit from teams construct.
2514 // But there is barrier for external team (league).
2515 __kmp_internal_join(loc, gtid, team);
2516#if USE_ITT_BUILD
2517 if (__itt_stack_caller_create_ptr) {
2518 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2519 // destroy the stack stitching id after join barrier
2520 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2521 team->t.t_stack_id = NULL;
2522 }
2523#endif
2524 } else {
2525 master_th->th.th_task_state =
2526 0; // AC: no tasking in teams (out of any parallel)
2527#if USE_ITT_BUILD
2528 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2529 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2530 // destroy the stack stitching id on exit from the teams construct
2531 // if parent_team is active, then the id will be destroyed later on
2532 // by master of the league of teams
2533 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2534 parent_team->t.t_stack_id = NULL;
2535 }
2536#endif
2537 }
2538
2539 KMP_MB();
2540
2541#if OMPT_SUPPORT
2542 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2543 void *codeptr = team->t.ompt_team_info.master_return_address;
2544#endif
2545
2546#if USE_ITT_BUILD
2547 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2548 if (team->t.t_active_level == 1 &&
2549 (!master_th->th.th_teams_microtask || /* not in teams construct */
2550 master_th->th.th_teams_size.nteams == 1)) {
2551 master_th->th.th_ident = loc;
2552 // only one notification scheme (either "submit" or "forking/joined", not
2553 // both)
2554 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2555 __kmp_forkjoin_frames_mode == 3)
2556 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2557 master_th->th.th_frame_time, 0, loc,
2558 master_th->th.th_team_nproc, 1);
2559 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2560 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2561 __kmp_itt_region_joined(gtid);
2562 } // active_level == 1
2563#endif /* USE_ITT_BUILD */
2564
2565#if KMP_AFFINITY_SUPPORTED
2566 if (!exit_teams) {
2567 // Restore master thread's partition.
2568 master_th->th.th_first_place = team->t.t_first_place;
2569 master_th->th.th_last_place = team->t.t_last_place;
2570 }
2571#endif // KMP_AFFINITY_SUPPORTED
2572
2573 if (master_th->th.th_teams_microtask && !exit_teams &&
2574 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2575 team->t.t_level == master_th->th.th_teams_level + 1) {
2576// AC: We need to leave the team structure intact at the end of parallel
2577// inside the teams construct, so that at the next parallel same (hot) team
2578// works, only adjust nesting levels
2579#if OMPT_SUPPORT
2580 ompt_data_t ompt_parallel_data = ompt_data_none;
2581 if (ompt_enabled.enabled) {
2582 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2583 if (ompt_enabled.ompt_callback_implicit_task) {
2584 int ompt_team_size = team->t.t_nproc;
2585 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2586 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2587 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2588 }
2589 task_info->frame.exit_frame = ompt_data_none;
2590 task_info->task_data = ompt_data_none;
2591 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2592 __ompt_lw_taskteam_unlink(master_th);
2593 }
2594#endif
2595 /* Decrement our nested depth level */
2596 team->t.t_level--;
2597 team->t.t_active_level--;
2598 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2599
2600 // Restore number of threads in the team if needed. This code relies on
2601 // the proper adjustment of th_teams_size.nth after the fork in
2602 // __kmp_teams_master on each teams primary thread in the case that
2603 // __kmp_reserve_threads reduced it.
2604 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2605 int old_num = master_th->th.th_team_nproc;
2606 int new_num = master_th->th.th_teams_size.nth;
2607 kmp_info_t **other_threads = team->t.t_threads;
2608 team->t.t_nproc = new_num;
2609 for (int i = 0; i < old_num; ++i) {
2610 other_threads[i]->th.th_team_nproc = new_num;
2611 }
2612 // Adjust states of non-used threads of the team
2613 for (int i = old_num; i < new_num; ++i) {
2614 // Re-initialize thread's barrier data.
2615 KMP_DEBUG_ASSERT(other_threads[i]);
2616 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2617 for (int b = 0; b < bs_last_barrier; ++b) {
2618 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2619 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2620#if USE_DEBUGGER
2621 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2622#endif
2623 }
2624 if (__kmp_tasking_mode != tskm_immediate_exec) {
2625 // Synchronize thread's task state
2626 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2627 }
2628 }
2629 }
2630
2631#if OMPT_SUPPORT
2632 if (ompt_enabled.enabled) {
2633 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2634 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2635 }
2636#endif
2637
2638 return;
2639 }
2640
2641 /* do cleanup and restore the parent team */
2642 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2643 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2644
2645 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2646
2647 /* jc: The following lock has instructions with REL and ACQ semantics,
2648 separating the parallel user code called in this parallel region
2649 from the serial user code called after this function returns. */
2650 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2651
2652 if (!master_th->th.th_teams_microtask ||
2653 team->t.t_level > master_th->th.th_teams_level) {
2654 /* Decrement our nested depth level */
2655 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2656 }
2657 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2658
2659#if OMPT_SUPPORT
2660 if (ompt_enabled.enabled) {
2661 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2662 if (ompt_enabled.ompt_callback_implicit_task) {
2663 int flags = (team_microtask == (void *)__kmp_teams_master)
2664 ? ompt_task_initial
2665 : ompt_task_implicit;
2666 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2667 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2668 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2669 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2670 }
2671 task_info->frame.exit_frame = ompt_data_none;
2672 task_info->task_data = ompt_data_none;
2673 }
2674#endif
2675
2676 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2677 master_th, team));
2678 __kmp_pop_current_task_from_thread(master_th);
2679
2680 master_th->th.th_def_allocator = team->t.t_def_allocator;
2681
2682#if OMPD_SUPPORT
2683 if (ompd_state & OMPD_ENABLE_BP)
2684 ompd_bp_parallel_end();
2685#endif
2686 updateHWFPControl(team);
2687
2688 if (root->r.r_active != master_active)
2689 root->r.r_active = master_active;
2690
2691 __kmp_free_team(root, team, master_th); // this will free worker threads
2692
2693 /* this race was fun to find. make sure the following is in the critical
2694 region otherwise assertions may fail occasionally since the old team may be
2695 reallocated and the hierarchy appears inconsistent. it is actually safe to
2696 run and won't cause any bugs, but will cause those assertion failures. it's
2697 only one deref&assign so might as well put this in the critical region */
2698 master_th->th.th_team = parent_team;
2699 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2700 master_th->th.th_team_master = parent_team->t.t_threads[0];
2701 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2702
2703 /* restore serialized team, if need be */
2704 if (parent_team->t.t_serialized &&
2705 parent_team != master_th->th.th_serial_team &&
2706 parent_team != root->r.r_root_team) {
2707 __kmp_free_team(root, master_th->th.th_serial_team, NULL);
2708 master_th->th.th_serial_team = parent_team;
2709 }
2710
2711 if (__kmp_tasking_mode != tskm_immediate_exec) {
2712 // Restore primary thread's task state from team structure
2713 KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2714 team->t.t_primary_task_state == 1);
2715 master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2716
2717 // Copy the task team from the parent team to the primary thread
2718 master_th->th.th_task_team =
2719 parent_team->t.t_task_team[master_th->th.th_task_state];
2720 KA_TRACE(20,
2721 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2722 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2723 parent_team));
2724 }
2725
2726 // TODO: GEH - cannot do this assertion because root thread not set up as
2727 // executing
2728 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2729 master_th->th.th_current_task->td_flags.executing = 1;
2730
2731 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732
2733#if KMP_AFFINITY_SUPPORTED
2734 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2735 __kmp_reset_root_init_mask(gtid);
2736 }
2737#endif
2738#if OMPT_SUPPORT
2739 int flags =
2740 OMPT_INVOKER(fork_context) |
2741 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2742 : ompt_parallel_team);
2743 if (ompt_enabled.enabled) {
2744 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2745 codeptr);
2746 }
2747#endif
2748
2749 KMP_MB();
2750 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751}
2752
2753/* Check whether we should push an internal control record onto the
2754 serial team stack. If so, do it. */
2755void __kmp_save_internal_controls(kmp_info_t *thread) {
2756
2757 if (thread->th.th_team != thread->th.th_serial_team) {
2758 return;
2759 }
2760 if (thread->th.th_team->t.t_serialized > 1) {
2761 int push = 0;
2762
2763 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2764 push = 1;
2765 } else {
2766 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2767 thread->th.th_team->t.t_serialized) {
2768 push = 1;
2769 }
2770 }
2771 if (push) { /* push a record on the serial team's stack */
2772 kmp_internal_control_t *control =
2773 (kmp_internal_control_t *)__kmp_allocate(
2774 sizeof(kmp_internal_control_t));
2775
2776 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2777
2778 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2779
2780 control->next = thread->th.th_team->t.t_control_stack_top;
2781 thread->th.th_team->t.t_control_stack_top = control;
2782 }
2783 }
2784}
2785
2786/* Changes set_nproc */
2787void __kmp_set_num_threads(int new_nth, int gtid) {
2788 kmp_info_t *thread;
2789 kmp_root_t *root;
2790
2791 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2792 KMP_DEBUG_ASSERT(__kmp_init_serial);
2793
2794 if (new_nth < 1)
2795 new_nth = 1;
2796 else if (new_nth > __kmp_max_nth)
2797 new_nth = __kmp_max_nth;
2798
2799 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2800 thread = __kmp_threads[gtid];
2801 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2802 return; // nothing to do
2803
2804 __kmp_save_internal_controls(thread);
2805
2806 set__nproc(thread, new_nth);
2807
2808 // If this omp_set_num_threads() call will cause the hot team size to be
2809 // reduced (in the absence of a num_threads clause), then reduce it now,
2810 // rather than waiting for the next parallel region.
2811 root = thread->th.th_root;
2812 if (__kmp_init_parallel && (!root->r.r_active) &&
2813 (root->r.r_hot_team->t.t_nproc > new_nth) && __kmp_hot_teams_max_level &&
2814 !__kmp_hot_teams_mode) {
2815 kmp_team_t *hot_team = root->r.r_hot_team;
2816 int f;
2817
2818 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2819
2820 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2821 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2822 }
2823 // Release the extra threads we don't need any more.
2824 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2825 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2826 if (__kmp_tasking_mode != tskm_immediate_exec) {
2827 // When decreasing team size, threads no longer in the team should unref
2828 // task team.
2829 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2830 }
2831 __kmp_free_thread(hot_team->t.t_threads[f]);
2832 hot_team->t.t_threads[f] = NULL;
2833 }
2834 hot_team->t.t_nproc = new_nth;
2835 if (thread->th.th_hot_teams) {
2836 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2837 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2838 }
2839
2840 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2841 hot_team->t.b->update_num_threads(new_nth);
2842 __kmp_add_threads_to_team(hot_team, new_nth);
2843 }
2844
2845 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2846
2847 // Update the t_nproc field in the threads that are still active.
2848 for (f = 0; f < new_nth; f++) {
2849 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2850 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2851 }
2852 // Special flag in case omp_set_num_threads() call
2853 hot_team->t.t_size_changed = -1;
2854 }
2855}
2856
2857/* Changes max_active_levels */
2858void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2859 kmp_info_t *thread;
2860
2861 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2862 "%d = (%d)\n",
2863 gtid, max_active_levels));
2864 KMP_DEBUG_ASSERT(__kmp_init_serial);
2865
2866 // validate max_active_levels
2867 if (max_active_levels < 0) {
2868 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2869 // We ignore this call if the user has specified a negative value.
2870 // The current setting won't be changed. The last valid setting will be
2871 // used. A warning will be issued (if warnings are allowed as controlled by
2872 // the KMP_WARNINGS env var).
2873 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2874 "max_active_levels for thread %d = (%d)\n",
2875 gtid, max_active_levels));
2876 return;
2877 }
2878 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2879 // it's OK, the max_active_levels is within the valid range: [ 0;
2880 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2881 // We allow a zero value. (implementation defined behavior)
2882 } else {
2883 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2884 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2885 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2886 // Current upper limit is MAX_INT. (implementation defined behavior)
2887 // If the input exceeds the upper limit, we correct the input to be the
2888 // upper limit. (implementation defined behavior)
2889 // Actually, the flow should never get here until we use MAX_INT limit.
2890 }
2891 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2892 "max_active_levels for thread %d = (%d)\n",
2893 gtid, max_active_levels));
2894
2895 thread = __kmp_threads[gtid];
2896
2897 __kmp_save_internal_controls(thread);
2898
2899 set__max_active_levels(thread, max_active_levels);
2900}
2901
2902/* Gets max_active_levels */
2903int __kmp_get_max_active_levels(int gtid) {
2904 kmp_info_t *thread;
2905
2906 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2907 KMP_DEBUG_ASSERT(__kmp_init_serial);
2908
2909 thread = __kmp_threads[gtid];
2910 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2911 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2912 "curtask_maxaclevel=%d\n",
2913 gtid, thread->th.th_current_task,
2914 thread->th.th_current_task->td_icvs.max_active_levels));
2915 return thread->th.th_current_task->td_icvs.max_active_levels;
2916}
2917
2918// nteams-var per-device ICV
2919void __kmp_set_num_teams(int num_teams) {
2920 if (num_teams > 0)
2921 __kmp_nteams = num_teams;
2922}
2923int __kmp_get_max_teams(void) { return __kmp_nteams; }
2924// teams-thread-limit-var per-device ICV
2925void __kmp_set_teams_thread_limit(int limit) {
2926 if (limit > 0)
2927 __kmp_teams_thread_limit = limit;
2928}
2929int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2930
2931KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2932KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2933
2934/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2935void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2936 kmp_info_t *thread;
2937 kmp_sched_t orig_kind;
2938 // kmp_team_t *team;
2939
2940 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2941 gtid, (int)kind, chunk));
2942 KMP_DEBUG_ASSERT(__kmp_init_serial);
2943
2944 // Check if the kind parameter is valid, correct if needed.
2945 // Valid parameters should fit in one of two intervals - standard or extended:
2946 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2947 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2948 orig_kind = kind;
2949 kind = __kmp_sched_without_mods(kind);
2950
2951 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2952 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2953 // TODO: Hint needs attention in case we change the default schedule.
2954 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2955 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2956 __kmp_msg_null);
2957 kind = kmp_sched_default;
2958 chunk = 0; // ignore chunk value in case of bad kind
2959 }
2960
2961 thread = __kmp_threads[gtid];
2962
2963 __kmp_save_internal_controls(thread);
2964
2965 if (kind < kmp_sched_upper_std) {
2966 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2967 // differ static chunked vs. unchunked: chunk should be invalid to
2968 // indicate unchunked schedule (which is the default)
2969 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2970 } else {
2971 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2972 __kmp_sch_map[kind - kmp_sched_lower - 1];
2973 }
2974 } else {
2975 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2976 // kmp_sched_lower - 2 ];
2977 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2978 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979 kmp_sched_lower - 2];
2980 }
2981 __kmp_sched_apply_mods_intkind(
2982 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2983 if (kind == kmp_sched_auto || chunk < 1) {
2984 // ignore parameter chunk for schedule auto
2985 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2986 } else {
2987 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2988 }
2989}
2990
2991/* Gets def_sched_var ICV values */
2992void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2993 kmp_info_t *thread;
2994 enum sched_type th_type;
2995
2996 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2997 KMP_DEBUG_ASSERT(__kmp_init_serial);
2998
2999 thread = __kmp_threads[gtid];
3000
3001 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3002 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3003 case kmp_sch_static:
3004 case kmp_sch_static_greedy:
3005 case kmp_sch_static_balanced:
3006 *kind = kmp_sched_static;
3007 __kmp_sched_apply_mods_stdkind(kind, th_type);
3008 *chunk = 0; // chunk was not set, try to show this fact via zero value
3009 return;
3010 case kmp_sch_static_chunked:
3011 *kind = kmp_sched_static;
3012 break;
3013 case kmp_sch_dynamic_chunked:
3014 *kind = kmp_sched_dynamic;
3015 break;
3017 case kmp_sch_guided_iterative_chunked:
3018 case kmp_sch_guided_analytical_chunked:
3019 *kind = kmp_sched_guided;
3020 break;
3021 case kmp_sch_auto:
3022 *kind = kmp_sched_auto;
3023 break;
3024 case kmp_sch_trapezoidal:
3025 *kind = kmp_sched_trapezoidal;
3026 break;
3027#if KMP_STATIC_STEAL_ENABLED
3028 case kmp_sch_static_steal:
3029 *kind = kmp_sched_static_steal;
3030 break;
3031#endif
3032 default:
3033 KMP_FATAL(UnknownSchedulingType, th_type);
3034 }
3035
3036 __kmp_sched_apply_mods_stdkind(kind, th_type);
3037 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3038}
3039
3040int __kmp_get_ancestor_thread_num(int gtid, int level) {
3041
3042 int ii, dd;
3043 kmp_team_t *team;
3044 kmp_info_t *thr;
3045
3046 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3047 KMP_DEBUG_ASSERT(__kmp_init_serial);
3048
3049 // validate level
3050 if (level == 0)
3051 return 0;
3052 if (level < 0)
3053 return -1;
3054 thr = __kmp_threads[gtid];
3055 team = thr->th.th_team;
3056 ii = team->t.t_level;
3057 if (level > ii)
3058 return -1;
3059
3060 if (thr->th.th_teams_microtask) {
3061 // AC: we are in teams region where multiple nested teams have same level
3062 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3063 if (level <=
3064 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3065 KMP_DEBUG_ASSERT(ii >= tlevel);
3066 // AC: As we need to pass by the teams league, we need to artificially
3067 // increase ii
3068 if (ii == tlevel) {
3069 ii += 2; // three teams have same level
3070 } else {
3071 ii++; // two teams have same level
3072 }
3073 }
3074 }
3075
3076 if (ii == level)
3077 return __kmp_tid_from_gtid(gtid);
3078
3079 dd = team->t.t_serialized;
3080 level++;
3081 while (ii > level) {
3082 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3083 }
3084 if ((team->t.t_serialized) && (!dd)) {
3085 team = team->t.t_parent;
3086 continue;
3087 }
3088 if (ii > level) {
3089 team = team->t.t_parent;
3090 dd = team->t.t_serialized;
3091 ii--;
3092 }
3093 }
3094
3095 return (dd > 1) ? (0) : (team->t.t_master_tid);
3096}
3097
3098int __kmp_get_team_size(int gtid, int level) {
3099
3100 int ii, dd;
3101 kmp_team_t *team;
3102 kmp_info_t *thr;
3103
3104 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3105 KMP_DEBUG_ASSERT(__kmp_init_serial);
3106
3107 // validate level
3108 if (level == 0)
3109 return 1;
3110 if (level < 0)
3111 return -1;
3112 thr = __kmp_threads[gtid];
3113 team = thr->th.th_team;
3114 ii = team->t.t_level;
3115 if (level > ii)
3116 return -1;
3117
3118 if (thr->th.th_teams_microtask) {
3119 // AC: we are in teams region where multiple nested teams have same level
3120 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3121 if (level <=
3122 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3123 KMP_DEBUG_ASSERT(ii >= tlevel);
3124 // AC: As we need to pass by the teams league, we need to artificially
3125 // increase ii
3126 if (ii == tlevel) {
3127 ii += 2; // three teams have same level
3128 } else {
3129 ii++; // two teams have same level
3130 }
3131 }
3132 }
3133
3134 while (ii > level) {
3135 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3136 }
3137 if (team->t.t_serialized && (!dd)) {
3138 team = team->t.t_parent;
3139 continue;
3140 }
3141 if (ii > level) {
3142 team = team->t.t_parent;
3143 ii--;
3144 }
3145 }
3146
3147 return team->t.t_nproc;
3148}
3149
3150kmp_r_sched_t __kmp_get_schedule_global() {
3151 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3152 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3153 // independently. So one can get the updated schedule here.
3154
3155 kmp_r_sched_t r_sched;
3156
3157 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3158 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3159 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3160 // different roots (even in OMP 2.5)
3161 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3162 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3163 if (s == kmp_sch_static) {
3164 // replace STATIC with more detailed schedule (balanced or greedy)
3165 r_sched.r_sched_type = __kmp_static;
3166 } else if (s == kmp_sch_guided_chunked) {
3167 // replace GUIDED with more detailed schedule (iterative or analytical)
3168 r_sched.r_sched_type = __kmp_guided;
3169 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3170 r_sched.r_sched_type = __kmp_sched;
3171 }
3172 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3173
3174 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3175 // __kmp_chunk may be wrong here (if it was not ever set)
3176 r_sched.chunk = KMP_DEFAULT_CHUNK;
3177 } else {
3178 r_sched.chunk = __kmp_chunk;
3179 }
3180
3181 return r_sched;
3182}
3183
3184/* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3185 at least argc number of *t_argv entries for the requested team. */
3186static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3187
3188 KMP_DEBUG_ASSERT(team);
3189 if (!realloc || argc > team->t.t_max_argc) {
3190
3191 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3192 "current entries=%d\n",
3193 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3194 /* if previously allocated heap space for args, free them */
3195 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3196 __kmp_free((void *)team->t.t_argv);
3197
3198 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3199 /* use unused space in the cache line for arguments */
3200 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3201 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3202 "argv entries\n",
3203 team->t.t_id, team->t.t_max_argc));
3204 team->t.t_argv = &team->t.t_inline_argv[0];
3205 if (__kmp_storage_map) {
3206 __kmp_print_storage_map_gtid(
3207 -1, &team->t.t_inline_argv[0],
3208 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3209 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3210 team->t.t_id);
3211 }
3212 } else {
3213 /* allocate space for arguments in the heap */
3214 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3215 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3216 : 2 * argc;
3217 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3218 "argv entries\n",
3219 team->t.t_id, team->t.t_max_argc));
3220 team->t.t_argv =
3221 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3222 if (__kmp_storage_map) {
3223 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3224 &team->t.t_argv[team->t.t_max_argc],
3225 sizeof(void *) * team->t.t_max_argc,
3226 "team_%d.t_argv", team->t.t_id);
3227 }
3228 }
3229 }
3230}
3231
3232static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3233 int i;
3234 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3235 team->t.t_threads =
3236 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3237 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3238 sizeof(dispatch_shared_info_t) * num_disp_buff);
3239 team->t.t_dispatch =
3240 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3241 team->t.t_implicit_task_taskdata =
3242 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3243 team->t.t_max_nproc = max_nth;
3244
3245 /* setup dispatch buffers */
3246 for (i = 0; i < num_disp_buff; ++i) {
3247 team->t.t_disp_buffer[i].buffer_index = i;
3248 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3249 }
3250}
3251
3252static void __kmp_free_team_arrays(kmp_team_t *team) {
3253 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3254 int i;
3255 for (i = 0; i < team->t.t_max_nproc; ++i) {
3256 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3257 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3258 team->t.t_dispatch[i].th_disp_buffer = NULL;
3259 }
3260 }
3261#if KMP_USE_HIER_SCHED
3262 __kmp_dispatch_free_hierarchies(team);
3263#endif
3264 __kmp_free(team->t.t_threads);
3265 __kmp_free(team->t.t_disp_buffer);
3266 __kmp_free(team->t.t_dispatch);
3267 __kmp_free(team->t.t_implicit_task_taskdata);
3268 team->t.t_threads = NULL;
3269 team->t.t_disp_buffer = NULL;
3270 team->t.t_dispatch = NULL;
3271 team->t.t_implicit_task_taskdata = 0;
3272}
3273
3274static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3275 kmp_info_t **oldThreads = team->t.t_threads;
3276
3277 __kmp_free(team->t.t_disp_buffer);
3278 __kmp_free(team->t.t_dispatch);
3279 __kmp_free(team->t.t_implicit_task_taskdata);
3280 __kmp_allocate_team_arrays(team, max_nth);
3281
3282 KMP_MEMCPY(team->t.t_threads, oldThreads,
3283 team->t.t_nproc * sizeof(kmp_info_t *));
3284
3285 __kmp_free(oldThreads);
3286}
3287
3288static kmp_internal_control_t __kmp_get_global_icvs(void) {
3289
3290 kmp_r_sched_t r_sched =
3291 __kmp_get_schedule_global(); // get current state of scheduling globals
3292
3293 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3294
3295 kmp_internal_control_t g_icvs = {
3296 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3297 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3298 // adjustment of threads (per thread)
3299 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3300 // whether blocktime is explicitly set
3301 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3302#if KMP_USE_MONITOR
3303 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3304// intervals
3305#endif
3306 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3307 // next parallel region (per thread)
3308 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3309 __kmp_cg_max_nth, // int thread_limit;
3310 __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3311 // on task. This is used in the case of target thread_limit
3312 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3313 // for max_active_levels
3314 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3315 // {sched,chunk} pair
3316 __kmp_nested_proc_bind.bind_types[0],
3317 __kmp_default_device,
3318 NULL // struct kmp_internal_control *next;
3319 };
3320
3321 return g_icvs;
3322}
3323
3324static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3325
3326 kmp_internal_control_t gx_icvs;
3327 gx_icvs.serial_nesting_level =
3328 0; // probably =team->t.t_serial like in save_inter_controls
3329 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3330 gx_icvs.next = NULL;
3331
3332 return gx_icvs;
3333}
3334
3335static void __kmp_initialize_root(kmp_root_t *root) {
3336 int f;
3337 kmp_team_t *root_team;
3338 kmp_team_t *hot_team;
3339 int hot_team_max_nth;
3340 kmp_r_sched_t r_sched =
3341 __kmp_get_schedule_global(); // get current state of scheduling globals
3342 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3343 KMP_DEBUG_ASSERT(root);
3344 KMP_ASSERT(!root->r.r_begin);
3345
3346 /* setup the root state structure */
3347 __kmp_init_lock(&root->r.r_begin_lock);
3348 root->r.r_begin = FALSE;
3349 root->r.r_active = FALSE;
3350 root->r.r_in_parallel = 0;
3351 root->r.r_blocktime = __kmp_dflt_blocktime;
3352#if KMP_AFFINITY_SUPPORTED
3353 root->r.r_affinity_assigned = FALSE;
3354#endif
3355
3356 /* setup the root team for this task */
3357 /* allocate the root team structure */
3358 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3359
3360 root_team = __kmp_allocate_team(root,
3361 1, // new_nproc
3362 1, // max_nproc
3363#if OMPT_SUPPORT
3364 ompt_data_none, // root parallel id
3365#endif
3366 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3367 0, // argc
3368 NULL // primary thread is unknown
3369 );
3370#if USE_DEBUGGER
3371 // Non-NULL value should be assigned to make the debugger display the root
3372 // team.
3373 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3374#endif
3375
3376 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3377
3378 root->r.r_root_team = root_team;
3379 root_team->t.t_control_stack_top = NULL;
3380
3381 /* initialize root team */
3382 root_team->t.t_threads[0] = NULL;
3383 root_team->t.t_nproc = 1;
3384 root_team->t.t_serialized = 1;
3385 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3386 root_team->t.t_sched.sched = r_sched.sched;
3387 root_team->t.t_nested_nth = &__kmp_nested_nth;
3388 KA_TRACE(
3389 20,
3390 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3391 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3392
3393 /* setup the hot team for this task */
3394 /* allocate the hot team structure */
3395 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3396
3397 hot_team = __kmp_allocate_team(root,
3398 1, // new_nproc
3399 __kmp_dflt_team_nth_ub * 2, // max_nproc
3400#if OMPT_SUPPORT
3401 ompt_data_none, // root parallel id
3402#endif
3403 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3404 0, // argc
3405 NULL // primary thread is unknown
3406 );
3407 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3408
3409 root->r.r_hot_team = hot_team;
3410 root_team->t.t_control_stack_top = NULL;
3411
3412 /* first-time initialization */
3413 hot_team->t.t_parent = root_team;
3414
3415 /* initialize hot team */
3416 hot_team_max_nth = hot_team->t.t_max_nproc;
3417 for (f = 0; f < hot_team_max_nth; ++f) {
3418 hot_team->t.t_threads[f] = NULL;
3419 }
3420 hot_team->t.t_nproc = 1;
3421 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3422 hot_team->t.t_sched.sched = r_sched.sched;
3423 hot_team->t.t_size_changed = 0;
3424 hot_team->t.t_nested_nth = &__kmp_nested_nth;
3425}
3426
3427#ifdef KMP_DEBUG
3428
3429typedef struct kmp_team_list_item {
3430 kmp_team_p const *entry;
3431 struct kmp_team_list_item *next;
3432} kmp_team_list_item_t;
3433typedef kmp_team_list_item_t *kmp_team_list_t;
3434
3435static void __kmp_print_structure_team_accum( // Add team to list of teams.
3436 kmp_team_list_t list, // List of teams.
3437 kmp_team_p const *team // Team to add.
3438) {
3439
3440 // List must terminate with item where both entry and next are NULL.
3441 // Team is added to the list only once.
3442 // List is sorted in ascending order by team id.
3443 // Team id is *not* a key.
3444
3445 kmp_team_list_t l;
3446
3447 KMP_DEBUG_ASSERT(list != NULL);
3448 if (team == NULL) {
3449 return;
3450 }
3451
3452 __kmp_print_structure_team_accum(list, team->t.t_parent);
3453 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3454
3455 // Search list for the team.
3456 l = list;
3457 while (l->next != NULL && l->entry != team) {
3458 l = l->next;
3459 }
3460 if (l->next != NULL) {
3461 return; // Team has been added before, exit.
3462 }
3463
3464 // Team is not found. Search list again for insertion point.
3465 l = list;
3466 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3467 l = l->next;
3468 }
3469
3470 // Insert team.
3471 {
3472 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3473 sizeof(kmp_team_list_item_t));
3474 *item = *l;
3475 l->entry = team;
3476 l->next = item;
3477 }
3478}
3479
3480static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3481
3482) {
3483 __kmp_printf("%s", title);
3484 if (team != NULL) {
3485 __kmp_printf("%2x %p\n", team->t.t_id, team);
3486 } else {
3487 __kmp_printf(" - (nil)\n");
3488 }
3489}
3490
3491static void __kmp_print_structure_thread(char const *title,
3492 kmp_info_p const *thread) {
3493 __kmp_printf("%s", title);
3494 if (thread != NULL) {
3495 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3496 } else {
3497 __kmp_printf(" - (nil)\n");
3498 }
3499}
3500
3501void __kmp_print_structure(void) {
3502
3503 kmp_team_list_t list;
3504
3505 // Initialize list of teams.
3506 list =
3507 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3508 list->entry = NULL;
3509 list->next = NULL;
3510
3511 __kmp_printf("\n------------------------------\nGlobal Thread "
3512 "Table\n------------------------------\n");
3513 {
3514 int gtid;
3515 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3516 __kmp_printf("%2d", gtid);
3517 if (__kmp_threads != NULL) {
3518 __kmp_printf(" %p", __kmp_threads[gtid]);
3519 }
3520 if (__kmp_root != NULL) {
3521 __kmp_printf(" %p", __kmp_root[gtid]);
3522 }
3523 __kmp_printf("\n");
3524 }
3525 }
3526
3527 // Print out __kmp_threads array.
3528 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3529 "----------\n");
3530 if (__kmp_threads != NULL) {
3531 int gtid;
3532 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3533 kmp_info_t const *thread = __kmp_threads[gtid];
3534 if (thread != NULL) {
3535 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3536 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3537 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3538 __kmp_print_structure_team(" Serial Team: ",
3539 thread->th.th_serial_team);
3540 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3541 __kmp_print_structure_thread(" Primary: ",
3542 thread->th.th_team_master);
3543 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3544 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3545 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3546 __kmp_print_structure_thread(" Next in pool: ",
3547 thread->th.th_next_pool);
3548 __kmp_printf("\n");
3549 __kmp_print_structure_team_accum(list, thread->th.th_team);
3550 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3551 }
3552 }
3553 } else {
3554 __kmp_printf("Threads array is not allocated.\n");
3555 }
3556
3557 // Print out __kmp_root array.
3558 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3559 "--------\n");
3560 if (__kmp_root != NULL) {
3561 int gtid;
3562 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3563 kmp_root_t const *root = __kmp_root[gtid];
3564 if (root != NULL) {
3565 __kmp_printf("GTID %2d %p:\n", gtid, root);
3566 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3567 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3568 __kmp_print_structure_thread(" Uber Thread: ",
3569 root->r.r_uber_thread);
3570 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3571 __kmp_printf(" In Parallel: %2d\n",
3572 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3573 __kmp_printf("\n");
3574 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3575 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3576 }
3577 }
3578 } else {
3579 __kmp_printf("Ubers array is not allocated.\n");
3580 }
3581
3582 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3583 "--------\n");
3584 while (list->next != NULL) {
3585 kmp_team_p const *team = list->entry;
3586 int i;
3587 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3588 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3589 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3590 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3591 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3592 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3593 for (i = 0; i < team->t.t_nproc; ++i) {
3594 __kmp_printf(" Thread %2d: ", i);
3595 __kmp_print_structure_thread("", team->t.t_threads[i]);
3596 }
3597 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3598 __kmp_printf("\n");
3599 list = list->next;
3600 }
3601
3602 // Print out __kmp_thread_pool and __kmp_team_pool.
3603 __kmp_printf("\n------------------------------\nPools\n----------------------"
3604 "--------\n");
3605 __kmp_print_structure_thread("Thread pool: ",
3606 CCAST(kmp_info_t *, __kmp_thread_pool));
3607 __kmp_print_structure_team("Team pool: ",
3608 CCAST(kmp_team_t *, __kmp_team_pool));
3609 __kmp_printf("\n");
3610
3611 // Free team list.
3612 while (list != NULL) {
3613 kmp_team_list_item_t *item = list;
3614 list = list->next;
3615 KMP_INTERNAL_FREE(item);
3616 }
3617}
3618
3619#endif
3620
//---------------------------------------------------------------------------
// Stuff for per-thread fast random number generator
//
// Multiplier table for the generator: __kmp_init_random() picks one entry per
// thread. The table has 64 entries, described by the original author as
// primes.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, //
    0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b, //
    0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, //
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, //
    0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801, //
    0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, //
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, //
    0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b, //
    0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, //
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, //
    0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7, //
    0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, //
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, //
    0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b, //
    0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, //
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3636
3637//---------------------------------------------------------------------------
3638// __kmp_get_random: Get a random number using a linear congruential method.
3639unsigned short __kmp_get_random(kmp_info_t *thread) {
3640 unsigned x = thread->th.th_x;
3641 unsigned short r = (unsigned short)(x >> 16);
3642
3643 thread->th.th_x = x * thread->th.th_a + 1;
3644
3645 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3646 thread->th.th_info.ds.ds_tid, r));
3647
3648 return r;
3649}
3650//--------------------------------------------------------
3651// __kmp_init_random: Initialize a random number generator
3652void __kmp_init_random(kmp_info_t *thread) {
3653 unsigned seed = thread->th.th_info.ds.ds_tid;
3654
3655 thread->th.th_a =
3656 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3657 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3658 KA_TRACE(30,
3659 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3660}
3661
#if KMP_OS_WINDOWS
/* Reclaim array entries that belong to root threads which are already dead.
   Only roots that died while inactive (r_active == 0) are reclaimed.
   Returns the number of __kmp_threads entries freed. */
static int __kmp_reclaim_dead_roots(void) {
  int reclaimed = 0;

  for (int gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
    if (!KMP_UBER_GTID(gtid)) {
      continue; // slot does not hold a root (uber) thread
    }
    if (__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]))) {
      continue; // the root thread is still alive
    }
    if (__kmp_root[gtid]->r.r_active) {
      continue; // AC: reclaim only roots died in non-active state
    }
    reclaimed += __kmp_unregister_root_other_thread(gtid);
  }
  return reclaimed;
}
#endif
3679
3680/* This function attempts to create free entries in __kmp_threads and
3681 __kmp_root, and returns the number of free entries generated.
3682
3683 For Windows* OS static library, the first mechanism used is to reclaim array
3684 entries for root threads that are already dead.
3685
3686 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3687 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3688 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3689 threadprivate cache array has been created. Synchronization with
3690 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3691
3692 After any dead root reclamation, if the clipping value allows array expansion
3693 to result in the generation of a total of nNeed free slots, the function does
3694 that expansion. If not, nothing is done beyond the possible initial root
3695 thread reclamation.
3696
3697 If any argument is negative, the behavior is undefined. */
3698static int __kmp_expand_threads(int nNeed) {
3699 int added = 0;
3700 int minimumRequiredCapacity;
3701 int newCapacity;
3702 kmp_info_t **newThreads;
3703 kmp_root_t **newRoot;
3704
3705 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3706 // resizing __kmp_threads does not need additional protection if foreign
3707 // threads are present
3708
3709#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3710 /* only for Windows static library */
3711 /* reclaim array entries for root threads that are already dead */
3712 added = __kmp_reclaim_dead_roots();
3713
3714 if (nNeed) {
3715 nNeed -= added;
3716 if (nNeed < 0)
3717 nNeed = 0;
3718 }
3719#endif
3720 if (nNeed <= 0)
3721 return added;
3722
3723 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3724 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3725 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3726 // > __kmp_max_nth in one of two ways:
3727 //
3728 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3729 // may not be reused by another thread, so we may need to increase
3730 // __kmp_threads_capacity to __kmp_max_nth + 1.
3731 //
3732 // 2) New foreign root(s) are encountered. We always register new foreign
3733 // roots. This may cause a smaller # of threads to be allocated at
3734 // subsequent parallel regions, but the worker threads hang around (and
3735 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3736 //
3737 // Anyway, that is the reason for moving the check to see if
3738 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3739 // instead of having it performed here. -BB
3740
3741 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3742
3743 /* compute expansion headroom to check if we can expand */
3744 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3745 /* possible expansion too small -- give up */
3746 return added;
3747 }
3748 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3749
3750 newCapacity = __kmp_threads_capacity;
3751 do {
3752 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3753 : __kmp_sys_max_nth;
3754 } while (newCapacity < minimumRequiredCapacity);
3755 newThreads = (kmp_info_t **)__kmp_allocate(
3756 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3757 newRoot =
3758 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3759 KMP_MEMCPY(newThreads, __kmp_threads,
3760 __kmp_threads_capacity * sizeof(kmp_info_t *));
3761 KMP_MEMCPY(newRoot, __kmp_root,
3762 __kmp_threads_capacity * sizeof(kmp_root_t *));
3763 // Put old __kmp_threads array on a list. Any ongoing references to the old
3764 // list will be valid. This list is cleaned up at library shutdown.
3765 kmp_old_threads_list_t *node =
3766 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3767 node->threads = __kmp_threads;
3768 node->next = __kmp_old_threads_list;
3769 __kmp_old_threads_list = node;
3770
3771 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3772 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3773 added += newCapacity - __kmp_threads_capacity;
3774 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3775
3776 if (newCapacity > __kmp_tp_capacity) {
3777 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3778 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3779 __kmp_threadprivate_resize_cache(newCapacity);
3780 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3781 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3782 }
3783 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3784 }
3785
3786 return added;
3787}
3788
3789/* Register the current thread as a root thread and obtain our gtid. We must
3790 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3791 thread that calls from __kmp_do_serial_initialize() */
3792int __kmp_register_root(int initial_thread) {
3793 kmp_info_t *root_thread;
3794 kmp_root_t *root;
3795 int gtid;
3796 int capacity;
3797 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3798 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3799 KMP_MB();
3800
3801 /* 2007-03-02:
3802 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3803 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3804 work as expected -- it may return false (that means there is at least one
3805 empty slot in __kmp_threads array), but it is possible the only free slot
3806 is #0, which is reserved for initial thread and so cannot be used for this
3807 one. Following code workarounds this bug.
3808
3809 However, right solution seems to be not reserving slot #0 for initial
3810 thread because:
3811 (1) there is no magic in slot #0,
3812 (2) we cannot detect initial thread reliably (the first thread which does
3813 serial initialization may be not a real initial thread).
3814 */
3815 capacity = __kmp_threads_capacity;
3816 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3817 --capacity;
3818 }
3819
3820 // If it is not for initializing the hidden helper team, we need to take
3821 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3822 // in __kmp_threads_capacity.
3823 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3824 capacity -= __kmp_hidden_helper_threads_num;
3825 }
3826
3827 /* see if there are too many threads */
3828 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3829 if (__kmp_tp_cached) {
3830 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3831 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3832 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3833 } else {
3834 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3835 __kmp_msg_null);
3836 }
3837 }
3838
3839 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3840 // 0: initial thread, also a regular OpenMP thread.
3841 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3842 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3843 // regular OpenMP threads.
3844 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3845 // Find an available thread slot for hidden helper thread. Slots for hidden
3846 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3847 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3848 gtid <= __kmp_hidden_helper_threads_num;
3849 gtid++)
3850 ;
3851 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3852 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3853 "hidden helper thread: T#%d\n",
3854 gtid));
3855 } else {
3856 /* find an available thread slot */
3857 // Don't reassign the zero slot since we need that to only be used by
3858 // initial thread. Slots for hidden helper threads should also be skipped.
3859 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3860 gtid = 0;
3861 } else {
3862 for (gtid = __kmp_hidden_helper_threads_num + 1;
3863 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3864 ;
3865 }
3866 KA_TRACE(
3867 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3868 KMP_ASSERT(gtid < __kmp_threads_capacity);
3869 }
3870
3871 /* update global accounting */
3872 __kmp_all_nth++;
3873 TCW_4(__kmp_nth, __kmp_nth + 1);
3874
3875 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3876 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3877 if (__kmp_adjust_gtid_mode) {
3878 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3879 if (TCR_4(__kmp_gtid_mode) != 2) {
3880 TCW_4(__kmp_gtid_mode, 2);
3881 }
3882 } else {
3883 if (TCR_4(__kmp_gtid_mode) != 1) {
3884 TCW_4(__kmp_gtid_mode, 1);
3885 }
3886 }
3887 }
3888
3889#ifdef KMP_ADJUST_BLOCKTIME
3890 /* Adjust blocktime to zero if necessary */
3891 /* Middle initialization might not have occurred yet */
3892 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3893 if (__kmp_nth > __kmp_avail_proc) {
3894 __kmp_zero_bt = TRUE;
3895 }
3896 }
3897#endif /* KMP_ADJUST_BLOCKTIME */
3898
3899 /* setup this new hierarchy */
3900 if (!(root = __kmp_root[gtid])) {
3901 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3902 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3903 }
3904
3905#if KMP_STATS_ENABLED
3906 // Initialize stats as soon as possible (right after gtid assignment).
3907 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3908 __kmp_stats_thread_ptr->startLife();
3909 KMP_SET_THREAD_STATE(SERIAL_REGION);
3910 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3911#endif
3912 __kmp_initialize_root(root);
3913
3914 /* setup new root thread structure */
3915 if (root->r.r_uber_thread) {
3916 root_thread = root->r.r_uber_thread;
3917 } else {
3918 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3919 if (__kmp_storage_map) {
3920 __kmp_print_thread_storage_map(root_thread, gtid);
3921 }
3922 root_thread->th.th_info.ds.ds_gtid = gtid;
3923#if OMPT_SUPPORT
3924 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3925#endif
3926 root_thread->th.th_root = root;
3927 if (__kmp_env_consistency_check) {
3928 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3929 }
3930#if USE_FAST_MEMORY
3931 __kmp_initialize_fast_memory(root_thread);
3932#endif /* USE_FAST_MEMORY */
3933
3934#if KMP_USE_BGET
3935 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3936 __kmp_initialize_bget(root_thread);
3937#endif
3938 __kmp_init_random(root_thread); // Initialize random number generator
3939 }
3940
3941 /* setup the serial team held in reserve by the root thread */
3942 if (!root_thread->th.th_serial_team) {
3943 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3944 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3945 root_thread->th.th_serial_team =
3946 __kmp_allocate_team(root, 1, 1,
3947#if OMPT_SUPPORT
3948 ompt_data_none, // root parallel id
3949#endif
3950 proc_bind_default, &r_icvs, 0, NULL);
3951 }
3952 KMP_ASSERT(root_thread->th.th_serial_team);
3953 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3954 root_thread->th.th_serial_team));
3955
3956 /* drop root_thread into place */
3957 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3958
3959 root->r.r_root_team->t.t_threads[0] = root_thread;
3960 root->r.r_hot_team->t.t_threads[0] = root_thread;
3961 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3962 // AC: the team created in reserve, not for execution (it is unused for now).
3963 root_thread->th.th_serial_team->t.t_serialized = 0;
3964 root->r.r_uber_thread = root_thread;
3965
3966 /* initialize the thread, get it ready to go */
3967 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3968 TCW_4(__kmp_init_gtid, TRUE);
3969
3970 /* prepare the primary thread for get_gtid() */
3971 __kmp_gtid_set_specific(gtid);
3972
3973#if USE_ITT_BUILD
3974 __kmp_itt_thread_name(gtid);
3975#endif /* USE_ITT_BUILD */
3976
3977#ifdef KMP_TDATA_GTID
3978 __kmp_gtid = gtid;
3979#endif
3980 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3981 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3982
3983 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3984 "plain=%u\n",
3985 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3986 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3987 KMP_INIT_BARRIER_STATE));
3988 { // Initialize barrier data.
3989 int b;
3990 for (b = 0; b < bs_last_barrier; ++b) {
3991 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3992#if USE_DEBUGGER
3993 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3994#endif
3995 }
3996 }
3997 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3998 KMP_INIT_BARRIER_STATE);
3999
4000#if KMP_AFFINITY_SUPPORTED
4001 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4002 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4003 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4004 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4005#endif /* KMP_AFFINITY_SUPPORTED */
4006 root_thread->th.th_def_allocator = __kmp_def_allocator;
4007 root_thread->th.th_prev_level = 0;
4008 root_thread->th.th_prev_num_threads = 1;
4009
4010 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4011 tmp->cg_root = root_thread;
4012 tmp->cg_thread_limit = __kmp_cg_max_nth;
4013 tmp->cg_nthreads = 1;
4014 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4015 " cg_nthreads init to 1\n",
4016 root_thread, tmp));
4017 tmp->up = NULL;
4018 root_thread->th.th_cg_roots = tmp;
4019
4020 __kmp_root_counter++;
4021
4022#if OMPT_SUPPORT
4023 if (ompt_enabled.enabled) {
4024
4025 kmp_info_t *root_thread = ompt_get_thread();
4026
4027 ompt_set_thread_state(root_thread, ompt_state_overhead);
4028
4029 if (ompt_enabled.ompt_callback_thread_begin) {
4030 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4031 ompt_thread_initial, __ompt_get_thread_data_internal());
4032 }
4033 ompt_data_t *task_data;
4034 ompt_data_t *parallel_data;
4035 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4036 NULL);
4037 if (ompt_enabled.ompt_callback_implicit_task) {
4038 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4039 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4040 }
4041
4042 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4043 }
4044#endif
4045#if OMPD_SUPPORT
4046 if (ompd_state & OMPD_ENABLE_BP)
4047 ompd_bp_thread_begin();
4048#endif
4049
4050 KMP_MB();
4051 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4052
4053 return gtid;
4054}
4055
4056static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4057 const int max_level) {
4058 int i, n, nth;
4059 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4060 if (!hot_teams || !hot_teams[level].hot_team) {
4061 return 0;
4062 }
4063 KMP_DEBUG_ASSERT(level < max_level);
4064 kmp_team_t *team = hot_teams[level].hot_team;
4065 nth = hot_teams[level].hot_team_nth;
4066 n = nth - 1; // primary thread is not freed
4067 if (level < max_level - 1) {
4068 for (i = 0; i < nth; ++i) {
4069 kmp_info_t *th = team->t.t_threads[i];
4070 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4071 if (i > 0 && th->th.th_hot_teams) {
4072 __kmp_free(th->th.th_hot_teams);
4073 th->th.th_hot_teams = NULL;
4074 }
4075 }
4076 }
4077 __kmp_free_team(root, team, NULL);
4078 return n;
4079}
4080
4081// Resets a root thread and clear its root and hot teams.
4082// Returns the number of __kmp_threads entries directly and indirectly freed.
4083static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4084 kmp_team_t *root_team = root->r.r_root_team;
4085 kmp_team_t *hot_team = root->r.r_hot_team;
4086 int n = hot_team->t.t_nproc;
4087 int i;
4088
4089 KMP_DEBUG_ASSERT(!root->r.r_active);
4090
4091 root->r.r_root_team = NULL;
4092 root->r.r_hot_team = NULL;
4093 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4094 // before call to __kmp_free_team().
4095 __kmp_free_team(root, root_team, NULL);
4096 if (__kmp_hot_teams_max_level >
4097 0) { // need to free nested hot teams and their threads if any
4098 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4099 kmp_info_t *th = hot_team->t.t_threads[i];
4100 if (__kmp_hot_teams_max_level > 1) {
4101 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4102 }
4103 if (th->th.th_hot_teams) {
4104 __kmp_free(th->th.th_hot_teams);
4105 th->th.th_hot_teams = NULL;
4106 }
4107 }
4108 }
4109 __kmp_free_team(root, hot_team, NULL);
4110
4111 // Before we can reap the thread, we need to make certain that all other
4112 // threads in the teams that had this root as ancestor have stopped trying to
4113 // steal tasks.
4114 if (__kmp_tasking_mode != tskm_immediate_exec) {
4115 __kmp_wait_to_unref_task_teams();
4116 }
4117
4118#if KMP_OS_WINDOWS
4119 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4120 KA_TRACE(
4121 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4122 "\n",
4123 (LPVOID) & (root->r.r_uber_thread->th),
4124 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4125 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4126#endif /* KMP_OS_WINDOWS */
4127
4128#if OMPD_SUPPORT
4129 if (ompd_state & OMPD_ENABLE_BP)
4130 ompd_bp_thread_end();
4131#endif
4132
4133#if OMPT_SUPPORT
4134 ompt_data_t *task_data;
4135 ompt_data_t *parallel_data;
4136 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4137 NULL);
4138 if (ompt_enabled.ompt_callback_implicit_task) {
4139 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4140 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4141 }
4142 if (ompt_enabled.ompt_callback_thread_end) {
4143 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4144 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4145 }
4146#endif
4147
4148 TCW_4(__kmp_nth,
4149 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4150 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4151 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4152 " to %d\n",
4153 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4154 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4155 if (i == 1) {
4156 // need to free contention group structure
4157 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4158 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4159 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4160 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4161 root->r.r_uber_thread->th.th_cg_roots = NULL;
4162 }
4163 __kmp_reap_thread(root->r.r_uber_thread, 1);
4164
4165 // We canot put root thread to __kmp_thread_pool, so we have to reap it
4166 // instead of freeing.
4167 root->r.r_uber_thread = NULL;
4168 /* mark root as no longer in use */
4169 root->r.r_begin = FALSE;
4170
4171 return n;
4172}
4173
4174void __kmp_unregister_root_current_thread(int gtid) {
4175 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4176 /* this lock should be ok, since unregister_root_current_thread is never
4177 called during an abort, only during a normal close. furthermore, if you
4178 have the forkjoin lock, you should never try to get the initz lock */
4179 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4180 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4181 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4182 "exiting T#%d\n",
4183 gtid));
4184 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4185 return;
4186 }
4187 kmp_root_t *root = __kmp_root[gtid];
4188
4189 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4190 KMP_ASSERT(KMP_UBER_GTID(gtid));
4191 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4192 KMP_ASSERT(root->r.r_active == FALSE);
4193
4194 KMP_MB();
4195
4196 kmp_info_t *thread = __kmp_threads[gtid];
4197 kmp_team_t *team = thread->th.th_team;
4198 kmp_task_team_t *task_team = thread->th.th_task_team;
4199
4200 // we need to wait for the proxy tasks before finishing the thread
4201 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4202 task_team->tt.tt_hidden_helper_task_encountered)) {
4203#if OMPT_SUPPORT
4204 // the runtime is shutting down so we won't report any events
4205 thread->th.ompt_thread_info.state = ompt_state_undefined;
4206#endif
4207 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4208 }
4209
4210 __kmp_reset_root(gtid, root);
4211
4212 KMP_MB();
4213 KC_TRACE(10,
4214 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4215
4216 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4217}
4218
#if KMP_OS_WINDOWS
/* Unregister the root thread in slot `gtid` on behalf of another thread (the
   caller is not that root). __kmp_forkjoin_lock must already be held.
   Returns the number of __kmp_threads entries freed as a result. */
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));

  // Sanity checks: gtid must name a live, inactive root owned by this slot.
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  const int freed = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return freed;
}
#endif
4239
#if KMP_DEBUG
// Debug helper: print one line describing the calling thread -- its gtid and
// tid, its thread structure, current and serial teams, its current task and
// the parent of its implicit task in the current team.
void __kmp_task_info() {

  const kmp_int32 gtid = __kmp_entry_gtid();
  const kmp_int32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *self = __kmp_threads[gtid];
  kmp_team_t *serial_team = self->th.th_serial_team;
  kmp_team_t *cur_team = self->th.th_team;

  __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p "
               "team=%p steam=%p curtask=%p ptask=%p\n",
               gtid, tid, self, cur_team, serial_team,
               self->th.th_current_task,
               cur_team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG
4256
4257/* TODO optimize with one big memclr, take out what isn't needed, split
4258 responsibility to workers as much as possible, and delay initialization of
4259 features as much as possible */
4260static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4261 int tid, int gtid) {
4262 /* this_thr->th.th_info.ds.ds_gtid is setup in
4263 kmp_allocate_thread/create_worker.
4264 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4265 KMP_DEBUG_ASSERT(this_thr != NULL);
4266 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4267 KMP_DEBUG_ASSERT(team);
4268 KMP_DEBUG_ASSERT(team->t.t_threads);
4269 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4270 kmp_info_t *master = team->t.t_threads[0];
4271 KMP_DEBUG_ASSERT(master);
4272 KMP_DEBUG_ASSERT(master->th.th_root);
4273
4274 KMP_MB();
4275
4276 TCW_SYNC_PTR(this_thr->th.th_team, team);
4277
4278 this_thr->th.th_info.ds.ds_tid = tid;
4279 this_thr->th.th_set_nproc = 0;
4280 if (__kmp_tasking_mode != tskm_immediate_exec)
4281 // When tasking is possible, threads are not safe to reap until they are
4282 // done tasking; this will be set when tasking code is exited in wait
4283 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4284 else // no tasking --> always safe to reap
4285 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4286 this_thr->th.th_set_proc_bind = proc_bind_default;
4287
4288#if KMP_AFFINITY_SUPPORTED
4289 this_thr->th.th_new_place = this_thr->th.th_current_place;
4290#endif
4291 this_thr->th.th_root = master->th.th_root;
4292
4293 /* setup the thread's cache of the team structure */
4294 this_thr->th.th_team_nproc = team->t.t_nproc;
4295 this_thr->th.th_team_master = master;
4296 this_thr->th.th_team_serialized = team->t.t_serialized;
4297
4298 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4299
4300 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4301 tid, gtid, this_thr, this_thr->th.th_current_task));
4302
4303 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4304 team, tid, TRUE);
4305
4306 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4307 tid, gtid, this_thr, this_thr->th.th_current_task));
4308 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4309 // __kmp_initialize_team()?
4310
4311 /* TODO no worksharing in speculative threads */
4312 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4313
4314 this_thr->th.th_local.this_construct = 0;
4315
4316 if (!this_thr->th.th_pri_common) {
4317 this_thr->th.th_pri_common =
4318 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4319 if (__kmp_storage_map) {
4320 __kmp_print_storage_map_gtid(
4321 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4322 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4323 }
4324 this_thr->th.th_pri_head = NULL;
4325 }
4326
4327 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4328 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4329 // Make new thread's CG root same as primary thread's
4330 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4331 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4332 if (tmp) {
4333 // worker changes CG, need to check if old CG should be freed
4334 int i = tmp->cg_nthreads--;
4335 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4336 " on node %p of thread %p to %d\n",
4337 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4338 if (i == 1) {
4339 __kmp_free(tmp); // last thread left CG --> free it
4340 }
4341 }
4342 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4343 // Increment new thread's CG root's counter to add the new thread
4344 this_thr->th.th_cg_roots->cg_nthreads++;
4345 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4346 " node %p of thread %p to %d\n",
4347 this_thr, this_thr->th.th_cg_roots,
4348 this_thr->th.th_cg_roots->cg_root,
4349 this_thr->th.th_cg_roots->cg_nthreads));
4350 this_thr->th.th_current_task->td_icvs.thread_limit =
4351 this_thr->th.th_cg_roots->cg_thread_limit;
4352 }
4353
4354 /* Initialize dynamic dispatch */
4355 {
4356 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4357 // Use team max_nproc since this will never change for the team.
4358 size_t disp_size =
4359 sizeof(dispatch_private_info_t) *
4360 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4361 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4362 team->t.t_max_nproc));
4363 KMP_ASSERT(dispatch);
4364 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4365 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4366
4367 dispatch->th_disp_index = 0;
4368 dispatch->th_doacross_buf_idx = 0;
4369 if (!dispatch->th_disp_buffer) {
4370 dispatch->th_disp_buffer =
4371 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4372
4373 if (__kmp_storage_map) {
4374 __kmp_print_storage_map_gtid(
4375 gtid, &dispatch->th_disp_buffer[0],
4376 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4377 ? 1
4378 : __kmp_dispatch_num_buffers],
4379 disp_size,
4380 "th_%d.th_dispatch.th_disp_buffer "
4381 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4382 gtid, team->t.t_id, gtid);
4383 }
4384 } else {
4385 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4386 }
4387
4388 dispatch->th_dispatch_pr_current = 0;
4389 dispatch->th_dispatch_sh_current = 0;
4390
4391 dispatch->th_deo_fcn = 0; /* ORDERED */
4392 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4393 }
4394
4395 this_thr->th.th_next_pool = NULL;
4396
4397 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4398 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4399
4400 KMP_MB();
4401}
4402
4403/* allocate a new thread for the requesting team. this is only called from
4404 within a forkjoin critical section. we will first try to get an available
4405 thread from the thread pool. if none is available, we will fork a new one
4406 assuming we are able to create a new one. this should be assured, as the
4407 caller should check on this first. */
4408kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4409 int new_tid) {
4410 kmp_team_t *serial_team;
4411 kmp_info_t *new_thr;
4412 int new_gtid;
4413
4414 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4415 KMP_DEBUG_ASSERT(root && team);
4416 KMP_MB();
4417
4418 /* first, try to get one from the thread pool unless allocating thread is
4419 * the main hidden helper thread. The hidden helper team should always
4420 * allocate new OS threads. */
4421 if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4422 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4423 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4424 if (new_thr == __kmp_thread_pool_insert_pt) {
4425 __kmp_thread_pool_insert_pt = NULL;
4426 }
4427 TCW_4(new_thr->th.th_in_pool, FALSE);
4428 __kmp_suspend_initialize_thread(new_thr);
4429 __kmp_lock_suspend_mx(new_thr);
4430 if (new_thr->th.th_active_in_pool == TRUE) {
4431 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4432 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4433 new_thr->th.th_active_in_pool = FALSE;
4434 }
4435 __kmp_unlock_suspend_mx(new_thr);
4436
4437 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4438 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4439 KMP_ASSERT(!new_thr->th.th_team);
4440 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4441
4442 /* setup the thread structure */
4443 __kmp_initialize_info(new_thr, team, new_tid,
4444 new_thr->th.th_info.ds.ds_gtid);
4445 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4446
4447 TCW_4(__kmp_nth, __kmp_nth + 1);
4448
4449 new_thr->th.th_task_state = 0;
4450
4451 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4452 // Make sure pool thread has transitioned to waiting on own thread struct
4453 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4454 // Thread activated in __kmp_allocate_team when increasing team size
4455 }
4456
4457#ifdef KMP_ADJUST_BLOCKTIME
4458 /* Adjust blocktime back to zero if necessary */
4459 /* Middle initialization might not have occurred yet */
4460 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4461 if (__kmp_nth > __kmp_avail_proc) {
4462 __kmp_zero_bt = TRUE;
4463 }
4464 }
4465#endif /* KMP_ADJUST_BLOCKTIME */
4466
4467#if KMP_DEBUG
4468 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4469 // KMP_BARRIER_PARENT_FLAG.
4470 int b;
4471 kmp_balign_t *balign = new_thr->th.th_bar;
4472 for (b = 0; b < bs_last_barrier; ++b)
4473 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4474#endif
4475
4476 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4477 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4478
4479 KMP_MB();
4480 return new_thr;
4481 }
4482
4483 /* no, well fork a new one */
4484 KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4485 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4486
4487#if KMP_USE_MONITOR
4488 // If this is the first worker thread the RTL is creating, then also
4489 // launch the monitor thread. We try to do this as early as possible.
4490 if (!TCR_4(__kmp_init_monitor)) {
4491 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4492 if (!TCR_4(__kmp_init_monitor)) {
4493 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4494 TCW_4(__kmp_init_monitor, 1);
4495 __kmp_create_monitor(&__kmp_monitor);
4496 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4497#if KMP_OS_WINDOWS
4498 // AC: wait until monitor has started. This is a fix for CQ232808.
4499 // The reason is that if the library is loaded/unloaded in a loop with
4500 // small (parallel) work in between, then there is high probability that
4501 // monitor thread started after the library shutdown. At shutdown it is
4502 // too late to cope with the problem, because when the primary thread is
4503 // in DllMain (process detach) the monitor has no chances to start (it is
4504 // blocked), and primary thread has no means to inform the monitor that
4505 // the library has gone, because all the memory which the monitor can
4506 // access is going to be released/reset.
4507 while (TCR_4(__kmp_init_monitor) < 2) {
4508 KMP_YIELD(TRUE);
4509 }
4510 KF_TRACE(10, ("after monitor thread has started\n"));
4511#endif
4512 }
4513 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4514 }
4515#endif
4516
4517 KMP_MB();
4518
4519 {
4520 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4521 ? 1
4522 : __kmp_hidden_helper_threads_num + 1;
4523
4524 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4525 ++new_gtid) {
4526 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4527 }
4528
4529 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4530 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4531 }
4532 }
4533
4534 /* allocate space for it. */
4535 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4536
4537 new_thr->th.th_nt_strict = false;
4538 new_thr->th.th_nt_loc = NULL;
4539 new_thr->th.th_nt_sev = severity_fatal;
4540 new_thr->th.th_nt_msg = NULL;
4541
4542 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4543
4544#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4545 // suppress race conditions detection on synchronization flags in debug mode
4546 // this helps to analyze library internals eliminating false positives
4547 __itt_suppress_mark_range(
4548 __itt_suppress_range, __itt_suppress_threading_errors,
4549 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4550 __itt_suppress_mark_range(
4551 __itt_suppress_range, __itt_suppress_threading_errors,
4552 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4553#if KMP_OS_WINDOWS
4554 __itt_suppress_mark_range(
4555 __itt_suppress_range, __itt_suppress_threading_errors,
4556 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4557#else
4558 __itt_suppress_mark_range(__itt_suppress_range,
4559 __itt_suppress_threading_errors,
4560 &new_thr->th.th_suspend_init_count,
4561 sizeof(new_thr->th.th_suspend_init_count));
4562#endif
4563 // TODO: check if we need to also suppress b_arrived flags
4564 __itt_suppress_mark_range(__itt_suppress_range,
4565 __itt_suppress_threading_errors,
4566 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4567 sizeof(new_thr->th.th_bar[0].bb.b_go));
4568 __itt_suppress_mark_range(__itt_suppress_range,
4569 __itt_suppress_threading_errors,
4570 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4571 sizeof(new_thr->th.th_bar[1].bb.b_go));
4572 __itt_suppress_mark_range(__itt_suppress_range,
4573 __itt_suppress_threading_errors,
4574 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4575 sizeof(new_thr->th.th_bar[2].bb.b_go));
4576#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4577 if (__kmp_storage_map) {
4578 __kmp_print_thread_storage_map(new_thr, new_gtid);
4579 }
4580
4581 // add the reserve serialized team, initialized from the team's primary thread
4582 {
4583 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4584 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4585 new_thr->th.th_serial_team = serial_team =
4586 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4587#if OMPT_SUPPORT
4588 ompt_data_none, // root parallel id
4589#endif
4590 proc_bind_default, &r_icvs, 0, NULL);
4591 }
4592 KMP_ASSERT(serial_team);
4593 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4594 // execution (it is unused for now).
4595 serial_team->t.t_threads[0] = new_thr;
4596 KF_TRACE(10,
4597 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4598 new_thr));
4599
4600 /* setup the thread structures */
4601 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4602
4603#if USE_FAST_MEMORY
4604 __kmp_initialize_fast_memory(new_thr);
4605#endif /* USE_FAST_MEMORY */
4606
4607#if KMP_USE_BGET
4608 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4609 __kmp_initialize_bget(new_thr);
4610#endif
4611
4612 __kmp_init_random(new_thr); // Initialize random number generator
4613
4614 /* Initialize these only once when thread is grabbed for a team allocation */
4615 KA_TRACE(20,
4616 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4617 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4618
4619 int b;
4620 kmp_balign_t *balign = new_thr->th.th_bar;
4621 for (b = 0; b < bs_last_barrier; ++b) {
4622 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4623 balign[b].bb.team = NULL;
4624 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4625 balign[b].bb.use_oncore_barrier = 0;
4626 }
4627
4628 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4629 new_thr->th.th_sleep_loc_type = flag_unset;
4630
4631 new_thr->th.th_spin_here = FALSE;
4632 new_thr->th.th_next_waiting = 0;
4633#if KMP_OS_UNIX
4634 new_thr->th.th_blocking = false;
4635#endif
4636
4637#if KMP_AFFINITY_SUPPORTED
4638 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4639 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4640 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4641 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4642#endif
4643 new_thr->th.th_def_allocator = __kmp_def_allocator;
4644 new_thr->th.th_prev_level = 0;
4645 new_thr->th.th_prev_num_threads = 1;
4646
4647 TCW_4(new_thr->th.th_in_pool, FALSE);
4648 new_thr->th.th_active_in_pool = FALSE;
4649 TCW_4(new_thr->th.th_active, TRUE);
4650
4651 new_thr->th.th_set_nested_nth = NULL;
4652 new_thr->th.th_set_nested_nth_sz = 0;
4653
4654 /* adjust the global counters */
4655 __kmp_all_nth++;
4656 __kmp_nth++;
4657
4658 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4659 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4660 if (__kmp_adjust_gtid_mode) {
4661 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4662 if (TCR_4(__kmp_gtid_mode) != 2) {
4663 TCW_4(__kmp_gtid_mode, 2);
4664 }
4665 } else {
4666 if (TCR_4(__kmp_gtid_mode) != 1) {
4667 TCW_4(__kmp_gtid_mode, 1);
4668 }
4669 }
4670 }
4671
4672#ifdef KMP_ADJUST_BLOCKTIME
4673 /* Adjust blocktime back to zero if necessary */
4674 /* Middle initialization might not have occurred yet */
4675 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4676 if (__kmp_nth > __kmp_avail_proc) {
4677 __kmp_zero_bt = TRUE;
4678 }
4679 }
4680#endif /* KMP_ADJUST_BLOCKTIME */
4681
4682#if KMP_AFFINITY_SUPPORTED
4683 // Set the affinity and topology information for new thread
4684 __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4685#endif
4686
4687 /* actually fork it and create the new worker thread */
4688 KF_TRACE(
4689 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4690 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4691 KF_TRACE(10,
4692 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4693
4694 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4695 new_gtid));
4696 KMP_MB();
4697 return new_thr;
4698}
4699
4700/* Reinitialize team for reuse.
4701 The hot team code calls this case at every fork barrier, so EPCC barrier
4702 test are extremely sensitive to changes in it, esp. writes to the team
4703 struct, which cause a cache invalidation in all threads.
4704 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4705static void __kmp_reinitialize_team(kmp_team_t *team,
4706 kmp_internal_control_t *new_icvs,
4707 ident_t *loc) {
4708 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4709 team->t.t_threads[0], team));
4710 KMP_DEBUG_ASSERT(team && new_icvs);
4711 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4712 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4713
4714 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4715 // Copy ICVs to the primary thread's implicit taskdata
4716 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4717 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4718
4719 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4720 team->t.t_threads[0], team));
4721}
4722
4723/* Initialize the team data structure.
4724 This assumes the t_threads and t_max_nproc are already set.
4725 Also, we don't touch the arguments */
4726static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4727 kmp_internal_control_t *new_icvs,
4728 ident_t *loc) {
4729 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4730
4731 /* verify */
4732 KMP_DEBUG_ASSERT(team);
4733 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4734 KMP_DEBUG_ASSERT(team->t.t_threads);
4735 KMP_MB();
4736
4737 team->t.t_master_tid = 0; /* not needed */
4738 /* team->t.t_master_bar; not needed */
4739 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4740 team->t.t_nproc = new_nproc;
4741
4742 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4743 team->t.t_next_pool = NULL;
4744 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4745 * up hot team */
4746
4747 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4748 team->t.t_invoke = NULL; /* not needed */
4749
4750 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4751 team->t.t_sched.sched = new_icvs->sched.sched;
4752
4753#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4754 team->t.t_fp_control_saved = FALSE; /* not needed */
4755 team->t.t_x87_fpu_control_word = 0; /* not needed */
4756 team->t.t_mxcsr = 0; /* not needed */
4757#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4758
4759 team->t.t_construct = 0;
4760
4761 team->t.t_ordered.dt.t_value = 0;
4762 team->t.t_master_active = FALSE;
4763
4764#ifdef KMP_DEBUG
4765 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4766#endif
4767#if KMP_OS_WINDOWS
4768 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4769#endif
4770
4771 team->t.t_control_stack_top = NULL;
4772
4773 __kmp_reinitialize_team(team, new_icvs, loc);
4774
4775 KMP_MB();
4776 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4777}
4778
4779#if KMP_AFFINITY_SUPPORTED
4780static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4781 int first, int last, int newp) {
4782 th->th.th_first_place = first;
4783 th->th.th_last_place = last;
4784 th->th.th_new_place = newp;
4785 if (newp != th->th.th_current_place) {
4786 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4787 team->t.t_display_affinity = 1;
4788 // Copy topology information associated with the new place
4789 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4790 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4791 }
4792}
4793
4794// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4795// It calculates the worker + primary thread's partition based upon the parent
4796// thread's partition, and binds each worker to a thread in their partition.
4797// The primary thread's partition should already include its current binding.
4798static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4799 // Do not partition places for the hidden helper team
4800 if (KMP_HIDDEN_HELPER_TEAM(team))
4801 return;
4802 // Copy the primary thread's place partition to the team struct
4803 kmp_info_t *master_th = team->t.t_threads[0];
4804 KMP_DEBUG_ASSERT(master_th != NULL);
4805 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4806 int first_place = master_th->th.th_first_place;
4807 int last_place = master_th->th.th_last_place;
4808 int masters_place = master_th->th.th_current_place;
4809 int num_masks = __kmp_affinity.num_masks;
4810 team->t.t_first_place = first_place;
4811 team->t.t_last_place = last_place;
4812
4813 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4814 "bound to place %d partition = [%d,%d]\n",
4815 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4816 team->t.t_id, masters_place, first_place, last_place));
4817
4818 switch (proc_bind) {
4819
4820 case proc_bind_default:
4821 // Serial teams might have the proc_bind policy set to proc_bind_default.
4822 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4823 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4824 break;
4825
4826 case proc_bind_primary: {
4827 int f;
4828 int n_th = team->t.t_nproc;
4829 for (f = 1; f < n_th; f++) {
4830 kmp_info_t *th = team->t.t_threads[f];
4831 KMP_DEBUG_ASSERT(th != NULL);
4832 __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4833
4834 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4835 "partition = [%d,%d]\n",
4836 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4837 f, masters_place, first_place, last_place));
4838 }
4839 } break;
4840
4841 case proc_bind_close: {
4842 int f;
4843 int n_th = team->t.t_nproc;
4844 int n_places;
4845 if (first_place <= last_place) {
4846 n_places = last_place - first_place + 1;
4847 } else {
4848 n_places = num_masks - first_place + last_place + 1;
4849 }
4850 if (n_th <= n_places) {
4851 int place = masters_place;
4852 for (f = 1; f < n_th; f++) {
4853 kmp_info_t *th = team->t.t_threads[f];
4854 KMP_DEBUG_ASSERT(th != NULL);
4855
4856 if (place == last_place) {
4857 place = first_place;
4858 } else if (place == (num_masks - 1)) {
4859 place = 0;
4860 } else {
4861 place++;
4862 }
4863 __kmp_set_thread_place(team, th, first_place, last_place, place);
4864
4865 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4866 "partition = [%d,%d]\n",
4867 __kmp_gtid_from_thread(team->t.t_threads[f]),
4868 team->t.t_id, f, place, first_place, last_place));
4869 }
4870 } else {
4871 int S, rem, gap, s_count;
4872 S = n_th / n_places;
4873 s_count = 0;
4874 rem = n_th - (S * n_places);
4875 gap = rem > 0 ? n_places / rem : n_places;
4876 int place = masters_place;
4877 int gap_ct = gap;
4878 for (f = 0; f < n_th; f++) {
4879 kmp_info_t *th = team->t.t_threads[f];
4880 KMP_DEBUG_ASSERT(th != NULL);
4881
4882 __kmp_set_thread_place(team, th, first_place, last_place, place);
4883 s_count++;
4884
4885 if ((s_count == S) && rem && (gap_ct == gap)) {
4886 // do nothing, add an extra thread to place on next iteration
4887 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4888 // we added an extra thread to this place; move to next place
4889 if (place == last_place) {
4890 place = first_place;
4891 } else if (place == (num_masks - 1)) {
4892 place = 0;
4893 } else {
4894 place++;
4895 }
4896 s_count = 0;
4897 gap_ct = 1;
4898 rem--;
4899 } else if (s_count == S) { // place full; don't add extra
4900 if (place == last_place) {
4901 place = first_place;
4902 } else if (place == (num_masks - 1)) {
4903 place = 0;
4904 } else {
4905 place++;
4906 }
4907 gap_ct++;
4908 s_count = 0;
4909 }
4910
4911 KA_TRACE(100,
4912 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4913 "partition = [%d,%d]\n",
4914 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4915 th->th.th_new_place, first_place, last_place));
4916 }
4917 KMP_DEBUG_ASSERT(place == masters_place);
4918 }
4919 } break;
4920
4921 case proc_bind_spread: {
4922 int f;
4923 int n_th = team->t.t_nproc;
4924 int n_places;
4925 int thidx;
4926 if (first_place <= last_place) {
4927 n_places = last_place - first_place + 1;
4928 } else {
4929 n_places = num_masks - first_place + last_place + 1;
4930 }
4931 if (n_th <= n_places) {
4932 int place = -1;
4933
4934 if (n_places != num_masks) {
4935 int S = n_places / n_th;
4936 int s_count, rem, gap, gap_ct;
4937
4938 place = masters_place;
4939 rem = n_places - n_th * S;
4940 gap = rem ? n_th / rem : 1;
4941 gap_ct = gap;
4942 thidx = n_th;
4943 if (update_master_only == 1)
4944 thidx = 1;
4945 for (f = 0; f < thidx; f++) {
4946 kmp_info_t *th = team->t.t_threads[f];
4947 KMP_DEBUG_ASSERT(th != NULL);
4948
4949 int fplace = place, nplace = place;
4950 s_count = 1;
4951 while (s_count < S) {
4952 if (place == last_place) {
4953 place = first_place;
4954 } else if (place == (num_masks - 1)) {
4955 place = 0;
4956 } else {
4957 place++;
4958 }
4959 s_count++;
4960 }
4961 if (rem && (gap_ct == gap)) {
4962 if (place == last_place) {
4963 place = first_place;
4964 } else if (place == (num_masks - 1)) {
4965 place = 0;
4966 } else {
4967 place++;
4968 }
4969 rem--;
4970 gap_ct = 0;
4971 }
4972 __kmp_set_thread_place(team, th, fplace, place, nplace);
4973 gap_ct++;
4974
4975 if (place == last_place) {
4976 place = first_place;
4977 } else if (place == (num_masks - 1)) {
4978 place = 0;
4979 } else {
4980 place++;
4981 }
4982
4983 KA_TRACE(100,
4984 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4985 "partition = [%d,%d], num_masks: %u\n",
4986 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4987 f, th->th.th_new_place, th->th.th_first_place,
4988 th->th.th_last_place, num_masks));
4989 }
4990 } else {
4991 /* Having uniform space of available computation places I can create
4992 T partitions of round(P/T) size and put threads into the first
4993 place of each partition. */
4994 double current = static_cast<double>(masters_place);
4995 double spacing =
4996 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4997 int first, last;
4998 kmp_info_t *th;
4999
5000 thidx = n_th + 1;
5001 if (update_master_only == 1)
5002 thidx = 1;
5003 for (f = 0; f < thidx; f++) {
5004 first = static_cast<int>(current);
5005 last = static_cast<int>(current + spacing) - 1;
5006 KMP_DEBUG_ASSERT(last >= first);
5007 if (first >= n_places) {
5008 if (masters_place) {
5009 first -= n_places;
5010 last -= n_places;
5011 if (first == (masters_place + 1)) {
5012 KMP_DEBUG_ASSERT(f == n_th);
5013 first--;
5014 }
5015 if (last == masters_place) {
5016 KMP_DEBUG_ASSERT(f == (n_th - 1));
5017 last--;
5018 }
5019 } else {
5020 KMP_DEBUG_ASSERT(f == n_th);
5021 first = 0;
5022 last = 0;
5023 }
5024 }
5025 if (last >= n_places) {
5026 last = (n_places - 1);
5027 }
5028 place = first;
5029 current += spacing;
5030 if (f < n_th) {
5031 KMP_DEBUG_ASSERT(0 <= first);
5032 KMP_DEBUG_ASSERT(n_places > first);
5033 KMP_DEBUG_ASSERT(0 <= last);
5034 KMP_DEBUG_ASSERT(n_places > last);
5035 KMP_DEBUG_ASSERT(last_place >= first_place);
5036 th = team->t.t_threads[f];
5037 KMP_DEBUG_ASSERT(th);
5038 __kmp_set_thread_place(team, th, first, last, place);
5039 KA_TRACE(100,
5040 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5041 "partition = [%d,%d], spacing = %.4f\n",
5042 __kmp_gtid_from_thread(team->t.t_threads[f]),
5043 team->t.t_id, f, th->th.th_new_place,
5044 th->th.th_first_place, th->th.th_last_place, spacing));
5045 }
5046 }
5047 }
5048 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5049 } else {
5050 int S, rem, gap, s_count;
5051 S = n_th / n_places;
5052 s_count = 0;
5053 rem = n_th - (S * n_places);
5054 gap = rem > 0 ? n_places / rem : n_places;
5055 int place = masters_place;
5056 int gap_ct = gap;
5057 thidx = n_th;
5058 if (update_master_only == 1)
5059 thidx = 1;
5060 for (f = 0; f < thidx; f++) {
5061 kmp_info_t *th = team->t.t_threads[f];
5062 KMP_DEBUG_ASSERT(th != NULL);
5063
5064 __kmp_set_thread_place(team, th, place, place, place);
5065 s_count++;
5066
5067 if ((s_count == S) && rem && (gap_ct == gap)) {
5068 // do nothing, add an extra thread to place on next iteration
5069 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5070 // we added an extra thread to this place; move on to next place
5071 if (place == last_place) {
5072 place = first_place;
5073 } else if (place == (num_masks - 1)) {
5074 place = 0;
5075 } else {
5076 place++;
5077 }
5078 s_count = 0;
5079 gap_ct = 1;
5080 rem--;
5081 } else if (s_count == S) { // place is full; don't add extra thread
5082 if (place == last_place) {
5083 place = first_place;
5084 } else if (place == (num_masks - 1)) {
5085 place = 0;
5086 } else {
5087 place++;
5088 }
5089 gap_ct++;
5090 s_count = 0;
5091 }
5092
5093 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5094 "partition = [%d,%d]\n",
5095 __kmp_gtid_from_thread(team->t.t_threads[f]),
5096 team->t.t_id, f, th->th.th_new_place,
5097 th->th.th_first_place, th->th.th_last_place));
5098 }
5099 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5100 }
5101 } break;
5102
5103 default:
5104 break;
5105 }
5106
5107 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5108}
5109
5110#endif // KMP_AFFINITY_SUPPORTED
5111
5112/* allocate a new team data structure to use. take one off of the free pool if
5113 available */
5114kmp_team_t *__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5115#if OMPT_SUPPORT
5116 ompt_data_t ompt_parallel_data,
5117#endif
5118 kmp_proc_bind_t new_proc_bind,
5119 kmp_internal_control_t *new_icvs, int argc,
5120 kmp_info_t *master) {
5121 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5122 int f;
5123 kmp_team_t *team;
5124 int use_hot_team = !root->r.r_active;
5125 int level = 0;
5126 int do_place_partition = 1;
5127
5128 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5129 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5130 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5131 KMP_MB();
5132
5133 kmp_hot_team_ptr_t *hot_teams;
5134 if (master) {
5135 team = master->th.th_team;
5136 level = team->t.t_active_level;
5137 if (master->th.th_teams_microtask) { // in teams construct?
5138 if (master->th.th_teams_size.nteams > 1 &&
5139 ( // #teams > 1
5140 team->t.t_pkfn ==
5141 (microtask_t)__kmp_teams_master || // inner fork of the teams
5142 master->th.th_teams_level <
5143 team->t.t_level)) { // or nested parallel inside the teams
5144 ++level; // not increment if #teams==1, or for outer fork of the teams;
5145 // increment otherwise
5146 }
5147 // Do not perform the place partition if inner fork of the teams
5148 // Wait until nested parallel region encountered inside teams construct
5149 if ((master->th.th_teams_size.nteams == 1 &&
5150 master->th.th_teams_level >= team->t.t_level) ||
5151 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5152 do_place_partition = 0;
5153 }
5154 hot_teams = master->th.th_hot_teams;
5155 if (level < __kmp_hot_teams_max_level && hot_teams &&
5156 hot_teams[level].hot_team) {
5157 // hot team has already been allocated for given level
5158 use_hot_team = 1;
5159 } else {
5160 use_hot_team = 0;
5161 }
5162 } else {
5163 // check we won't access uninitialized hot_teams, just in case
5164 KMP_DEBUG_ASSERT(new_nproc == 1);
5165 }
5166 // Optimization to use a "hot" team
5167 if (use_hot_team && new_nproc > 1) {
5168 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5169 team = hot_teams[level].hot_team;
5170#if KMP_DEBUG
5171 if (__kmp_tasking_mode != tskm_immediate_exec) {
5172 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5173 "task_team[1] = %p before reinit\n",
5174 team->t.t_task_team[0], team->t.t_task_team[1]));
5175 }
5176#endif
5177
5178 if (team->t.t_nproc != new_nproc &&
5179 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5180 // Distributed barrier may need a resize
5181 int old_nthr = team->t.t_nproc;
5182 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5183 }
5184
5185 // If not doing the place partition, then reset the team's proc bind
5186 // to indicate that partitioning of all threads still needs to take place
5187 if (do_place_partition == 0)
5188 team->t.t_proc_bind = proc_bind_default;
5189 // Has the number of threads changed?
5190 /* Let's assume the most common case is that the number of threads is
5191 unchanged, and put that case first. */
5192 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5193 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5194 // This case can mean that omp_set_num_threads() was called and the hot
5195 // team size was already reduced, so we check the special flag
5196 if (team->t.t_size_changed == -1) {
5197 team->t.t_size_changed = 1;
5198 } else {
5199 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5200 }
5201
5202 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5203 kmp_r_sched_t new_sched = new_icvs->sched;
5204 // set primary thread's schedule as new run-time schedule
5205 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5206
5207 __kmp_reinitialize_team(team, new_icvs,
5208 root->r.r_uber_thread->th.th_ident);
5209
5210 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5211 team->t.t_threads[0], team));
5212 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5213
5214#if KMP_AFFINITY_SUPPORTED
5215 if ((team->t.t_size_changed == 0) &&
5216 (team->t.t_proc_bind == new_proc_bind)) {
5217 if (new_proc_bind == proc_bind_spread) {
5218 if (do_place_partition) {
5219 // add flag to update only master for spread
5220 __kmp_partition_places(team, 1);
5221 }
5222 }
5223 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5224 "proc_bind = %d, partition = [%d,%d]\n",
5225 team->t.t_id, new_proc_bind, team->t.t_first_place,
5226 team->t.t_last_place));
5227 } else {
5228 if (do_place_partition) {
5229 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5230 __kmp_partition_places(team);
5231 }
5232 }
5233#else
5234 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5235#endif /* KMP_AFFINITY_SUPPORTED */
5236 } else if (team->t.t_nproc > new_nproc) {
5237 KA_TRACE(20,
5238 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5239 new_nproc));
5240
5241 team->t.t_size_changed = 1;
5242 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5243 // Barrier size already reduced earlier in this function
5244 // Activate team threads via th_used_in_team
5245 __kmp_add_threads_to_team(team, new_nproc);
5246 }
5247 // When decreasing team size, threads no longer in the team should
5248 // unref task team.
5249 if (__kmp_tasking_mode != tskm_immediate_exec) {
5250 for (f = new_nproc; f < team->t.t_nproc; f++) {
5251 kmp_info_t *th = team->t.t_threads[f];
5252 KMP_DEBUG_ASSERT(th);
5253 th->th.th_task_team = NULL;
5254 }
5255 }
5256 if (__kmp_hot_teams_mode == 0) {
5257 // AC: saved number of threads should correspond to team's value in this
5258 // mode, can be bigger in mode 1, when hot team has threads in reserve
5259 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5260 hot_teams[level].hot_team_nth = new_nproc;
5261 /* release the extra threads we don't need any more */
5262 for (f = new_nproc; f < team->t.t_nproc; f++) {
5263 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5264 __kmp_free_thread(team->t.t_threads[f]);
5265 team->t.t_threads[f] = NULL;
5266 }
5267 } // (__kmp_hot_teams_mode == 0)
5268 else {
5269 // When keeping extra threads in team, switch threads to wait on own
5270 // b_go flag
5271 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5272 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5273 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5274 for (int b = 0; b < bs_last_barrier; ++b) {
5275 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5276 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5277 }
5278 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5279 }
5280 }
5281 }
5282 team->t.t_nproc = new_nproc;
5283 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5284 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5285 __kmp_reinitialize_team(team, new_icvs,
5286 root->r.r_uber_thread->th.th_ident);
5287
5288 // Update remaining threads
5289 for (f = 0; f < new_nproc; ++f) {
5290 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5291 }
5292
5293 // restore the current task state of the primary thread: should be the
5294 // implicit task
5295 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5296 team->t.t_threads[0], team));
5297
5298 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5299
5300#ifdef KMP_DEBUG
5301 for (f = 0; f < team->t.t_nproc; f++) {
5302 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5303 team->t.t_threads[f]->th.th_team_nproc ==
5304 team->t.t_nproc);
5305 }
5306#endif
5307
5308 if (do_place_partition) {
5309 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5310#if KMP_AFFINITY_SUPPORTED
5311 __kmp_partition_places(team);
5312#endif
5313 }
5314 } else { // team->t.t_nproc < new_nproc
5315
5316 KA_TRACE(20,
5317 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5318 new_nproc));
5319 int old_nproc = team->t.t_nproc; // save old value and use to update only
5320 team->t.t_size_changed = 1;
5321
5322 int avail_threads = hot_teams[level].hot_team_nth;
5323 if (new_nproc < avail_threads)
5324 avail_threads = new_nproc;
5325 kmp_info_t **other_threads = team->t.t_threads;
5326 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5327 // Adjust barrier data of reserved threads (if any) of the team
5328 // Other data will be set in __kmp_initialize_info() below.
5329 int b;
5330 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5331 for (b = 0; b < bs_last_barrier; ++b) {
5332 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5333 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5334#if USE_DEBUGGER
5335 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5336#endif
5337 }
5338 }
5339 if (hot_teams[level].hot_team_nth >= new_nproc) {
5340 // we have all needed threads in reserve, no need to allocate any
5341 // this only possible in mode 1, cannot have reserved threads in mode 0
5342 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5343 team->t.t_nproc = new_nproc; // just get reserved threads involved
5344 } else {
5345 // We may have some threads in reserve, but not enough;
5346 // get reserved threads involved if any.
5347 team->t.t_nproc = hot_teams[level].hot_team_nth;
5348 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5349 if (team->t.t_max_nproc < new_nproc) {
5350 /* reallocate larger arrays */
5351 __kmp_reallocate_team_arrays(team, new_nproc);
5352 __kmp_reinitialize_team(team, new_icvs, NULL);
5353 }
5354
5355#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5356 KMP_AFFINITY_SUPPORTED
5357 /* Temporarily set full mask for primary thread before creation of
5358 workers. The reason is that workers inherit the affinity from the
5359 primary thread, so if a lot of workers are created on the single
5360 core quickly, they don't get a chance to set their own affinity for
5361 a long time. */
5362 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5363#endif
5364
5365 /* allocate new threads for the hot team */
5366 for (f = team->t.t_nproc; f < new_nproc; f++) {
5367 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5368 KMP_DEBUG_ASSERT(new_worker);
5369 team->t.t_threads[f] = new_worker;
5370
5371 KA_TRACE(20,
5372 ("__kmp_allocate_team: team %d init T#%d arrived: "
5373 "join=%llu, plain=%llu\n",
5374 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5375 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5376 team->t.t_bar[bs_plain_barrier].b_arrived));
5377
5378 { // Initialize barrier data for new threads.
5379 int b;
5380 kmp_balign_t *balign = new_worker->th.th_bar;
5381 for (b = 0; b < bs_last_barrier; ++b) {
5382 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5383 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5384 KMP_BARRIER_PARENT_FLAG);
5385#if USE_DEBUGGER
5386 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5387#endif
5388 }
5389 }
5390 }
5391
5392#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5393 KMP_AFFINITY_SUPPORTED
5394 /* Restore initial primary thread's affinity mask */
5395 new_temp_affinity.restore();
5396#endif
5397 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5398 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5399 // Barrier size already increased earlier in this function
5400 // Activate team threads via th_used_in_team
5401 __kmp_add_threads_to_team(team, new_nproc);
5402 }
5403 /* make sure everyone is syncronized */
5404 // new threads below
5405 __kmp_initialize_team(team, new_nproc, new_icvs,
5406 root->r.r_uber_thread->th.th_ident);
5407
5408 /* reinitialize the threads */
5409 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5410 for (f = 0; f < team->t.t_nproc; ++f)
5411 __kmp_initialize_info(team->t.t_threads[f], team, f,
5412 __kmp_gtid_from_tid(f, team));
5413
5414 // set th_task_state for new threads in hot team with older thread's state
5415 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5416 for (f = old_nproc; f < team->t.t_nproc; ++f)
5417 team->t.t_threads[f]->th.th_task_state = old_state;
5418
5419#ifdef KMP_DEBUG
5420 for (f = 0; f < team->t.t_nproc; ++f) {
5421 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5422 team->t.t_threads[f]->th.th_team_nproc ==
5423 team->t.t_nproc);
5424 }
5425#endif
5426
5427 if (do_place_partition) {
5428 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5429#if KMP_AFFINITY_SUPPORTED
5430 __kmp_partition_places(team);
5431#endif
5432 }
5433 } // Check changes in number of threads
5434
5435 if (master->th.th_teams_microtask) {
5436 for (f = 1; f < new_nproc; ++f) {
5437 // propagate teams construct specific info to workers
5438 kmp_info_t *thr = team->t.t_threads[f];
5439 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5440 thr->th.th_teams_level = master->th.th_teams_level;
5441 thr->th.th_teams_size = master->th.th_teams_size;
5442 }
5443 }
5444 if (level) {
5445 // Sync barrier state for nested hot teams, not needed for outermost hot
5446 // team.
5447 for (f = 1; f < new_nproc; ++f) {
5448 kmp_info_t *thr = team->t.t_threads[f];
5449 int b;
5450 kmp_balign_t *balign = thr->th.th_bar;
5451 for (b = 0; b < bs_last_barrier; ++b) {
5452 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5453 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5454#if USE_DEBUGGER
5455 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5456#endif
5457 }
5458 }
5459 }
5460
5461 /* reallocate space for arguments if necessary */
5462 __kmp_alloc_argv_entries(argc, team, TRUE);
5463 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5464 // The hot team re-uses the previous task team,
5465 // if untouched during the previous release->gather phase.
5466
5467 KF_TRACE(10, (" hot_team = %p\n", team));
5468
5469#if KMP_DEBUG
5470 if (__kmp_tasking_mode != tskm_immediate_exec) {
5471 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5472 "task_team[1] = %p after reinit\n",
5473 team->t.t_task_team[0], team->t.t_task_team[1]));
5474 }
5475#endif
5476
5477#if OMPT_SUPPORT
5478 __ompt_team_assign_id(team, ompt_parallel_data);
5479#endif
5480
5481 KMP_MB();
5482
5483 return team;
5484 }
5485
5486 /* next, let's try to take one from the team pool */
5487 KMP_MB();
5488 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5489 /* TODO: consider resizing undersized teams instead of reaping them, now
5490 that we have a resizing mechanism */
5491 if (team->t.t_max_nproc >= max_nproc) {
5492 /* take this team from the team pool */
5493 __kmp_team_pool = team->t.t_next_pool;
5494
5495 if (max_nproc > 1 &&
5496 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5497 if (!team->t.b) { // Allocate barrier structure
5498 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5499 }
5500 }
5501
5502 /* setup the team for fresh use */
5503 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5504
5505 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5506 "task_team[1] %p to NULL\n",
5507 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5508 team->t.t_task_team[0] = NULL;
5509 team->t.t_task_team[1] = NULL;
5510
5511 /* reallocate space for arguments if necessary */
5512 __kmp_alloc_argv_entries(argc, team, TRUE);
5513 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5514
5515 KA_TRACE(
5516 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5517 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5518 { // Initialize barrier data.
5519 int b;
5520 for (b = 0; b < bs_last_barrier; ++b) {
5521 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5522#if USE_DEBUGGER
5523 team->t.t_bar[b].b_master_arrived = 0;
5524 team->t.t_bar[b].b_team_arrived = 0;
5525#endif
5526 }
5527 }
5528
5529 team->t.t_proc_bind = new_proc_bind;
5530
5531 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5532 team->t.t_id));
5533
5534#if OMPT_SUPPORT
5535 __ompt_team_assign_id(team, ompt_parallel_data);
5536#endif
5537
5538 team->t.t_nested_nth = NULL;
5539
5540 KMP_MB();
5541
5542 return team;
5543 }
5544
5545 /* reap team if it is too small, then loop back and check the next one */
5546 // not sure if this is wise, but, will be redone during the hot-teams
5547 // rewrite.
5548 /* TODO: Use technique to find the right size hot-team, don't reap them */
5549 team = __kmp_reap_team(team);
5550 __kmp_team_pool = team;
5551 }
5552
5553 /* nothing available in the pool, no matter, make a new team! */
5554 KMP_MB();
5555 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5556
5557 /* and set it up */
5558 team->t.t_max_nproc = max_nproc;
5559 if (max_nproc > 1 &&
5560 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5561 // Allocate barrier structure
5562 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5563 }
5564
5565 /* NOTE well, for some reason allocating one big buffer and dividing it up
5566 seems to really hurt performance a lot on the P4, so, let's not use this */
5567 __kmp_allocate_team_arrays(team, max_nproc);
5568
5569 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5570 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5571
5572 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5573 "%p to NULL\n",
5574 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5575 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5576 // memory, no need to duplicate
5577 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5578 // memory, no need to duplicate
5579
5580 if (__kmp_storage_map) {
5581 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5582 }
5583
5584 /* allocate space for arguments */
5585 __kmp_alloc_argv_entries(argc, team, FALSE);
5586 team->t.t_argc = argc;
5587
5588 KA_TRACE(20,
5589 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5590 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5591 { // Initialize barrier data.
5592 int b;
5593 for (b = 0; b < bs_last_barrier; ++b) {
5594 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5595#if USE_DEBUGGER
5596 team->t.t_bar[b].b_master_arrived = 0;
5597 team->t.t_bar[b].b_team_arrived = 0;
5598#endif
5599 }
5600 }
5601
5602 team->t.t_proc_bind = new_proc_bind;
5603
5604#if OMPT_SUPPORT
5605 __ompt_team_assign_id(team, ompt_parallel_data);
5606 team->t.ompt_serialized_team_info = NULL;
5607#endif
5608
5609 KMP_MB();
5610
5611 team->t.t_nested_nth = NULL;
5612
5613 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5614 team->t.t_id));
5615
5616 return team;
5617}
5618
5619/* TODO implement hot-teams at all levels */
5620/* TODO implement lazy thread release on demand (disband request) */
5621
5622/* free the team. return it to the team pool. release all the threads
5623 * associated with it */
5624void __kmp_free_team(kmp_root_t *root, kmp_team_t *team, kmp_info_t *master) {
5625 int f;
5626 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5627 team->t.t_id));
5628
5629 /* verify state */
5630 KMP_DEBUG_ASSERT(root);
5631 KMP_DEBUG_ASSERT(team);
5632 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5633 KMP_DEBUG_ASSERT(team->t.t_threads);
5634
5635 int use_hot_team = team == root->r.r_hot_team;
5636 int level;
5637 if (master) {
5638 level = team->t.t_active_level - 1;
5639 if (master->th.th_teams_microtask) { // in teams construct?
5640 if (master->th.th_teams_size.nteams > 1) {
5641 ++level; // level was not increased in teams construct for
5642 // team_of_masters
5643 }
5644 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5645 master->th.th_teams_level == team->t.t_level) {
5646 ++level; // level was not increased in teams construct for
5647 // team_of_workers before the parallel
5648 } // team->t.t_level will be increased inside parallel
5649 }
5650#if KMP_DEBUG
5651 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5652#endif
5653 if (level < __kmp_hot_teams_max_level) {
5654 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5655 use_hot_team = 1;
5656 }
5657 }
5658
5659 /* team is done working */
5660 TCW_SYNC_PTR(team->t.t_pkfn,
5661 NULL); // Important for Debugging Support Library.
5662#if KMP_OS_WINDOWS
5663 team->t.t_copyin_counter = 0; // init counter for possible reuse
5664#endif
5665 // Do not reset pointer to parent team to NULL for hot teams.
5666
5667 /* if we are non-hot team, release our threads */
5668 if (!use_hot_team) {
5669 if (__kmp_tasking_mode != tskm_immediate_exec) {
5670 // Wait for threads to reach reapable state
5671 for (f = 1; f < team->t.t_nproc; ++f) {
5672 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5673 kmp_info_t *th = team->t.t_threads[f];
5674 volatile kmp_uint32 *state = &th->th.th_reap_state;
5675 while (*state != KMP_SAFE_TO_REAP) {
5676#if KMP_OS_WINDOWS
5677 // On Windows a thread can be killed at any time, check this
5678 DWORD ecode;
5679 if (!__kmp_is_thread_alive(th, &ecode)) {
5680 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5681 break;
5682 }
5683#endif
5684 // first check if thread is sleeping
5685 if (th->th.th_sleep_loc)
5686 __kmp_null_resume_wrapper(th);
5687 KMP_CPU_PAUSE();
5688 }
5689 }
5690
5691 // Delete task teams
5692 int tt_idx;
5693 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5694 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5695 if (task_team != NULL) {
5696 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5697 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5698 team->t.t_threads[f]->th.th_task_team = NULL;
5699 }
5700 KA_TRACE(
5701 20,
5702 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5703 __kmp_get_gtid(), task_team, team->t.t_id));
5704 __kmp_free_task_team(master, task_team);
5705 team->t.t_task_team[tt_idx] = NULL;
5706 }
5707 }
5708 }
5709
5710 // Before clearing parent pointer, check if nested_nth list should be freed
5711 if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5712 team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5713 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5714 KMP_INTERNAL_FREE(team->t.t_nested_nth);
5715 }
5716 team->t.t_nested_nth = NULL;
5717
5718 // Reset pointer to parent team only for non-hot teams.
5719 team->t.t_parent = NULL;
5720 team->t.t_level = 0;
5721 team->t.t_active_level = 0;
5722
5723 /* free the worker threads */
5724 for (f = 1; f < team->t.t_nproc; ++f) {
5725 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5726 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5727 (void)KMP_COMPARE_AND_STORE_ACQ32(
5728 &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
5729 }
5730 __kmp_free_thread(team->t.t_threads[f]);
5731 }
5732
5733 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5734 if (team->t.b) {
5735 // wake up thread at old location
5736 team->t.b->go_release();
5737 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5738 for (f = 1; f < team->t.t_nproc; ++f) {
5739 if (team->t.b->sleep[f].sleep) {
5740 __kmp_atomic_resume_64(
5741 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5742 (kmp_atomic_flag_64<> *)NULL);
5743 }
5744 }
5745 }
5746 // Wait for threads to be removed from team
5747 for (int f = 1; f < team->t.t_nproc; ++f) {
5748 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5749 KMP_CPU_PAUSE();
5750 }
5751 }
5752 }
5753
5754 for (f = 1; f < team->t.t_nproc; ++f) {
5755 team->t.t_threads[f] = NULL;
5756 }
5757
5758 if (team->t.t_max_nproc > 1 &&
5759 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5760 distributedBarrier::deallocate(team->t.b);
5761 team->t.b = NULL;
5762 }
5763 /* put the team back in the team pool */
5764 /* TODO limit size of team pool, call reap_team if pool too large */
5765 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5766 __kmp_team_pool = (volatile kmp_team_t *)team;
5767 } else { // Check if team was created for primary threads in teams construct
5768 // See if first worker is a CG root
5769 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5770 team->t.t_threads[1]->th.th_cg_roots);
5771 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5772 // Clean up the CG root nodes on workers so that this team can be re-used
5773 for (f = 1; f < team->t.t_nproc; ++f) {
5774 kmp_info_t *thr = team->t.t_threads[f];
5775 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5776 thr->th.th_cg_roots->cg_root == thr);
5777 // Pop current CG root off list
5778 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5779 thr->th.th_cg_roots = tmp->up;
5780 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5781 " up to node %p. cg_nthreads was %d\n",
5782 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5783 int i = tmp->cg_nthreads--;
5784 if (i == 1) {
5785 __kmp_free(tmp); // free CG if we are the last thread in it
5786 }
5787 // Restore current task's thread_limit from CG root
5788 if (thr->th.th_cg_roots)
5789 thr->th.th_current_task->td_icvs.thread_limit =
5790 thr->th.th_cg_roots->cg_thread_limit;
5791 }
5792 }
5793 }
5794
5795 KMP_MB();
5796}
5797
5798/* reap the team. destroy it, reclaim all its resources and free its memory */
5799kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5800 kmp_team_t *next_pool = team->t.t_next_pool;
5801
5802 KMP_DEBUG_ASSERT(team);
5803 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5804 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5805 KMP_DEBUG_ASSERT(team->t.t_threads);
5806 KMP_DEBUG_ASSERT(team->t.t_argv);
5807
5808 /* TODO clean the threads that are a part of this? */
5809
5810 /* free stuff */
5811 __kmp_free_team_arrays(team);
5812 if (team->t.t_argv != &team->t.t_inline_argv[0])
5813 __kmp_free((void *)team->t.t_argv);
5814 __kmp_free(team);
5815
5816 KMP_MB();
5817 return next_pool;
5818}
5819
5820// Free the thread. Don't reap it, just place it on the pool of available
5821// threads.
5822//
5823// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5824// binding for the affinity mechanism to be useful.
5825//
5826// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5827// However, we want to avoid a potential performance problem by always
5828// scanning through the list to find the correct point at which to insert
5829// the thread (potential N**2 behavior). To do this we keep track of the
5830// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5831// With single-level parallelism, threads will always be added to the tail
5832// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5833// parallelism, all bets are off and we may need to scan through the entire
5834// free list.
5835//
5836// This change also has a potentially large performance benefit, for some
5837// applications. Previously, as threads were freed from the hot team, they
5838// would be placed back on the free list in inverse order. If the hot team
5839// grew back to it's original size, then the freed thread would be placed
5840// back on the hot team in reverse order. This could cause bad cache
5841// locality problems on programs where the size of the hot team regularly
5842// grew and shrunk.
5843//
5844// Now, for single-level parallelism, the OMP tid is always == gtid.
5845void __kmp_free_thread(kmp_info_t *this_th) {
5846 int gtid;
5847 kmp_info_t **scan;
5848
5849 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5850 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5851
5852 KMP_DEBUG_ASSERT(this_th);
5853
5854 // When moving thread to pool, switch thread to wait on own b_go flag, and
5855 // uninitialized (NULL team).
5856 int b;
5857 kmp_balign_t *balign = this_th->th.th_bar;
5858 for (b = 0; b < bs_last_barrier; ++b) {
5859 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5860 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5861 balign[b].bb.team = NULL;
5862 balign[b].bb.leaf_kids = 0;
5863 }
5864 this_th->th.th_task_state = 0;
5865 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5866
5867 /* put thread back on the free pool */
5868 TCW_PTR(this_th->th.th_team, NULL);
5869 TCW_PTR(this_th->th.th_root, NULL);
5870 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5871
5872 while (this_th->th.th_cg_roots) {
5873 this_th->th.th_cg_roots->cg_nthreads--;
5874 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5875 " %p of thread %p to %d\n",
5876 this_th, this_th->th.th_cg_roots,
5877 this_th->th.th_cg_roots->cg_root,
5878 this_th->th.th_cg_roots->cg_nthreads));
5879 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5880 if (tmp->cg_root == this_th) { // Thread is a cg_root
5881 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5882 KA_TRACE(
5883 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5884 this_th->th.th_cg_roots = tmp->up;
5885 __kmp_free(tmp);
5886 } else { // Worker thread
5887 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5888 __kmp_free(tmp);
5889 }
5890 this_th->th.th_cg_roots = NULL;
5891 break;
5892 }
5893 }
5894
5895 /* If the implicit task assigned to this thread can be used by other threads
5896 * -> multiple threads can share the data and try to free the task at
5897 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5898 * with higher probability when hot team is disabled but can occurs even when
5899 * the hot team is enabled */
5900 __kmp_free_implicit_task(this_th);
5901 this_th->th.th_current_task = NULL;
5902
5903 // If the __kmp_thread_pool_insert_pt is already past the new insert
5904 // point, then we need to re-scan the entire list.
5905 gtid = this_th->th.th_info.ds.ds_gtid;
5906 if (__kmp_thread_pool_insert_pt != NULL) {
5907 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5908 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5909 __kmp_thread_pool_insert_pt = NULL;
5910 }
5911 }
5912
5913 // Scan down the list to find the place to insert the thread.
5914 // scan is the address of a link in the list, possibly the address of
5915 // __kmp_thread_pool itself.
5916 //
5917 // In the absence of nested parallelism, the for loop will have 0 iterations.
5918 if (__kmp_thread_pool_insert_pt != NULL) {
5919 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5920 } else {
5921 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5922 }
5923 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5924 scan = &((*scan)->th.th_next_pool))
5925 ;
5926
5927 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5928 // to its address.
5929 TCW_PTR(this_th->th.th_next_pool, *scan);
5930 __kmp_thread_pool_insert_pt = *scan = this_th;
5931 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5932 (this_th->th.th_info.ds.ds_gtid <
5933 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5934 TCW_4(this_th->th.th_in_pool, TRUE);
5935 __kmp_suspend_initialize_thread(this_th);
5936 __kmp_lock_suspend_mx(this_th);
5937 if (this_th->th.th_active == TRUE) {
5938 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5939 this_th->th.th_active_in_pool = TRUE;
5940 }
5941#if KMP_DEBUG
5942 else {
5943 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5944 }
5945#endif
5946 __kmp_unlock_suspend_mx(this_th);
5947
5948 TCW_4(__kmp_nth, __kmp_nth - 1);
5949
5950#ifdef KMP_ADJUST_BLOCKTIME
5951 /* Adjust blocktime back to user setting or default if necessary */
5952 /* Middle initialization might never have occurred */
5953 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5954 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5955 if (__kmp_nth <= __kmp_avail_proc) {
5956 __kmp_zero_bt = FALSE;
5957 }
5958 }
5959#endif /* KMP_ADJUST_BLOCKTIME */
5960
5961 KMP_MB();
5962}
5963
5964/* ------------------------------------------------------------------------ */
5965
5966void *__kmp_launch_thread(kmp_info_t *this_thr) {
5967#if OMP_PROFILING_SUPPORT
5968 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5969 // TODO: add a configuration option for time granularity
5970 if (ProfileTraceFile)
5971 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5972#endif
5973
5974 int gtid = this_thr->th.th_info.ds.ds_gtid;
5975 /* void *stack_data;*/
5976 kmp_team_t **volatile pteam;
5977
5978 KMP_MB();
5979 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5980
5981 if (__kmp_env_consistency_check) {
5982 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5983 }
5984
5985#if OMPD_SUPPORT
5986 if (ompd_state & OMPD_ENABLE_BP)
5987 ompd_bp_thread_begin();
5988#endif
5989
5990#if OMPT_SUPPORT
5991 ompt_data_t *thread_data = nullptr;
5992 if (ompt_enabled.enabled) {
5993 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5994 *thread_data = ompt_data_none;
5995
5996 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5997 this_thr->th.ompt_thread_info.wait_id = 0;
5998 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5999 this_thr->th.ompt_thread_info.parallel_flags = 0;
6000 if (ompt_enabled.ompt_callback_thread_begin) {
6001 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6002 ompt_thread_worker, thread_data);
6003 }
6004 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6005 }
6006#endif
6007
6008 /* This is the place where threads wait for work */
6009 while (!TCR_4(__kmp_global.g.g_done)) {
6010 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6011 KMP_MB();
6012
6013 /* wait for work to do */
6014 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6015
6016 /* No tid yet since not part of a team */
6017 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6018
6019#if OMPT_SUPPORT
6020 if (ompt_enabled.enabled) {
6021 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6022 }
6023#endif
6024
6025 pteam = &this_thr->th.th_team;
6026
6027 /* have we been allocated? */
6028 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6029 /* we were just woken up, so run our new task */
6030 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6031 int rc;
6032 KA_TRACE(20,
6033 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6034 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6035 (*pteam)->t.t_pkfn));
6036
6037 updateHWFPControl(*pteam);
6038
6039#if OMPT_SUPPORT
6040 if (ompt_enabled.enabled) {
6041 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6042 }
6043#endif
6044
6045 rc = (*pteam)->t.t_invoke(gtid);
6046 KMP_ASSERT(rc);
6047
6048 KMP_MB();
6049 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6050 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6051 (*pteam)->t.t_pkfn));
6052 }
6053#if OMPT_SUPPORT
6054 if (ompt_enabled.enabled) {
6055 /* no frame set while outside task */
6056 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6057
6058 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6059 }
6060#endif
6061 /* join barrier after parallel region */
6062 __kmp_join_barrier(gtid);
6063 }
6064 }
6065
6066#if OMPD_SUPPORT
6067 if (ompd_state & OMPD_ENABLE_BP)
6068 ompd_bp_thread_end();
6069#endif
6070
6071#if OMPT_SUPPORT
6072 if (ompt_enabled.ompt_callback_thread_end) {
6073 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6074 }
6075#endif
6076
6077 this_thr->th.th_task_team = NULL;
6078 /* run the destructors for the threadprivate data for this thread */
6079 __kmp_common_destroy_gtid(gtid);
6080
6081 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6082 KMP_MB();
6083
6084#if OMP_PROFILING_SUPPORT
6085 llvm::timeTraceProfilerFinishThread();
6086#endif
6087 return this_thr;
6088}
6089
6090/* ------------------------------------------------------------------------ */
6091
6092void __kmp_internal_end_dest(void *specific_gtid) {
6093 // Make sure no significant bits are lost
6094 int gtid;
6095 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6096
6097 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6098 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6099 * this is because 0 is reserved for the nothing-stored case */
6100
6101 __kmp_internal_end_thread(gtid);
6102}
6103
6104#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6105
6106__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6107 __kmp_internal_end_atexit();
6108}
6109
6110#endif
6111
6112/* [Windows] josh: when the atexit handler is called, there may still be more
6113 than one thread alive */
6114void __kmp_internal_end_atexit(void) {
6115 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6116 /* [Windows]
6117 josh: ideally, we want to completely shutdown the library in this atexit
6118 handler, but stat code that depends on thread specific data for gtid fails
6119 because that data becomes unavailable at some point during the shutdown, so
6120 we call __kmp_internal_end_thread instead. We should eventually remove the
6121 dependency on __kmp_get_specific_gtid in the stat code and use
6122 __kmp_internal_end_library to cleanly shutdown the library.
6123
6124 // TODO: Can some of this comment about GVS be removed?
6125 I suspect that the offending stat code is executed when the calling thread
6126 tries to clean up a dead root thread's data structures, resulting in GVS
6127 code trying to close the GVS structures for that thread, but since the stat
6128 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6129 the calling thread is cleaning up itself instead of another thread, it get
6130 confused. This happens because allowing a thread to unregister and cleanup
6131 another thread is a recent modification for addressing an issue.
6132 Based on the current design (20050722), a thread may end up
6133 trying to unregister another thread only if thread death does not trigger
6134 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6135 thread specific data destructor function to detect thread death. For
6136 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6137 is nothing. Thus, the workaround is applicable only for Windows static
6138 stat library. */
6139 __kmp_internal_end_library(-1);
6140#if KMP_OS_WINDOWS
6141 __kmp_close_console();
6142#endif
6143}
6144
6145static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6146 // It is assumed __kmp_forkjoin_lock is acquired.
6147
6148 int gtid;
6149
6150 KMP_DEBUG_ASSERT(thread != NULL);
6151
6152 gtid = thread->th.th_info.ds.ds_gtid;
6153
6154 if (!is_root) {
6155 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6156 /* Assume the threads are at the fork barrier here */
6157 KA_TRACE(
6158 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6159 gtid));
6160 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6161 while (
6162 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6163 KMP_CPU_PAUSE();
6164 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6165 } else {
6166 /* Need release fence here to prevent seg faults for tree forkjoin
6167 barrier (GEH) */
6168 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6169 thread);
6170 __kmp_release_64(&flag);
6171 }
6172 }
6173
6174 // Terminate OS thread.
6175 __kmp_reap_worker(thread);
6176
6177 // The thread was killed asynchronously. If it was actively
6178 // spinning in the thread pool, decrement the global count.
6179 //
6180 // There is a small timing hole here - if the worker thread was just waking
6181 // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6182 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6183 // the global counter might not get updated.
6184 //
6185 // Currently, this can only happen as the library is unloaded,
6186 // so there are no harmful side effects.
6187 if (thread->th.th_active_in_pool) {
6188 thread->th.th_active_in_pool = FALSE;
6189 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6190 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6191 }
6192 }
6193
6194 __kmp_free_implicit_task(thread);
6195
6196// Free the fast memory for tasking
6197#if USE_FAST_MEMORY
6198 __kmp_free_fast_memory(thread);
6199#endif /* USE_FAST_MEMORY */
6200
6201 __kmp_suspend_uninitialize_thread(thread);
6202
6203 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6204 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6205
6206 --__kmp_all_nth;
6207 // __kmp_nth was decremented when thread is added to the pool.
6208
6209#ifdef KMP_ADJUST_BLOCKTIME
6210 /* Adjust blocktime back to user setting or default if necessary */
6211 /* Middle initialization might never have occurred */
6212 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6213 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6214 if (__kmp_nth <= __kmp_avail_proc) {
6215 __kmp_zero_bt = FALSE;
6216 }
6217 }
6218#endif /* KMP_ADJUST_BLOCKTIME */
6219
6220 /* free the memory being used */
6221 if (__kmp_env_consistency_check) {
6222 if (thread->th.th_cons) {
6223 __kmp_free_cons_stack(thread->th.th_cons);
6224 thread->th.th_cons = NULL;
6225 }
6226 }
6227
6228 if (thread->th.th_pri_common != NULL) {
6229 __kmp_free(thread->th.th_pri_common);
6230 thread->th.th_pri_common = NULL;
6231 }
6232
6233#if KMP_USE_BGET
6234 if (thread->th.th_local.bget_data != NULL) {
6235 __kmp_finalize_bget(thread);
6236 }
6237#endif
6238
6239#if KMP_AFFINITY_SUPPORTED
6240 if (thread->th.th_affin_mask != NULL) {
6241 KMP_CPU_FREE(thread->th.th_affin_mask);
6242 thread->th.th_affin_mask = NULL;
6243 }
6244#endif /* KMP_AFFINITY_SUPPORTED */
6245
6246#if KMP_USE_HIER_SCHED
6247 if (thread->th.th_hier_bar_data != NULL) {
6248 __kmp_free(thread->th.th_hier_bar_data);
6249 thread->th.th_hier_bar_data = NULL;
6250 }
6251#endif
6252
6253 __kmp_reap_team(thread->th.th_serial_team);
6254 thread->th.th_serial_team = NULL;
6255 __kmp_free(thread);
6256
6257 KMP_MB();
6258
6259} // __kmp_reap_thread
6260
6261static void __kmp_itthash_clean(kmp_info_t *th) {
6262#if USE_ITT_NOTIFY
6263 if (__kmp_itt_region_domains.count > 0) {
6264 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6265 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6266 while (bucket) {
6267 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6268 __kmp_thread_free(th, bucket);
6269 bucket = next;
6270 }
6271 }
6272 }
6273 if (__kmp_itt_barrier_domains.count > 0) {
6274 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6275 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6276 while (bucket) {
6277 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6278 __kmp_thread_free(th, bucket);
6279 bucket = next;
6280 }
6281 }
6282 }
6283#endif
6284}
6285
6286static void __kmp_internal_end(void) {
6287 int i;
6288
6289 /* First, unregister the library */
6290 __kmp_unregister_library();
6291
6292#if KMP_OS_WINDOWS
6293 /* In Win static library, we can't tell when a root actually dies, so we
6294 reclaim the data structures for any root threads that have died but not
6295 unregistered themselves, in order to shut down cleanly.
6296 In Win dynamic library we also can't tell when a thread dies. */
6297 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6298// dead roots
6299#endif
6300
6301 for (i = 0; i < __kmp_threads_capacity; i++)
6302 if (__kmp_root[i])
6303 if (__kmp_root[i]->r.r_active)
6304 break;
6305 KMP_MB(); /* Flush all pending memory write invalidates. */
6306 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6307
6308 if (i < __kmp_threads_capacity) {
6309#if KMP_USE_MONITOR
6310 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6311 KMP_MB(); /* Flush all pending memory write invalidates. */
6312
6313 // Need to check that monitor was initialized before reaping it. If we are
6314 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6315 // __kmp_monitor will appear to contain valid data, but it is only valid in
6316 // the parent process, not the child.
6317 // New behavior (201008): instead of keying off of the flag
6318 // __kmp_init_parallel, the monitor thread creation is keyed off
6319 // of the new flag __kmp_init_monitor.
6320 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6321 if (TCR_4(__kmp_init_monitor)) {
6322 __kmp_reap_monitor(&__kmp_monitor);
6323 TCW_4(__kmp_init_monitor, 0);
6324 }
6325 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6326 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6327#endif // KMP_USE_MONITOR
6328 } else {
6329/* TODO move this to cleanup code */
6330#ifdef KMP_DEBUG
6331 /* make sure that everything has properly ended */
6332 for (i = 0; i < __kmp_threads_capacity; i++) {
6333 if (__kmp_root[i]) {
6334 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6335 // there can be uber threads alive here
6336 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6337 }
6338 }
6339#endif
6340
6341 KMP_MB();
6342
6343 // Reap the worker threads.
6344 // This is valid for now, but be careful if threads are reaped sooner.
6345 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6346 // Get the next thread from the pool.
6347 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6348 __kmp_thread_pool = thread->th.th_next_pool;
6349 // Reap it.
6350 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6351 thread->th.th_next_pool = NULL;
6352 thread->th.th_in_pool = FALSE;
6353 __kmp_reap_thread(thread, 0);
6354 }
6355 __kmp_thread_pool_insert_pt = NULL;
6356
6357 // Reap teams.
6358 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6359 // Get the next team from the pool.
6360 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6361 __kmp_team_pool = team->t.t_next_pool;
6362 // Reap it.
6363 team->t.t_next_pool = NULL;
6364 __kmp_reap_team(team);
6365 }
6366
6367 __kmp_reap_task_teams();
6368
6369#if KMP_OS_UNIX
6370 // Threads that are not reaped should not access any resources since they
6371 // are going to be deallocated soon, so the shutdown sequence should wait
6372 // until all threads either exit the final spin-waiting loop or begin
6373 // sleeping after the given blocktime.
6374 for (i = 0; i < __kmp_threads_capacity; i++) {
6375 kmp_info_t *thr = __kmp_threads[i];
6376 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6377 KMP_CPU_PAUSE();
6378 }
6379#endif
6380
6381 for (i = 0; i < __kmp_threads_capacity; ++i) {
6382 // TBD: Add some checking...
6383 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6384 }
6385
6386 /* Make sure all threadprivate destructors get run by joining with all
6387 worker threads before resetting this flag */
6388 TCW_SYNC_4(__kmp_init_common, FALSE);
6389
6390 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6391 KMP_MB();
6392
6393#if KMP_USE_MONITOR
6394 // See note above: One of the possible fixes for CQ138434 / CQ140126
6395 //
6396 // FIXME: push both code fragments down and CSE them?
6397 // push them into __kmp_cleanup() ?
6398 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6399 if (TCR_4(__kmp_init_monitor)) {
6400 __kmp_reap_monitor(&__kmp_monitor);
6401 TCW_4(__kmp_init_monitor, 0);
6402 }
6403 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6404 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6405#endif
6406 } /* else !__kmp_global.t_active */
6407 TCW_4(__kmp_init_gtid, FALSE);
6408 KMP_MB(); /* Flush all pending memory write invalidates. */
6409
6410 __kmp_cleanup();
6411#if OMPT_SUPPORT
6412 ompt_fini();
6413#endif
6414}
6415
6416void __kmp_internal_end_library(int gtid_req) {
6417 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6418 /* this shouldn't be a race condition because __kmp_internal_end() is the
6419 only place to clear __kmp_serial_init */
6420 /* we'll check this later too, after we get the lock */
6421 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6422 // redundant, because the next check will work in any case.
6423 if (__kmp_global.g.g_abort) {
6424 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6425 /* TODO abort? */
6426 return;
6427 }
6428 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6429 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6430 return;
6431 }
6432
6433 // If hidden helper team has been initialized, we need to deinit it
6434 if (TCR_4(__kmp_init_hidden_helper) &&
6435 !TCR_4(__kmp_hidden_helper_team_done)) {
6436 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6437 // First release the main thread to let it continue its work
6438 __kmp_hidden_helper_main_thread_release();
6439 // Wait until the hidden helper team has been destroyed
6440 __kmp_hidden_helper_threads_deinitz_wait();
6441 }
6442
6443 KMP_MB(); /* Flush all pending memory write invalidates. */
6444 /* find out who we are and what we should do */
6445 {
6446 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6447 KA_TRACE(
6448 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6449 if (gtid == KMP_GTID_SHUTDOWN) {
6450 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6451 "already shutdown\n"));
6452 return;
6453 } else if (gtid == KMP_GTID_MONITOR) {
6454 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6455 "registered, or system shutdown\n"));
6456 return;
6457 } else if (gtid == KMP_GTID_DNE) {
6458 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6459 "shutdown\n"));
6460 /* we don't know who we are, but we may still shutdown the library */
6461 } else if (KMP_UBER_GTID(gtid)) {
6462 /* unregister ourselves as an uber thread. gtid is no longer valid */
6463 if (__kmp_root[gtid]->r.r_active) {
6464 __kmp_global.g.g_abort = -1;
6465 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6466 __kmp_unregister_library();
6467 KA_TRACE(10,
6468 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6469 gtid));
6470 return;
6471 } else {
6472 __kmp_itthash_clean(__kmp_threads[gtid]);
6473 KA_TRACE(
6474 10,
6475 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6476 __kmp_unregister_root_current_thread(gtid);
6477 }
6478 } else {
6479/* worker threads may call this function through the atexit handler, if they
6480 * call exit() */
6481/* For now, skip the usual subsequent processing and just dump the debug buffer.
6482 TODO: do a thorough shutdown instead */
6483#ifdef DUMP_DEBUG_ON_EXIT
6484 if (__kmp_debug_buf)
6485 __kmp_dump_debug_buffer();
6486#endif
6487 // added unregister library call here when we switch to shm linux
6488 // if we don't, it will leave lots of files in /dev/shm
6489 // cleanup shared memory file before exiting.
6490 __kmp_unregister_library();
6491 return;
6492 }
6493 }
6494 /* synchronize the termination process */
6495 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6496
6497 /* have we already finished */
6498 if (__kmp_global.g.g_abort) {
6499 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6500 /* TODO abort? */
6501 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6502 return;
6503 }
6504 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6505 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6506 return;
6507 }
6508
6509 /* We need this lock to enforce mutex between this reading of
6510 __kmp_threads_capacity and the writing by __kmp_register_root.
6511 Alternatively, we can use a counter of roots that is atomically updated by
6512 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6513 __kmp_internal_end_*. */
6514 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6515
6516 /* now we can safely conduct the actual termination */
6517 __kmp_internal_end();
6518
6519 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6520 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6521
6522 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6523
6524#ifdef DUMP_DEBUG_ON_EXIT
6525 if (__kmp_debug_buf)
6526 __kmp_dump_debug_buffer();
6527#endif
6528
6529#if KMP_OS_WINDOWS
6530 __kmp_close_console();
6531#endif
6532
6533 __kmp_fini_allocator();
6534
6535} // __kmp_internal_end_library
6536
6537void __kmp_internal_end_thread(int gtid_req) {
6538 int i;
6539
6540 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6541 /* this shouldn't be a race condition because __kmp_internal_end() is the
6542 * only place to clear __kmp_serial_init */
6543 /* we'll check this later too, after we get the lock */
6544 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6545 // redundant, because the next check will work in any case.
6546 if (__kmp_global.g.g_abort) {
6547 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6548 /* TODO abort? */
6549 return;
6550 }
6551 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6552 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6553 return;
6554 }
6555
6556 // If hidden helper team has been initialized, we need to deinit it
6557 if (TCR_4(__kmp_init_hidden_helper) &&
6558 !TCR_4(__kmp_hidden_helper_team_done)) {
6559 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6560 // First release the main thread to let it continue its work
6561 __kmp_hidden_helper_main_thread_release();
6562 // Wait until the hidden helper team has been destroyed
6563 __kmp_hidden_helper_threads_deinitz_wait();
6564 }
6565
6566 KMP_MB(); /* Flush all pending memory write invalidates. */
6567
6568 /* find out who we are and what we should do */
6569 {
6570 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6571 KA_TRACE(10,
6572 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6573 if (gtid == KMP_GTID_SHUTDOWN) {
6574 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6575 "already shutdown\n"));
6576 return;
6577 } else if (gtid == KMP_GTID_MONITOR) {
6578 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6579 "registered, or system shutdown\n"));
6580 return;
6581 } else if (gtid == KMP_GTID_DNE) {
6582 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6583 "shutdown\n"));
6584 return;
6585 /* we don't know who we are */
6586 } else if (KMP_UBER_GTID(gtid)) {
6587 /* unregister ourselves as an uber thread. gtid is no longer valid */
6588 if (__kmp_root[gtid]->r.r_active) {
6589 __kmp_global.g.g_abort = -1;
6590 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6591 KA_TRACE(10,
6592 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6593 gtid));
6594 return;
6595 } else {
6596 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6597 gtid));
6598 __kmp_unregister_root_current_thread(gtid);
6599 }
6600 } else {
6601 /* just a worker thread, let's leave */
6602 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6603
6604 if (gtid >= 0) {
6605 __kmp_threads[gtid]->th.th_task_team = NULL;
6606 }
6607
6608 KA_TRACE(10,
6609 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6610 gtid));
6611 return;
6612 }
6613 }
6614#if KMP_DYNAMIC_LIB
6615 if (__kmp_pause_status != kmp_hard_paused)
6616 // AC: lets not shutdown the dynamic library at the exit of uber thread,
6617 // because we will better shutdown later in the library destructor.
6618 {
6619 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6620 return;
6621 }
6622#endif
6623 /* synchronize the termination process */
6624 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6625
6626 /* have we already finished */
6627 if (__kmp_global.g.g_abort) {
6628 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6629 /* TODO abort? */
6630 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6631 return;
6632 }
6633 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6634 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6635 return;
6636 }
6637
6638 /* We need this lock to enforce mutex between this reading of
6639 __kmp_threads_capacity and the writing by __kmp_register_root.
6640 Alternatively, we can use a counter of roots that is atomically updated by
6641 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6642 __kmp_internal_end_*. */
6643
6644 /* should we finish the run-time? are all siblings done? */
6645 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6646
6647 for (i = 0; i < __kmp_threads_capacity; ++i) {
6648 if (KMP_UBER_GTID(i)) {
6649 KA_TRACE(
6650 10,
6651 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6652 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6653 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6654 return;
6655 }
6656 }
6657
6658 /* now we can safely conduct the actual termination */
6659
6660 __kmp_internal_end();
6661
6662 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6663 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6664
6665 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6666
6667#ifdef DUMP_DEBUG_ON_EXIT
6668 if (__kmp_debug_buf)
6669 __kmp_dump_debug_buffer();
6670#endif
6671} // __kmp_internal_end_thread
6672
6673// -----------------------------------------------------------------------------
6674// Library registration stuff.
6675
6676static long __kmp_registration_flag = 0;
6677// Random value used to indicate library initialization.
6678static char *__kmp_registration_str = NULL;
6679// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6680
6681static inline char *__kmp_reg_status_name() {
6682/* On RHEL 3u5 if linked statically, getpid() returns different values in
6683 each thread. If registration and unregistration go in different threads
6684 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6685 env var can not be found, because the name will contain different pid. */
6686// macOS* complains about name being too long with additional getuid()
6687#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6688 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6689 (int)getuid());
6690#else
6691 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6692#endif
6693} // __kmp_reg_status_get
6694
#if defined(KMP_USE_SHM)
// Which backing store holds the registration record. Both flags are set by
// __kmp_register_library_startup() and read again by
// __kmp_unregister_library(); when both are false the environment variable
// is used instead.
bool __kmp_shm_available = false;
bool __kmp_tmp_available = false;
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
// This holds the path of that file.
char *temp_reg_status_file_name = nullptr;
#endif
6701
6702void __kmp_register_library_startup(void) {
6703
6704 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6705 int done = 0;
6706 union {
6707 double dtime;
6708 long ltime;
6709 } time;
6710#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6711 __kmp_initialize_system_tick();
6712#endif
6713 __kmp_read_system_time(&time.dtime);
6714 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6715 __kmp_registration_str =
6716 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6717 __kmp_registration_flag, KMP_LIBRARY_FILE);
6718
6719 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6720 __kmp_registration_str));
6721
6722 while (!done) {
6723
6724 char *value = NULL; // Actual value of the environment variable.
6725
6726#if defined(KMP_USE_SHM)
6727 char *shm_name = nullptr;
6728 char *data1 = nullptr;
6729 __kmp_shm_available = __kmp_detect_shm();
6730 if (__kmp_shm_available) {
6731 int fd1 = -1;
6732 shm_name = __kmp_str_format("/%s", name);
6733 int shm_preexist = 0;
6734 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6735 if ((fd1 == -1) && (errno == EEXIST)) {
6736 // file didn't open because it already exists.
6737 // try opening existing file
6738 fd1 = shm_open(shm_name, O_RDWR, 0600);
6739 if (fd1 == -1) { // file didn't open
6740 KMP_WARNING(FunctionError, "Can't open SHM");
6741 __kmp_shm_available = false;
6742 } else { // able to open existing file
6743 shm_preexist = 1;
6744 }
6745 }
6746 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6747 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6748 KMP_WARNING(FunctionError, "Can't set size of SHM");
6749 __kmp_shm_available = false;
6750 }
6751 }
6752 if (__kmp_shm_available) { // SHM exists, now map it
6753 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6754 fd1, 0);
6755 if (data1 == MAP_FAILED) { // failed to map shared memory
6756 KMP_WARNING(FunctionError, "Can't map SHM");
6757 __kmp_shm_available = false;
6758 }
6759 }
6760 if (__kmp_shm_available) { // SHM mapped
6761 if (shm_preexist == 0) { // set data to SHM, set value
6762 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6763 }
6764 // Read value from either what we just wrote or existing file.
6765 value = __kmp_str_format("%s", data1); // read value from SHM
6766 munmap(data1, SHM_SIZE);
6767 }
6768 if (fd1 != -1)
6769 close(fd1);
6770 }
6771 if (!__kmp_shm_available)
6772 __kmp_tmp_available = __kmp_detect_tmp();
6773 if (!__kmp_shm_available && __kmp_tmp_available) {
6774 // SHM failed to work due to an error other than that the file already
6775 // exists. Try to create a temp file under /tmp.
6776 // If /tmp isn't accessible, fall back to using environment variable.
6777 // TODO: /tmp might not always be the temporary directory. For now we will
6778 // not consider TMPDIR.
6779 int fd1 = -1;
6780 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6781 int tmp_preexist = 0;
6782 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6783 if ((fd1 == -1) && (errno == EEXIST)) {
6784 // file didn't open because it already exists.
6785 // try opening existing file
6786 fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6787 if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6788 KMP_WARNING(FunctionError, "Can't open TEMP");
6789 __kmp_tmp_available = false;
6790 } else {
6791 tmp_preexist = 1;
6792 }
6793 }
6794 if (__kmp_tmp_available && tmp_preexist == 0) {
6795 // we created /tmp file now set size
6796 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6797 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6798 __kmp_tmp_available = false;
6799 }
6800 }
6801 if (__kmp_tmp_available) {
6802 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6803 fd1, 0);
6804 if (data1 == MAP_FAILED) { // failed to map /tmp
6805 KMP_WARNING(FunctionError, "Can't map /tmp");
6806 __kmp_tmp_available = false;
6807 }
6808 }
6809 if (__kmp_tmp_available) {
6810 if (tmp_preexist == 0) { // set data to TMP, set value
6811 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6812 }
6813 // Read value from either what we just wrote or existing file.
6814 value = __kmp_str_format("%s", data1); // read value from SHM
6815 munmap(data1, SHM_SIZE);
6816 }
6817 if (fd1 != -1)
6818 close(fd1);
6819 }
6820 if (!__kmp_shm_available && !__kmp_tmp_available) {
6821 // no /dev/shm and no /tmp -- fall back to environment variable
6822 // Set environment variable, but do not overwrite if it exists.
6823 __kmp_env_set(name, __kmp_registration_str, 0);
6824 // read value to see if it got set
6825 value = __kmp_env_get(name);
6826 }
6827#else // Windows and unix with static library
6828 // Set environment variable, but do not overwrite if it exists.
6829 __kmp_env_set(name, __kmp_registration_str, 0);
6830 // read value to see if it got set
6831 value = __kmp_env_get(name);
6832#endif
6833
6834 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6835 done = 1; // Ok, environment variable set successfully, exit the loop.
6836 } else {
6837 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6838 // Check whether it alive or dead.
6839 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6840 char *tail = value;
6841 char *flag_addr_str = NULL;
6842 char *flag_val_str = NULL;
6843 char const *file_name = NULL;
6844 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6845 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6846 file_name = tail;
6847 if (tail != NULL) {
6848 unsigned long *flag_addr = 0;
6849 unsigned long flag_val = 0;
6850 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6851 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6852 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6853 // First, check whether environment-encoded address is mapped into
6854 // addr space.
6855 // If so, dereference it to see if it still has the right value.
6856 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6857 neighbor = 1;
6858 } else {
6859 // If not, then we know the other copy of the library is no longer
6860 // running.
6861 neighbor = 2;
6862 }
6863 }
6864 }
6865 switch (neighbor) {
6866 case 0: // Cannot parse environment variable -- neighbor status unknown.
6867 // Assume it is the incompatible format of future version of the
6868 // library. Assume the other library is alive.
6869 // WARN( ... ); // TODO: Issue a warning.
6870 file_name = "unknown library";
6871 KMP_FALLTHROUGH();
6872 // Attention! Falling to the next case. That's intentional.
6873 case 1: { // Neighbor is alive.
6874 // Check it is allowed.
6875 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6876 if (!__kmp_str_match_true(duplicate_ok)) {
6877 // That's not allowed. Issue fatal error.
6878 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6879 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6880 }
6881 KMP_INTERNAL_FREE(duplicate_ok);
6882 __kmp_duplicate_library_ok = 1;
6883 done = 1; // Exit the loop.
6884 } break;
6885 case 2: { // Neighbor is dead.
6886
6887#if defined(KMP_USE_SHM)
6888 if (__kmp_shm_available) { // close shared memory.
6889 shm_unlink(shm_name); // this removes file in /dev/shm
6890 } else if (__kmp_tmp_available) {
6891 unlink(temp_reg_status_file_name); // this removes the temp file
6892 } else {
6893 // Clear the variable and try to register library again.
6894 __kmp_env_unset(name);
6895 }
6896#else
6897 // Clear the variable and try to register library again.
6898 __kmp_env_unset(name);
6899#endif
6900 } break;
6901 default: {
6902 KMP_DEBUG_ASSERT(0);
6903 } break;
6904 }
6905 }
6906 KMP_INTERNAL_FREE((void *)value);
6907#if defined(KMP_USE_SHM)
6908 if (shm_name)
6909 KMP_INTERNAL_FREE((void *)shm_name);
6910#endif
6911 } // while
6912 KMP_INTERNAL_FREE((void *)name);
6913
6914} // func __kmp_register_library_startup
6915
6916void __kmp_unregister_library(void) {
6917
6918 char *name = __kmp_reg_status_name();
6919 char *value = NULL;
6920
6921#if defined(KMP_USE_SHM)
6922 char *shm_name = nullptr;
6923 int fd1;
6924 if (__kmp_shm_available) {
6925 shm_name = __kmp_str_format("/%s", name);
6926 fd1 = shm_open(shm_name, O_RDONLY, 0600);
6927 if (fd1 != -1) { // File opened successfully
6928 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6929 if (data1 != MAP_FAILED) {
6930 value = __kmp_str_format("%s", data1); // read value from SHM
6931 munmap(data1, SHM_SIZE);
6932 }
6933 close(fd1);
6934 }
6935 } else if (__kmp_tmp_available) { // try /tmp
6936 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6937 if (fd1 != -1) { // File opened successfully
6938 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6939 if (data1 != MAP_FAILED) {
6940 value = __kmp_str_format("%s", data1); // read value from /tmp
6941 munmap(data1, SHM_SIZE);
6942 }
6943 close(fd1);
6944 }
6945 } else { // fall back to envirable
6946 value = __kmp_env_get(name);
6947 }
6948#else
6949 value = __kmp_env_get(name);
6950#endif
6951
6952 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6953 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6954 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6955// Ok, this is our variable. Delete it.
6956#if defined(KMP_USE_SHM)
6957 if (__kmp_shm_available) {
6958 shm_unlink(shm_name); // this removes file in /dev/shm
6959 } else if (__kmp_tmp_available) {
6960 unlink(temp_reg_status_file_name); // this removes the temp file
6961 } else {
6962 __kmp_env_unset(name);
6963 }
6964#else
6965 __kmp_env_unset(name);
6966#endif
6967 }
6968
6969#if defined(KMP_USE_SHM)
6970 if (shm_name)
6971 KMP_INTERNAL_FREE(shm_name);
6972 if (temp_reg_status_file_name)
6973 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6974#endif
6975
6976 KMP_INTERNAL_FREE(__kmp_registration_str);
6977 KMP_INTERNAL_FREE(value);
6978 KMP_INTERNAL_FREE(name);
6979
6980 __kmp_registration_flag = 0;
6981 __kmp_registration_str = NULL;
6982
6983} // __kmp_unregister_library
6984
6985// End of Library registration stuff.
6986// -----------------------------------------------------------------------------
6987
6988#if KMP_MIC_SUPPORTED
6989
6990static void __kmp_check_mic_type() {
6991 kmp_cpuid_t cpuid_state = {0};
6992 kmp_cpuid_t *cs_p = &cpuid_state;
6993 __kmp_x86_cpuid(1, 0, cs_p);
6994 // We don't support mic1 at the moment
6995 if ((cs_p->eax & 0xff0) == 0xB10) {
6996 __kmp_mic_type = mic2;
6997 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6998 __kmp_mic_type = mic3;
6999 } else {
7000 __kmp_mic_type = non_mic;
7001 }
7002}
7003
7004#endif /* KMP_MIC_SUPPORTED */
7005
7006#if KMP_HAVE_UMWAIT
7007static void __kmp_user_level_mwait_init() {
7008 struct kmp_cpuid buf;
7009 __kmp_x86_cpuid(7, 0, &buf);
7010 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7011 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7012 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7013 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7014 __kmp_umwait_enabled));
7015}
7016#elif KMP_HAVE_MWAIT
7017#ifndef AT_INTELPHIUSERMWAIT
7018// Spurious, non-existent value that should always fail to return anything.
7019// Will be replaced with the correct value when we know that.
7020#define AT_INTELPHIUSERMWAIT 10000
7021#endif
7022// getauxval() function is available in RHEL7 and SLES12. If a system with an
7023// earlier OS is used to build the RTL, we'll use the following internal
7024// function when the entry is not found.
7025unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7026unsigned long getauxval(unsigned long) { return 0; }
7027
7028static void __kmp_user_level_mwait_init() {
7029 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7030 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7031 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7032 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7033 if (__kmp_mic_type == mic3) {
7034 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7035 if ((res & 0x1) || __kmp_user_level_mwait) {
7036 __kmp_mwait_enabled = TRUE;
7037 if (__kmp_user_level_mwait) {
7038 KMP_INFORM(EnvMwaitWarn);
7039 }
7040 } else {
7041 __kmp_mwait_enabled = FALSE;
7042 }
7043 }
7044 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7045 "__kmp_mwait_enabled = %d\n",
7046 __kmp_mic_type, __kmp_mwait_enabled));
7047}
7048#endif /* KMP_HAVE_UMWAIT */
7049
7050static void __kmp_do_serial_initialize(void) {
7051 int i, gtid;
7052 size_t size;
7053
7054 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7055
7056 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7057 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7058 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7059 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7060 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7061
7062#if OMPT_SUPPORT
7063 ompt_pre_init();
7064#endif
7065#if OMPD_SUPPORT
7066 __kmp_env_dump();
7067 ompd_init();
7068#endif
7069
7070 __kmp_validate_locks();
7071
7072#if ENABLE_LIBOMPTARGET
7073 /* Initialize functions from libomptarget */
7074 __kmp_init_omptarget();
7075#endif
7076
7077 /* Initialize internal memory allocator */
7078 __kmp_init_allocator();
7079
7080 /* Register the library startup via an environment variable or via mapped
7081 shared memory file and check to see whether another copy of the library is
7082 already registered. Since forked child process is often terminated, we
7083 postpone the registration till middle initialization in the child */
7084 if (__kmp_need_register_serial)
7085 __kmp_register_library_startup();
7086
7087 /* TODO reinitialization of library */
7088 if (TCR_4(__kmp_global.g.g_done)) {
7089 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7090 }
7091
7092 __kmp_global.g.g_abort = 0;
7093 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7094
7095/* initialize the locks */
7096#if KMP_USE_ADAPTIVE_LOCKS
7097#if KMP_DEBUG_ADAPTIVE_LOCKS
7098 __kmp_init_speculative_stats();
7099#endif
7100#endif
7101#if KMP_STATS_ENABLED
7102 __kmp_stats_init();
7103#endif
7104 __kmp_init_lock(&__kmp_global_lock);
7105 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7106 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7107 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7108 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7109 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7110 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7111 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7112 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7113 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7114 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7115 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7116 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7117 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7118 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7119 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7120#if KMP_USE_MONITOR
7121 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7122#endif
7123 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7124
7125 /* conduct initialization and initial setup of configuration */
7126
7127 __kmp_runtime_initialize();
7128
7129#if KMP_MIC_SUPPORTED
7130 __kmp_check_mic_type();
7131#endif
7132
7133// Some global variable initialization moved here from kmp_env_initialize()
7134#ifdef KMP_DEBUG
7135 kmp_diag = 0;
7136#endif
7137 __kmp_abort_delay = 0;
7138
7139 // From __kmp_init_dflt_team_nth()
7140 /* assume the entire machine will be used */
7141 __kmp_dflt_team_nth_ub = __kmp_xproc;
7142 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7143 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7144 }
7145 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7146 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7147 }
7148 __kmp_max_nth = __kmp_sys_max_nth;
7149 __kmp_cg_max_nth = __kmp_sys_max_nth;
7150 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7151 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7152 __kmp_teams_max_nth = __kmp_sys_max_nth;
7153 }
7154
7155 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7156 // part
7157 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7158#if KMP_USE_MONITOR
7159 __kmp_monitor_wakeups =
7160 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7161 __kmp_bt_intervals =
7162 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7163#endif
7164 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7165 __kmp_library = library_throughput;
7166 // From KMP_SCHEDULE initialization
7167 __kmp_static = kmp_sch_static_balanced;
7168// AC: do not use analytical here, because it is non-monotonous
7169//__kmp_guided = kmp_sch_guided_iterative_chunked;
7170//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7171// need to repeat assignment
7172// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7173// bit control and barrier method control parts
7174#if KMP_FAST_REDUCTION_BARRIER
7175#define kmp_reduction_barrier_gather_bb ((int)1)
7176#define kmp_reduction_barrier_release_bb ((int)1)
7177#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7178#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7179#endif // KMP_FAST_REDUCTION_BARRIER
7180 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7181 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7182 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7183 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7184 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7185#if KMP_FAST_REDUCTION_BARRIER
7186 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7187 // lin_64 ): hyper,1
7188 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7189 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7190 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7191 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7192 }
7193#endif // KMP_FAST_REDUCTION_BARRIER
7194 }
7195#if KMP_FAST_REDUCTION_BARRIER
7196#undef kmp_reduction_barrier_release_pat
7197#undef kmp_reduction_barrier_gather_pat
7198#undef kmp_reduction_barrier_release_bb
7199#undef kmp_reduction_barrier_gather_bb
7200#endif // KMP_FAST_REDUCTION_BARRIER
7201#if KMP_MIC_SUPPORTED
7202 if (__kmp_mic_type == mic2) { // KNC
7203 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7204 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7205 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7206 1; // forkjoin release
7207 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7208 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7209 }
7210#if KMP_FAST_REDUCTION_BARRIER
7211 if (__kmp_mic_type == mic2) { // KNC
7212 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7213 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7214 }
7215#endif // KMP_FAST_REDUCTION_BARRIER
7216#endif // KMP_MIC_SUPPORTED
7217
7218// From KMP_CHECKS initialization
7219#ifdef KMP_DEBUG
7220 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7221#else
7222 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7223#endif
7224
7225 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7226 __kmp_foreign_tp = TRUE;
7227
7228 __kmp_global.g.g_dynamic = FALSE;
7229 __kmp_global.g.g_dynamic_mode = dynamic_default;
7230
7231 __kmp_init_nesting_mode();
7232
7233 __kmp_env_initialize(NULL);
7234
7235#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7236 __kmp_user_level_mwait_init();
7237#endif
7238// Print all messages in message catalog for testing purposes.
7239#ifdef KMP_DEBUG
7240 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7241 if (__kmp_str_match_true(val)) {
7242 kmp_str_buf_t buffer;
7243 __kmp_str_buf_init(&buffer);
7244 __kmp_i18n_dump_catalog(&buffer);
7245 __kmp_printf("%s", buffer.str);
7246 __kmp_str_buf_free(&buffer);
7247 }
7248 __kmp_env_free(&val);
7249#endif
7250
7251 __kmp_threads_capacity =
7252 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7253 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7254 __kmp_tp_capacity = __kmp_default_tp_capacity(
7255 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7256
7257 // If the library is shut down properly, both pools must be NULL. Just in
7258 // case, set them to NULL -- some memory may leak, but subsequent code will
7259 // work even if pools are not freed.
7260 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7261 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7262 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7263 __kmp_thread_pool = NULL;
7264 __kmp_thread_pool_insert_pt = NULL;
7265 __kmp_team_pool = NULL;
7266
7267 /* Allocate all of the variable sized records */
7268 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7269 * expandable */
7270 /* Since allocation is cache-aligned, just add extra padding at the end */
7271 size =
7272 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7273 CACHE_LINE;
7274 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7275 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7276 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7277
7278 /* init thread counts */
7279 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7280 0); // Asserts fail if the library is reinitializing and
7281 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7282 __kmp_all_nth = 0;
7283 __kmp_nth = 0;
7284
7285 /* setup the uber master thread and hierarchy */
7286 gtid = __kmp_register_root(TRUE);
7287 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7288 KMP_ASSERT(KMP_UBER_GTID(gtid));
7289 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7290
7291 KMP_MB(); /* Flush all pending memory write invalidates. */
7292
7293 __kmp_common_initialize();
7294
7295#if KMP_OS_UNIX
7296 /* invoke the child fork handler */
7297 __kmp_register_atfork();
7298#endif
7299
7300#if !KMP_DYNAMIC_LIB || \
7301 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7302 {
7303 /* Invoke the exit handler when the program finishes, only for static
7304 library and macOS* dynamic. For other dynamic libraries, we already
7305 have _fini and DllMain. */
7306 int rc = atexit(__kmp_internal_end_atexit);
7307 if (rc != 0) {
7308 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7309 __kmp_msg_null);
7310 }
7311 }
7312#endif
7313
7314#if KMP_HANDLE_SIGNALS
7315#if KMP_OS_UNIX
7316 /* NOTE: make sure that this is called before the user installs their own
7317 signal handlers so that the user handlers are called first. this way they
7318 can return false, not call our handler, avoid terminating the library, and
7319 continue execution where they left off. */
7320 __kmp_install_signals(FALSE);
7321#endif /* KMP_OS_UNIX */
7322#if KMP_OS_WINDOWS
7323 __kmp_install_signals(TRUE);
7324#endif /* KMP_OS_WINDOWS */
7325#endif
7326
7327 /* we have finished the serial initialization */
7328 __kmp_init_counter++;
7329
7330 __kmp_init_serial = TRUE;
7331
7332 if (__kmp_version) {
7333 __kmp_print_version_1();
7334 }
7335
7336 if (__kmp_settings) {
7337 __kmp_env_print();
7338 }
7339
7340 if (__kmp_display_env || __kmp_display_env_verbose) {
7341 __kmp_env_print_2();
7342 }
7343
7344#if OMPT_SUPPORT
7345 ompt_post_init();
7346#endif
7347
7348 KMP_MB();
7349
7350 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7351}
7352
7353void __kmp_serial_initialize(void) {
7354 if (__kmp_init_serial) {
7355 return;
7356 }
7357 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7358 if (__kmp_init_serial) {
7359 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7360 return;
7361 }
7362 __kmp_do_serial_initialize();
7363 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7364}
7365
7366static void __kmp_do_middle_initialize(void) {
7367 int i, j;
7368 int prev_dflt_team_nth;
7369
7370 if (!__kmp_init_serial) {
7371 __kmp_do_serial_initialize();
7372 }
7373
7374 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7375
7376 if (UNLIKELY(!__kmp_need_register_serial)) {
7377 // We are in a forked child process. The registration was skipped during
7378 // serial initialization in __kmp_atfork_child handler. Do it here.
7379 __kmp_register_library_startup();
7380 }
7381
7382 // Save the previous value for the __kmp_dflt_team_nth so that
7383 // we can avoid some reinitialization if it hasn't changed.
7384 prev_dflt_team_nth = __kmp_dflt_team_nth;
7385
7386#if KMP_AFFINITY_SUPPORTED
7387 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7388 // number of cores on the machine.
7389 __kmp_affinity_initialize(__kmp_affinity);
7390
7391#endif /* KMP_AFFINITY_SUPPORTED */
7392
7393 KMP_ASSERT(__kmp_xproc > 0);
7394 if (__kmp_avail_proc == 0) {
7395 __kmp_avail_proc = __kmp_xproc;
7396 }
7397
7398 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7399 // correct them now
7400 j = 0;
7401 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7402 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7403 __kmp_avail_proc;
7404 j++;
7405 }
7406
7407 if (__kmp_dflt_team_nth == 0) {
7408#ifdef KMP_DFLT_NTH_CORES
7409 // Default #threads = #cores
7410 __kmp_dflt_team_nth = __kmp_ncores;
7411 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7412 "__kmp_ncores (%d)\n",
7413 __kmp_dflt_team_nth));
7414#else
7415 // Default #threads = #available OS procs
7416 __kmp_dflt_team_nth = __kmp_avail_proc;
7417 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7418 "__kmp_avail_proc(%d)\n",
7419 __kmp_dflt_team_nth));
7420#endif /* KMP_DFLT_NTH_CORES */
7421 }
7422
7423 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7424 __kmp_dflt_team_nth = KMP_MIN_NTH;
7425 }
7426 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7427 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7428 }
7429
7430 if (__kmp_nesting_mode > 0)
7431 __kmp_set_nesting_mode_threads();
7432
7433 // There's no harm in continuing if the following check fails,
7434 // but it indicates an error in the previous logic.
7435 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7436
7437 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7438 // Run through the __kmp_threads array and set the num threads icv for each
7439 // root thread that is currently registered with the RTL (which has not
7440 // already explicitly set its nthreads-var with a call to
7441 // omp_set_num_threads()).
7442 for (i = 0; i < __kmp_threads_capacity; i++) {
7443 kmp_info_t *thread = __kmp_threads[i];
7444 if (thread == NULL)
7445 continue;
7446 if (thread->th.th_current_task->td_icvs.nproc != 0)
7447 continue;
7448
7449 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7450 }
7451 }
7452 KA_TRACE(
7453 20,
7454 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7455 __kmp_dflt_team_nth));
7456
7457#ifdef KMP_ADJUST_BLOCKTIME
7458 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7459 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7460 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7461 if (__kmp_nth > __kmp_avail_proc) {
7462 __kmp_zero_bt = TRUE;
7463 }
7464 }
7465#endif /* KMP_ADJUST_BLOCKTIME */
7466
7467 /* we have finished middle initialization */
7468 TCW_SYNC_4(__kmp_init_middle, TRUE);
7469
7470 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7471}
7472
7473void __kmp_middle_initialize(void) {
7474 if (__kmp_init_middle) {
7475 return;
7476 }
7477 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7478 if (__kmp_init_middle) {
7479 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7480 return;
7481 }
7482 __kmp_do_middle_initialize();
7483 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7484}
7485
7486void __kmp_parallel_initialize(void) {
7487 int gtid = __kmp_entry_gtid(); // this might be a new root
7488
7489 /* synchronize parallel initialization (for sibling) */
7490 if (TCR_4(__kmp_init_parallel))
7491 return;
7492 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7493 if (TCR_4(__kmp_init_parallel)) {
7494 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7495 return;
7496 }
7497
7498 /* TODO reinitialization after we have already shut down */
7499 if (TCR_4(__kmp_global.g.g_done)) {
7500 KA_TRACE(
7501 10,
7502 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7503 __kmp_infinite_loop();
7504 }
7505
7506 /* jc: The lock __kmp_initz_lock is already held, so calling
7507 __kmp_serial_initialize would cause a deadlock. So we call
7508 __kmp_do_serial_initialize directly. */
7509 if (!__kmp_init_middle) {
7510 __kmp_do_middle_initialize();
7511 }
7512 __kmp_assign_root_init_mask();
7513 __kmp_resume_if_hard_paused();
7514
7515 /* begin initialization */
7516 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7517 KMP_ASSERT(KMP_UBER_GTID(gtid));
7518
7519#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7520 // Save the FP control regs.
7521 // Worker threads will set theirs to these values at thread startup.
7522 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7523 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7524 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7525#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7526
7527#if KMP_OS_UNIX
7528#if KMP_HANDLE_SIGNALS
7529 /* must be after __kmp_serial_initialize */
7530 __kmp_install_signals(TRUE);
7531#endif
7532#endif
7533
7534 __kmp_suspend_initialize();
7535
7536#if defined(USE_LOAD_BALANCE)
7537 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7538 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7539 }
7540#else
7541 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7542 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7543 }
7544#endif
7545
7546 if (__kmp_version) {
7547 __kmp_print_version_2();
7548 }
7549
7550 /* we have finished parallel initialization */
7551 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7552
7553 KMP_MB();
7554 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7555
7556 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7557}
7558
7559void __kmp_hidden_helper_initialize() {
7560 if (TCR_4(__kmp_init_hidden_helper))
7561 return;
7562
7563 // __kmp_parallel_initialize is required before we initialize hidden helper
7564 if (!TCR_4(__kmp_init_parallel))
7565 __kmp_parallel_initialize();
7566
7567 // Double check. Note that this double check should not be placed before
7568 // __kmp_parallel_initialize as it will cause dead lock.
7569 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7570 if (TCR_4(__kmp_init_hidden_helper)) {
7571 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7572 return;
7573 }
7574
7575#if KMP_AFFINITY_SUPPORTED
7576 // Initialize hidden helper affinity settings.
7577 // The above __kmp_parallel_initialize() will initialize
7578 // regular affinity (and topology) if not already done.
7579 if (!__kmp_hh_affinity.flags.initialized)
7580 __kmp_affinity_initialize(__kmp_hh_affinity);
7581#endif
7582
7583 // Set the count of hidden helper tasks to be executed to zero
7584 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7585
7586 // Set the global variable indicating that we're initializing hidden helper
7587 // team/threads
7588 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7589
7590 // Platform independent initialization
7591 __kmp_do_initialize_hidden_helper_threads();
7592
7593 // Wait here for the finish of initialization of hidden helper teams
7594 __kmp_hidden_helper_threads_initz_wait();
7595
7596 // We have finished hidden helper initialization
7597 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7598
7599 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600}
7601
7602/* ------------------------------------------------------------------------ */
7603
7604void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7605 kmp_team_t *team) {
7606 kmp_disp_t *dispatch;
7607
7608 KMP_MB();
7609
7610 /* none of the threads have encountered any constructs, yet. */
7611 this_thr->th.th_local.this_construct = 0;
7612#if KMP_CACHE_MANAGE
7613 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7614#endif /* KMP_CACHE_MANAGE */
7615 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7616 KMP_DEBUG_ASSERT(dispatch);
7617 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7618 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7619 // this_thr->th.th_info.ds.ds_tid ] );
7620
7621 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7622 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7623 if (__kmp_env_consistency_check)
7624 __kmp_push_parallel(gtid, team->t.t_ident);
7625
7626 KMP_MB(); /* Flush all pending memory write invalidates. */
7627}
7628
7629void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7630 kmp_team_t *team) {
7631 if (__kmp_env_consistency_check)
7632 __kmp_pop_parallel(gtid, team->t.t_ident);
7633
7634 __kmp_finish_implicit_task(this_thr);
7635}
7636
7637int __kmp_invoke_task_func(int gtid) {
7638 int rc;
7639 int tid = __kmp_tid_from_gtid(gtid);
7640 kmp_info_t *this_thr = __kmp_threads[gtid];
7641 kmp_team_t *team = this_thr->th.th_team;
7642
7643 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7644#if USE_ITT_BUILD
7645 if (__itt_stack_caller_create_ptr) {
7646 // inform ittnotify about entering user's code
7647 if (team->t.t_stack_id != NULL) {
7648 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7649 } else {
7650 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7651 __kmp_itt_stack_callee_enter(
7652 (__itt_caller)team->t.t_parent->t.t_stack_id);
7653 }
7654 }
7655#endif /* USE_ITT_BUILD */
7656#if INCLUDE_SSC_MARKS
7657 SSC_MARK_INVOKING();
7658#endif
7659
7660#if OMPT_SUPPORT
7661 void *dummy;
7662 void **exit_frame_p;
7663 ompt_data_t *my_task_data;
7664 ompt_data_t *my_parallel_data;
7665 int ompt_team_size;
7666
7667 if (ompt_enabled.enabled) {
7668 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7669 .ompt_task_info.frame.exit_frame.ptr);
7670 } else {
7671 exit_frame_p = &dummy;
7672 }
7673
7674 my_task_data =
7675 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7676 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7677 if (ompt_enabled.ompt_callback_implicit_task) {
7678 ompt_team_size = team->t.t_nproc;
7679 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7680 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7681 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7682 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7683 }
7684#endif
7685
7686#if KMP_STATS_ENABLED
7687 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7688 if (previous_state == stats_state_e::TEAMS_REGION) {
7689 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7690 } else {
7691 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7692 }
7693 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7694#endif
7695
7696 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7697 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7698#if OMPT_SUPPORT
7699 ,
7700 exit_frame_p
7701#endif
7702 );
7703#if OMPT_SUPPORT
7704 *exit_frame_p = NULL;
7705 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7706#endif
7707
7708#if KMP_STATS_ENABLED
7709 if (previous_state == stats_state_e::TEAMS_REGION) {
7710 KMP_SET_THREAD_STATE(previous_state);
7711 }
7712 KMP_POP_PARTITIONED_TIMER();
7713#endif
7714
7715#if USE_ITT_BUILD
7716 if (__itt_stack_caller_create_ptr) {
7717 // inform ittnotify about leaving user's code
7718 if (team->t.t_stack_id != NULL) {
7719 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7720 } else {
7721 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7722 __kmp_itt_stack_callee_leave(
7723 (__itt_caller)team->t.t_parent->t.t_stack_id);
7724 }
7725 }
7726#endif /* USE_ITT_BUILD */
7727 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7728
7729 return rc;
7730}
7731
7732void __kmp_teams_master(int gtid) {
7733 // This routine is called by all primary threads in teams construct
7734 kmp_info_t *thr = __kmp_threads[gtid];
7735 kmp_team_t *team = thr->th.th_team;
7736 ident_t *loc = team->t.t_ident;
7737 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7738 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7739 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7740 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7741 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7742
7743 // This thread is a new CG root. Set up the proper variables.
7744 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7745 tmp->cg_root = thr; // Make thr the CG root
7746 // Init to thread limit stored when league primary threads were forked
7747 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7748 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7749 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7750 " cg_nthreads to 1\n",
7751 thr, tmp));
7752 tmp->up = thr->th.th_cg_roots;
7753 thr->th.th_cg_roots = tmp;
7754
7755// Launch league of teams now, but not let workers execute
7756// (they hang on fork barrier until next parallel)
7757#if INCLUDE_SSC_MARKS
7758 SSC_MARK_FORKING();
7759#endif
7760 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7761 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7762 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7763#if INCLUDE_SSC_MARKS
7764 SSC_MARK_JOINING();
7765#endif
7766 // If the team size was reduced from the limit, set it to the new size
7767 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7768 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7769 // AC: last parameter "1" eliminates join barrier which won't work because
7770 // worker threads are in a fork barrier waiting for more parallel regions
7771 __kmp_join_call(loc, gtid
7772#if OMPT_SUPPORT
7773 ,
7774 fork_context_intel
7775#endif
7776 ,
7777 1);
7778}
7779
7780int __kmp_invoke_teams_master(int gtid) {
7781 kmp_info_t *this_thr = __kmp_threads[gtid];
7782 kmp_team_t *team = this_thr->th.th_team;
7783#if KMP_DEBUG
7784 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7785 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7786 (void *)__kmp_teams_master);
7787#endif
7788 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7789#if OMPT_SUPPORT
7790 int tid = __kmp_tid_from_gtid(gtid);
7791 ompt_data_t *task_data =
7792 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7793 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7794 if (ompt_enabled.ompt_callback_implicit_task) {
7795 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7796 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7797 ompt_task_initial);
7798 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7799 }
7800#endif
7801 __kmp_teams_master(gtid);
7802#if OMPT_SUPPORT
7803 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7804#endif
7805 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7806 return 1;
7807}
7808
7809/* this sets the requested number of threads for the next parallel region
7810 encountered by this team. since this should be enclosed in the forkjoin
7811 critical section it should avoid race conditions with asymmetrical nested
7812 parallelism */
7813void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7814 kmp_info_t *thr = __kmp_threads[gtid];
7815
7816 if (num_threads > 0)
7817 thr->th.th_set_nproc = num_threads;
7818}
7819
7820void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7821 int *num_threads_list) {
7822 kmp_info_t *thr = __kmp_threads[gtid];
7823
7824 KMP_DEBUG_ASSERT(list_length > 1);
7825
7826 if (num_threads_list[0] > 0)
7827 thr->th.th_set_nproc = num_threads_list[0];
7828 thr->th.th_set_nested_nth =
7829 (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7830 for (kmp_uint32 i = 0; i < list_length; ++i)
7831 thr->th.th_set_nested_nth[i] = num_threads_list[i];
7832 thr->th.th_set_nested_nth_sz = list_length;
7833}
7834
7835void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7836 const char *msg) {
7837 kmp_info_t *thr = __kmp_threads[gtid];
7838 thr->th.th_nt_strict = true;
7839 thr->th.th_nt_loc = loc;
7840 // if sev is unset make fatal
7841 if (sev == severity_warning)
7842 thr->th.th_nt_sev = sev;
7843 else
7844 thr->th.th_nt_sev = severity_fatal;
7845 // if msg is unset, use an appropriate message
7846 if (msg)
7847 thr->th.th_nt_msg = msg;
7848 else
7849 thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7850 "strict num_threads clause.";
7851}
7852
7853static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7854 int num_threads) {
7855 KMP_DEBUG_ASSERT(thr);
7856 // Remember the number of threads for inner parallel regions
7857 if (!TCR_4(__kmp_init_middle))
7858 __kmp_middle_initialize(); // get internal globals calculated
7859 __kmp_assign_root_init_mask();
7860 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7861 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7862
7863 if (num_threads == 0) {
7864 if (__kmp_teams_thread_limit > 0) {
7865 num_threads = __kmp_teams_thread_limit;
7866 } else {
7867 num_threads = __kmp_avail_proc / num_teams;
7868 }
7869 // adjust num_threads w/o warning as it is not user setting
7870 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7871 // no thread_limit clause specified - do not change thread-limit-var ICV
7872 if (num_threads > __kmp_dflt_team_nth) {
7873 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7874 }
7875 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7876 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7877 } // prevent team size to exceed thread-limit-var
7878 if (num_teams * num_threads > __kmp_teams_max_nth) {
7879 num_threads = __kmp_teams_max_nth / num_teams;
7880 }
7881 if (num_threads == 0) {
7882 num_threads = 1;
7883 }
7884 } else {
7885 if (num_threads < 0) {
7886 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7887 __kmp_msg_null);
7888 num_threads = 1;
7889 }
7890 // This thread will be the primary thread of the league primary threads
7891 // Store new thread limit; old limit is saved in th_cg_roots list
7892 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7893 // num_threads = min(num_threads, nthreads-var)
7894 if (num_threads > __kmp_dflt_team_nth) {
7895 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7896 }
7897 if (num_teams * num_threads > __kmp_teams_max_nth) {
7898 int new_threads = __kmp_teams_max_nth / num_teams;
7899 if (new_threads == 0) {
7900 new_threads = 1;
7901 }
7902 if (new_threads != num_threads) {
7903 if (!__kmp_reserve_warn) { // user asked for too many threads
7904 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7905 __kmp_msg(kmp_ms_warning,
7906 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7907 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7908 }
7909 }
7910 num_threads = new_threads;
7911 }
7912 }
7913 thr->th.th_teams_size.nth = num_threads;
7914}
7915
7916/* this sets the requested number of teams for the teams region and/or
7917 the number of threads for the next parallel region encountered */
7918void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7919 int num_threads) {
7920 kmp_info_t *thr = __kmp_threads[gtid];
7921 if (num_teams < 0) {
7922 // OpenMP specification requires requested values to be positive,
7923 // but people can send us any value, so we'd better check
7924 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7925 __kmp_msg_null);
7926 num_teams = 1;
7927 }
7928 if (num_teams == 0) {
7929 if (__kmp_nteams > 0) {
7930 num_teams = __kmp_nteams;
7931 } else {
7932 num_teams = 1; // default number of teams is 1.
7933 }
7934 }
7935 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7936 if (!__kmp_reserve_warn) {
7937 __kmp_reserve_warn = 1;
7938 __kmp_msg(kmp_ms_warning,
7939 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7940 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7941 }
7942 num_teams = __kmp_teams_max_nth;
7943 }
7944 // Set number of teams (number of threads in the outer "parallel" of the
7945 // teams)
7946 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7947
7948 __kmp_push_thread_limit(thr, num_teams, num_threads);
7949}
7950
7951/* This sets the requested number of teams for the teams region and/or
7952 the number of threads for the next parallel region encountered */
7953void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7954 int num_teams_ub, int num_threads) {
7955 kmp_info_t *thr = __kmp_threads[gtid];
7956 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7957 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7958 KMP_DEBUG_ASSERT(num_threads >= 0);
7959
7960 if (num_teams_lb > num_teams_ub) {
7961 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7962 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7963 }
7964
7965 int num_teams = 1; // defalt number of teams is 1.
7966
7967 if (num_teams_lb == 0 && num_teams_ub > 0)
7968 num_teams_lb = num_teams_ub;
7969
7970 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7971 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7972 if (num_teams > __kmp_teams_max_nth) {
7973 if (!__kmp_reserve_warn) {
7974 __kmp_reserve_warn = 1;
7975 __kmp_msg(kmp_ms_warning,
7976 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7977 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7978 }
7979 num_teams = __kmp_teams_max_nth;
7980 }
7981 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7982 num_teams = num_teams_ub;
7983 } else { // num_teams_lb <= num_teams <= num_teams_ub
7984 if (num_threads <= 0) {
7985 if (num_teams_ub > __kmp_teams_max_nth) {
7986 num_teams = num_teams_lb;
7987 } else {
7988 num_teams = num_teams_ub;
7989 }
7990 } else {
7991 num_teams = (num_threads > __kmp_teams_max_nth)
7992 ? num_teams
7993 : __kmp_teams_max_nth / num_threads;
7994 if (num_teams < num_teams_lb) {
7995 num_teams = num_teams_lb;
7996 } else if (num_teams > num_teams_ub) {
7997 num_teams = num_teams_ub;
7998 }
7999 }
8000 }
8001 // Set number of teams (number of threads in the outer "parallel" of the
8002 // teams)
8003 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8004
8005 __kmp_push_thread_limit(thr, num_teams, num_threads);
8006}
8007
8008// Set the proc_bind var to use in the following parallel region.
8009void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8010 kmp_info_t *thr = __kmp_threads[gtid];
8011 thr->th.th_set_proc_bind = proc_bind;
8012}
8013
8014/* Launch the worker threads into the microtask. */
8015
8016void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8017 kmp_info_t *this_thr = __kmp_threads[gtid];
8018
8019#ifdef KMP_DEBUG
8020 int f;
8021#endif /* KMP_DEBUG */
8022
8023 KMP_DEBUG_ASSERT(team);
8024 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8025 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8026 KMP_MB(); /* Flush all pending memory write invalidates. */
8027
8028 team->t.t_construct = 0; /* no single directives seen yet */
8029 team->t.t_ordered.dt.t_value =
8030 0; /* thread 0 enters the ordered section first */
8031
8032 /* Reset the identifiers on the dispatch buffer */
8033 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8034 if (team->t.t_max_nproc > 1) {
8035 int i;
8036 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8037 team->t.t_disp_buffer[i].buffer_index = i;
8038 team->t.t_disp_buffer[i].doacross_buf_idx = i;
8039 }
8040 } else {
8041 team->t.t_disp_buffer[0].buffer_index = 0;
8042 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8043 }
8044
8045 KMP_MB(); /* Flush all pending memory write invalidates. */
8046 KMP_ASSERT(this_thr->th.th_team == team);
8047
8048#ifdef KMP_DEBUG
8049 for (f = 0; f < team->t.t_nproc; f++) {
8050 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8051 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8052 }
8053#endif /* KMP_DEBUG */
8054
8055 /* release the worker threads so they may begin working */
8056 __kmp_fork_barrier(gtid, 0);
8057}
8058
8059void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8060 kmp_info_t *this_thr = __kmp_threads[gtid];
8061
8062 KMP_DEBUG_ASSERT(team);
8063 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8064 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8065 KMP_MB(); /* Flush all pending memory write invalidates. */
8066
8067 /* Join barrier after fork */
8068
8069#ifdef KMP_DEBUG
8070 if (__kmp_threads[gtid] &&
8071 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8072 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8073 __kmp_threads[gtid]);
8074 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8075 "team->t.t_nproc=%d\n",
8076 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8077 team->t.t_nproc);
8078 __kmp_print_structure();
8079 }
8080 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8081 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8082#endif /* KMP_DEBUG */
8083
8084 __kmp_join_barrier(gtid); /* wait for everyone */
8085#if OMPT_SUPPORT
8086 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8087 if (ompt_enabled.enabled &&
8088 (ompt_state == ompt_state_wait_barrier_teams ||
8089 ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8090 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8091 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8092 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8093#if OMPT_OPTIONAL
8094 void *codeptr = NULL;
8095 if (KMP_MASTER_TID(ds_tid) &&
8096 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8097 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8098 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8099
8100 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8101 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8102 sync_kind = ompt_sync_region_barrier_teams;
8103 if (ompt_enabled.ompt_callback_sync_region_wait) {
8104 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8105 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8106 }
8107 if (ompt_enabled.ompt_callback_sync_region) {
8108 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8109 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8110 }
8111#endif
8112 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8113 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8114 ompt_scope_end, NULL, task_data, 0, ds_tid,
8115 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8116 }
8117 }
8118#endif
8119
8120 KMP_MB(); /* Flush all pending memory write invalidates. */
8121 KMP_ASSERT(this_thr->th.th_team == team);
8122}
8123
8124/* ------------------------------------------------------------------------ */
8125
8126#ifdef USE_LOAD_BALANCE
8127
8128// Return the worker threads actively spinning in the hot team, if we
8129// are at the outermost level of parallelism. Otherwise, return 0.
8130static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8131 int i;
8132 int retval;
8133 kmp_team_t *hot_team;
8134
8135 if (root->r.r_active) {
8136 return 0;
8137 }
8138 hot_team = root->r.r_hot_team;
8139 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8140 return hot_team->t.t_nproc - 1; // Don't count primary thread
8141 }
8142
8143 // Skip the primary thread - it is accounted for elsewhere.
8144 retval = 0;
8145 for (i = 1; i < hot_team->t.t_nproc; i++) {
8146 if (hot_team->t.t_threads[i]->th.th_active) {
8147 retval++;
8148 }
8149 }
8150 return retval;
8151}
8152
8153// Perform an automatic adjustment to the number of
8154// threads used by the next parallel region.
8155static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8156 int retval;
8157 int pool_active;
8158 int hot_team_active;
8159 int team_curr_active;
8160 int system_active;
8161
8162 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8163 set_nproc));
8164 KMP_DEBUG_ASSERT(root);
8165 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8166 ->th.th_current_task->td_icvs.dynamic == TRUE);
8167 KMP_DEBUG_ASSERT(set_nproc > 1);
8168
8169 if (set_nproc == 1) {
8170 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8171 return 1;
8172 }
8173
8174 // Threads that are active in the thread pool, active in the hot team for this
8175 // particular root (if we are at the outer par level), and the currently
8176 // executing thread (to become the primary thread) are available to add to the
8177 // new team, but are currently contributing to the system load, and must be
8178 // accounted for.
8179 pool_active = __kmp_thread_pool_active_nth;
8180 hot_team_active = __kmp_active_hot_team_nproc(root);
8181 team_curr_active = pool_active + hot_team_active + 1;
8182
8183 // Check the system load.
8184 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8185 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8186 "hot team active = %d\n",
8187 system_active, pool_active, hot_team_active));
8188
8189 if (system_active < 0) {
8190 // There was an error reading the necessary info from /proc, so use the
8191 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8192 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8193 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8194 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8195
8196 // Make this call behave like the thread limit algorithm.
8197 retval = __kmp_avail_proc - __kmp_nth +
8198 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8199 if (retval > set_nproc) {
8200 retval = set_nproc;
8201 }
8202 if (retval < KMP_MIN_NTH) {
8203 retval = KMP_MIN_NTH;
8204 }
8205
8206 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8207 retval));
8208 return retval;
8209 }
8210
8211 // There is a slight delay in the load balance algorithm in detecting new
8212 // running procs. The real system load at this instant should be at least as
8213 // large as the #active omp thread that are available to add to the team.
8214 if (system_active < team_curr_active) {
8215 system_active = team_curr_active;
8216 }
8217 retval = __kmp_avail_proc - system_active + team_curr_active;
8218 if (retval > set_nproc) {
8219 retval = set_nproc;
8220 }
8221 if (retval < KMP_MIN_NTH) {
8222 retval = KMP_MIN_NTH;
8223 }
8224
8225 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8226 return retval;
8227} // __kmp_load_balance_nproc()
8228
8229#endif /* USE_LOAD_BALANCE */
8230
8231/* ------------------------------------------------------------------------ */
8232
8233/* NOTE: this is called with the __kmp_init_lock held */
8234void __kmp_cleanup(void) {
8235 int f;
8236
8237 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8238
8239 if (TCR_4(__kmp_init_parallel)) {
8240#if KMP_HANDLE_SIGNALS
8241 __kmp_remove_signals();
8242#endif
8243 TCW_4(__kmp_init_parallel, FALSE);
8244 }
8245
8246 if (TCR_4(__kmp_init_middle)) {
8247#if KMP_AFFINITY_SUPPORTED
8248 __kmp_affinity_uninitialize();
8249#endif /* KMP_AFFINITY_SUPPORTED */
8250 __kmp_cleanup_hierarchy();
8251 TCW_4(__kmp_init_middle, FALSE);
8252 }
8253
8254 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8255
8256 if (__kmp_init_serial) {
8257 __kmp_runtime_destroy();
8258 __kmp_init_serial = FALSE;
8259 }
8260
8261 __kmp_cleanup_threadprivate_caches();
8262
8263 for (f = 0; f < __kmp_threads_capacity; f++) {
8264 if (__kmp_root[f] != NULL) {
8265 __kmp_free(__kmp_root[f]);
8266 __kmp_root[f] = NULL;
8267 }
8268 }
8269 __kmp_free(__kmp_threads);
8270 // __kmp_threads and __kmp_root were allocated at once, as single block, so
8271 // there is no need in freeing __kmp_root.
8272 __kmp_threads = NULL;
8273 __kmp_root = NULL;
8274 __kmp_threads_capacity = 0;
8275
8276 // Free old __kmp_threads arrays if they exist.
8277 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8278 while (ptr) {
8279 kmp_old_threads_list_t *next = ptr->next;
8280 __kmp_free(ptr->threads);
8281 __kmp_free(ptr);
8282 ptr = next;
8283 }
8284 __kmp_old_threads_list = NULL;
8285
8286#if KMP_USE_DYNAMIC_LOCK
8287 __kmp_cleanup_indirect_user_locks();
8288#else
8289 __kmp_cleanup_user_locks();
8290#endif
8291#if OMPD_SUPPORT
8292 if (ompd_env_block) {
8293 __kmp_free(ompd_env_block);
8294 ompd_env_block = NULL;
8295 ompd_env_block_size = 0;
8296 }
8297#endif
8298
8299#if KMP_AFFINITY_SUPPORTED
8300 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8301 __kmp_cpuinfo_file = NULL;
8302#endif /* KMP_AFFINITY_SUPPORTED */
8303
8304#if KMP_USE_ADAPTIVE_LOCKS
8305#if KMP_DEBUG_ADAPTIVE_LOCKS
8306 __kmp_print_speculative_stats();
8307#endif
8308#endif
8309 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8310 __kmp_nested_nth.nth = NULL;
8311 __kmp_nested_nth.size = 0;
8312 __kmp_nested_nth.used = 0;
8313
8314 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8315 __kmp_nested_proc_bind.bind_types = NULL;
8316 __kmp_nested_proc_bind.size = 0;
8317 __kmp_nested_proc_bind.used = 0;
8318 __kmp_dflt_team_nth = 0;
8319 __kmp_dflt_team_nth_ub = 0;
8320 if (__kmp_affinity_format) {
8321 KMP_INTERNAL_FREE(__kmp_affinity_format);
8322 __kmp_affinity_format = NULL;
8323 }
8324
8325 __kmp_i18n_catclose();
8326
8327 if (__kmp_nesting_nth_level)
8328 KMP_INTERNAL_FREE(__kmp_nesting_nth_level);
8329
8330#if KMP_USE_HIER_SCHED
8331 __kmp_hier_scheds.deallocate();
8332#endif
8333
8334#if KMP_STATS_ENABLED
8335 __kmp_stats_fini();
8336#endif
8337
8338 __kmpc_destroy_allocator(KMP_GTID_SHUTDOWN, __kmp_def_allocator);
8339 __kmp_def_allocator = omp_default_mem_alloc;
8340
8341 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8342}
8343
8344/* ------------------------------------------------------------------------ */
8345
8346int __kmp_ignore_mppbeg(void) {
8347 char *env;
8348
8349 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8350 if (__kmp_str_match_false(env))
8351 return FALSE;
8352 }
8353 // By default __kmpc_begin() is no-op.
8354 return TRUE;
8355}
8356
8357int __kmp_ignore_mppend(void) {
8358 char *env;
8359
8360 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8361 if (__kmp_str_match_false(env))
8362 return FALSE;
8363 }
8364 // By default __kmpc_end() is no-op.
8365 return TRUE;
8366}
8367
8368void __kmp_internal_begin(void) {
8369 int gtid;
8370 kmp_root_t *root;
8371
8372 /* this is a very important step as it will register new sibling threads
8373 and assign these new uber threads a new gtid */
8374 gtid = __kmp_entry_gtid();
8375 root = __kmp_threads[gtid]->th.th_root;
8376 KMP_ASSERT(KMP_UBER_GTID(gtid));
8377
8378 if (root->r.r_begin)
8379 return;
8380 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8381 if (root->r.r_begin) {
8382 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8383 return;
8384 }
8385
8386 root->r.r_begin = TRUE;
8387
8388 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8389}
8390
8391/* ------------------------------------------------------------------------ */
8392
8393void __kmp_user_set_library(enum library_type arg) {
8394 int gtid;
8395 kmp_root_t *root;
8396 kmp_info_t *thread;
8397
8398 /* first, make sure we are initialized so we can get our gtid */
8399
8400 gtid = __kmp_entry_gtid();
8401 thread = __kmp_threads[gtid];
8402
8403 root = thread->th.th_root;
8404
8405 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8406 library_serial));
8407 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8408 thread */
8409 KMP_WARNING(SetLibraryIncorrectCall);
8410 return;
8411 }
8412
8413 switch (arg) {
8414 case library_serial:
8415 thread->th.th_set_nproc = 0;
8416 set__nproc(thread, 1);
8417 break;
8418 case library_turnaround:
8419 thread->th.th_set_nproc = 0;
8420 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8421 : __kmp_dflt_team_nth_ub);
8422 break;
8423 case library_throughput:
8424 thread->th.th_set_nproc = 0;
8425 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8426 : __kmp_dflt_team_nth_ub);
8427 break;
8428 default:
8429 KMP_FATAL(UnknownLibraryType, arg);
8430 }
8431
8432 __kmp_aux_set_library(arg);
8433}
8434
8435void __kmp_aux_set_stacksize(size_t arg) {
8436 if (!__kmp_init_serial)
8437 __kmp_serial_initialize();
8438
8439#if KMP_OS_DARWIN
8440 if (arg & (0x1000 - 1)) {
8441 arg &= ~(0x1000 - 1);
8442 if (arg + 0x1000) /* check for overflow if we round up */
8443 arg += 0x1000;
8444 }
8445#endif
8446 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8447
8448 /* only change the default stacksize before the first parallel region */
8449 if (!TCR_4(__kmp_init_parallel)) {
8450 size_t value = arg; /* argument is in bytes */
8451
8452 if (value < __kmp_sys_min_stksize)
8453 value = __kmp_sys_min_stksize;
8454 else if (value > KMP_MAX_STKSIZE)
8455 value = KMP_MAX_STKSIZE;
8456
8457 __kmp_stksize = value;
8458
8459 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8460 }
8461
8462 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8463}
8464
8465/* set the behaviour of the runtime library */
8466/* TODO this can cause some odd behaviour with sibling parallelism... */
8467void __kmp_aux_set_library(enum library_type arg) {
8468 __kmp_library = arg;
8469
8470 switch (__kmp_library) {
8471 case library_serial: {
8472 KMP_INFORM(LibraryIsSerial);
8473 } break;
8474 case library_turnaround:
8475 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8476 __kmp_use_yield = 2; // only yield when oversubscribed
8477 break;
8478 case library_throughput:
8479 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8480 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8481 break;
8482 default:
8483 KMP_FATAL(UnknownLibraryType, arg);
8484 }
8485}
8486
8487/* Getting team information common for all team API */
8488// Returns NULL if not in teams construct
8489static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8490 kmp_info_t *thr = __kmp_entry_thread();
8491 teams_serialized = 0;
8492 if (thr->th.th_teams_microtask) {
8493 kmp_team_t *team = thr->th.th_team;
8494 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8495 int ii = team->t.t_level;
8496 teams_serialized = team->t.t_serialized;
8497 int level = tlevel + 1;
8498 KMP_DEBUG_ASSERT(ii >= tlevel);
8499 while (ii > level) {
8500 for (teams_serialized = team->t.t_serialized;
8501 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8502 }
8503 if (team->t.t_serialized && (!teams_serialized)) {
8504 team = team->t.t_parent;
8505 continue;
8506 }
8507 if (ii > level) {
8508 team = team->t.t_parent;
8509 ii--;
8510 }
8511 }
8512 return team;
8513 }
8514 return NULL;
8515}
8516
8517int __kmp_aux_get_team_num() {
8518 int serialized;
8519 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8520 if (team) {
8521 if (serialized > 1) {
8522 return 0; // teams region is serialized ( 1 team of 1 thread ).
8523 } else {
8524 return team->t.t_master_tid;
8525 }
8526 }
8527 return 0;
8528}
8529
8530int __kmp_aux_get_num_teams() {
8531 int serialized;
8532 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8533 if (team) {
8534 if (serialized > 1) {
8535 return 1;
8536 } else {
8537 return team->t.t_parent->t.t_nproc;
8538 }
8539 }
8540 return 1;
8541}
8542
8543/* ------------------------------------------------------------------------ */
8544
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types:
 * t {team_num}         - omp_get_team_num()
 * T {num_teams}        - omp_get_num_teams()
 * L {nesting_level}    - omp_get_level()
 * n {thread_num}       - omp_get_thread_num()
 * N {num_threads}      - omp_get_num_threads()
 * a {ancestor_tnum}    - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}             - name of host machine
 * P {process_id}       - process id (integer)
 * i {native_thread_id} - native thread identifier (integer)
 * A {thread_affinity}  - comma separated list of integers or integer ranges
 *                        (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
8572
// One entry of the affinity-format keyword table: pairs the single-character
// and the long spelling of a field type with the conversion character used
// when the field value is formatted.
typedef struct kmp_affinity_format_field_t {
  // Single-character name from the spec, e.g. 'L'
  char short_name;
  // Long name from the spec, written between braces in a format string,
  // e.g. "nesting_level"
  const char *long_name;
  // Conversion character for snprintf (typically 'd' for an integer or 's'
  // for a string)
  char field_format;
} kmp_affinity_format_field_t;
8582
8583static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8584#if KMP_AFFINITY_SUPPORTED
8585 {'A', "thread_affinity", 's'},
8586#endif
8587 {'t', "team_num", 'd'},
8588 {'T', "num_teams", 'd'},
8589 {'L', "nesting_level", 'd'},
8590 {'n', "thread_num", 'd'},
8591 {'N', "num_threads", 'd'},
8592 {'a', "ancestor_tnum", 'd'},
8593 {'H', "host", 's'},
8594 {'P', "process_id", 'd'},
8595 {'i', "native_thread_id", 'd'}};
8596
8597// Return the number of characters it takes to hold field
8598static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8599 const char **ptr,
8600 kmp_str_buf_t *field_buffer) {
8601 int rc, format_index, field_value;
8602 const char *width_left, *width_right;
8603 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8604 static const int FORMAT_SIZE = 20;
8605 char format[FORMAT_SIZE] = {0};
8606 char absolute_short_name = 0;
8607
8608 KMP_DEBUG_ASSERT(gtid >= 0);
8609 KMP_DEBUG_ASSERT(th);
8610 KMP_DEBUG_ASSERT(**ptr == '%');
8611 KMP_DEBUG_ASSERT(field_buffer);
8612
8613 __kmp_str_buf_clear(field_buffer);
8614
8615 // Skip the initial %
8616 (*ptr)++;
8617
8618 // Check for %% first
8619 if (**ptr == '%') {
8620 __kmp_str_buf_cat(field_buffer, "%", 1);
8621 (*ptr)++; // skip over the second %
8622 return 1;
8623 }
8624
8625 // Parse field modifiers if they are present
8626 pad_zeros = false;
8627 if (**ptr == '0') {
8628 pad_zeros = true;
8629 (*ptr)++; // skip over 0
8630 }
8631 right_justify = false;
8632 if (**ptr == '.') {
8633 right_justify = true;
8634 (*ptr)++; // skip over .
8635 }
8636 // Parse width of field: [width_left, width_right)
8637 width_left = width_right = NULL;
8638 if (**ptr >= '0' && **ptr <= '9') {
8639 width_left = *ptr;
8640 SKIP_DIGITS(*ptr);
8641 width_right = *ptr;
8642 }
8643
8644 // Create the format for KMP_SNPRINTF based on flags parsed above
8645 format_index = 0;
8646 format[format_index++] = '%';
8647 if (!right_justify)
8648 format[format_index++] = '-';
8649 if (pad_zeros)
8650 format[format_index++] = '0';
8651 if (width_left && width_right) {
8652 int i = 0;
8653 // Only allow 8 digit number widths.
8654 // This also prevents overflowing format variable
8655 while (i < 8 && width_left < width_right) {
8656 format[format_index++] = *width_left;
8657 width_left++;
8658 i++;
8659 }
8660 }
8661
8662 // Parse a name (long or short)
8663 // Canonicalize the name into absolute_short_name
8664 found_valid_name = false;
8665 parse_long_name = (**ptr == '{');
8666 if (parse_long_name)
8667 (*ptr)++; // skip initial left brace
8668 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8669 sizeof(__kmp_affinity_format_table[0]);
8670 ++i) {
8671 char short_name = __kmp_affinity_format_table[i].short_name;
8672 const char *long_name = __kmp_affinity_format_table[i].long_name;
8673 char field_format = __kmp_affinity_format_table[i].field_format;
8674 if (parse_long_name) {
8675 size_t length = KMP_STRLEN(long_name);
8676 if (strncmp(*ptr, long_name, length) == 0) {
8677 found_valid_name = true;
8678 (*ptr) += length; // skip the long name
8679 }
8680 } else if (**ptr == short_name) {
8681 found_valid_name = true;
8682 (*ptr)++; // skip the short name
8683 }
8684 if (found_valid_name) {
8685 format[format_index++] = field_format;
8686 format[format_index++] = '\0';
8687 absolute_short_name = short_name;
8688 break;
8689 }
8690 }
8691 if (parse_long_name) {
8692 if (**ptr != '}') {
8693 absolute_short_name = 0;
8694 } else {
8695 (*ptr)++; // skip over the right brace
8696 }
8697 }
8698
8699 // Attempt to fill the buffer with the requested
8700 // value using snprintf within __kmp_str_buf_print()
8701 switch (absolute_short_name) {
8702 case 't':
8703 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8704 break;
8705 case 'T':
8706 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8707 break;
8708 case 'L':
8709 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8710 break;
8711 case 'n':
8712 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8713 break;
8714 case 'H': {
8715 static const int BUFFER_SIZE = 256;
8716 char buf[BUFFER_SIZE];
8717 __kmp_expand_host_name(buf, BUFFER_SIZE);
8718 rc = __kmp_str_buf_print(field_buffer, format, buf);
8719 } break;
8720 case 'P':
8721 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8722 break;
8723 case 'i':
8724 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8725 break;
8726 case 'N':
8727 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8728 break;
8729 case 'a':
8730 field_value =
8731 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8732 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8733 break;
8734#if KMP_AFFINITY_SUPPORTED
8735 case 'A': {
8736 if (th->th.th_affin_mask) {
8737 kmp_str_buf_t buf;
8738 __kmp_str_buf_init(&buf);
8739 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8740 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8741 __kmp_str_buf_free(&buf);
8742 } else {
8743 rc = __kmp_str_buf_print(field_buffer, "%s", "disabled");
8744 }
8745 } break;
8746#endif
8747 default:
8748 // According to spec, If an implementation does not have info for field
8749 // type, then "undefined" is printed
8750 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8751 // Skip the field
8752 if (parse_long_name) {
8753 SKIP_TOKEN(*ptr);
8754 if (**ptr == '}')
8755 (*ptr)++;
8756 } else {
8757 (*ptr)++;
8758 }
8759 }
8760
8761 KMP_ASSERT(format_index <= FORMAT_SIZE);
8762 return rc;
8763}
8764
8765/*
8766 * Return number of characters needed to hold the affinity string
8767 * (not including null byte character)
8768 * The resultant string is printed to buffer, which the caller can then
8769 * handle afterwards
8770 */
8771size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8772 kmp_str_buf_t *buffer) {
8773 const char *parse_ptr;
8774 size_t retval;
8775 const kmp_info_t *th;
8776 kmp_str_buf_t field;
8777
8778 KMP_DEBUG_ASSERT(buffer);
8779 KMP_DEBUG_ASSERT(gtid >= 0);
8780
8781 __kmp_str_buf_init(&field);
8782 __kmp_str_buf_clear(buffer);
8783
8784 th = __kmp_threads[gtid];
8785 retval = 0;
8786
8787 // If format is NULL or zero-length string, then we use
8788 // affinity-format-var ICV
8789 parse_ptr = format;
8790 if (parse_ptr == NULL || *parse_ptr == '\0') {
8791 parse_ptr = __kmp_affinity_format;
8792 }
8793 KMP_DEBUG_ASSERT(parse_ptr);
8794
8795 while (*parse_ptr != '\0') {
8796 // Parse a field
8797 if (*parse_ptr == '%') {
8798 // Put field in the buffer
8799 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8800 __kmp_str_buf_catbuf(buffer, &field);
8801 retval += rc;
8802 } else {
8803 // Put literal character in buffer
8804 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8805 retval++;
8806 parse_ptr++;
8807 }
8808 }
8809 __kmp_str_buf_free(&field);
8810 return retval;
8811}
8812
8813// Displays the affinity string to stdout
8814void __kmp_aux_display_affinity(int gtid, const char *format) {
8815 kmp_str_buf_t buf;
8816 __kmp_str_buf_init(&buf);
8817 __kmp_aux_capture_affinity(gtid, format, &buf);
8818 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8819 __kmp_str_buf_free(&buf);
8820}
8821
8822/* ------------------------------------------------------------------------ */
8823void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8824 int blocktime = arg; /* argument is in microseconds */
8825#if KMP_USE_MONITOR
8826 int bt_intervals;
8827#endif
8828 kmp_int8 bt_set;
8829
8830 __kmp_save_internal_controls(thread);
8831
8832 /* Normalize and set blocktime for the teams */
8833 if (blocktime < KMP_MIN_BLOCKTIME)
8834 blocktime = KMP_MIN_BLOCKTIME;
8835 else if (blocktime > KMP_MAX_BLOCKTIME)
8836 blocktime = KMP_MAX_BLOCKTIME;
8837
8838 set__blocktime_team(thread->th.th_team, tid, blocktime);
8839 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8840
8841#if KMP_USE_MONITOR
8842 /* Calculate and set blocktime intervals for the teams */
8843 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8844
8845 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8846 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8847#endif
8848
8849 /* Set whether blocktime has been set to "TRUE" */
8850 bt_set = TRUE;
8851
8852 set__bt_set_team(thread->th.th_team, tid, bt_set);
8853 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8854#if KMP_USE_MONITOR
8855 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8856 "bt_intervals=%d, monitor_updates=%d\n",
8857 __kmp_gtid_from_tid(tid, thread->th.th_team),
8858 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8859 __kmp_monitor_wakeups));
8860#else
8861 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8862 __kmp_gtid_from_tid(tid, thread->th.th_team),
8863 thread->th.th_team->t.t_id, tid, blocktime));
8864#endif
8865}
8866
8867void __kmp_aux_set_defaults(char const *str, size_t len) {
8868 if (!__kmp_init_serial) {
8869 __kmp_serial_initialize();
8870 }
8871 __kmp_env_initialize(str);
8872
8873 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8874 __kmp_env_print();
8875 }
8876} // __kmp_aux_set_defaults
8877
8878/* ------------------------------------------------------------------------ */
8879/* internal fast reduction routines */
8880
8881PACKED_REDUCTION_METHOD_T
8882__kmp_determine_reduction_method(
8883 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8884 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8885 kmp_critical_name *lck) {
8886
8887 // Default reduction method: critical construct ( lck != NULL, like in current
8888 // PAROPT )
8889 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8890 // can be selected by RTL
8891 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8892 // can be selected by RTL
8893 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8894 // among generated by PAROPT.
8895
8896 PACKED_REDUCTION_METHOD_T retval;
8897
8898 int team_size;
8899
8900 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8901
8902#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8903 (loc && \
8904 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8905#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8906
8907 retval = critical_reduce_block;
8908
8909 // another choice of getting a team size (with 1 dynamic deference) is slower
8910 team_size = __kmp_get_team_num_threads(global_tid);
8911 if (team_size == 1) {
8912
8913 retval = empty_reduce_block;
8914
8915 } else {
8916
8917 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8918
8919#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8920 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8921 KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8922
8923#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8924 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \
8925 KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8926
8927 int teamsize_cutoff = 4;
8928
8929#if KMP_MIC_SUPPORTED
8930 if (__kmp_mic_type != non_mic) {
8931 teamsize_cutoff = 8;
8932 }
8933#endif
8934 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8935 if (tree_available) {
8936 if (team_size <= teamsize_cutoff) {
8937 if (atomic_available) {
8938 retval = atomic_reduce_block;
8939 }
8940 } else {
8941 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8942 }
8943 } else if (atomic_available) {
8944 retval = atomic_reduce_block;
8945 }
8946#else
8947#error "Unknown or unsupported OS"
8948#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8949 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8950 // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8951
8952#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8953 KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8954
8955#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8956 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \
8957 KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8958
8959 // basic tuning
8960
8961 if (atomic_available) {
8962 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8963 retval = atomic_reduce_block;
8964 }
8965 } // otherwise: use critical section
8966
8967#elif KMP_OS_DARWIN
8968
8969 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8970 if (atomic_available && (num_vars <= 3)) {
8971 retval = atomic_reduce_block;
8972 } else if (tree_available) {
8973 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8974 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8975 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8976 }
8977 } // otherwise: use critical section
8978
8979#else
8980#error "Unknown or unsupported OS"
8981#endif
8982
8983#else
8984#error "Unknown or unsupported architecture"
8985#endif
8986 }
8987
8988 // KMP_FORCE_REDUCTION
8989
8990 // If the team is serialized (team_size == 1), ignore the forced reduction
8991 // method and stay with the unsynchronized method (empty_reduce_block)
8992 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8993 team_size != 1) {
8994
8995 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8996
8997 int atomic_available, tree_available;
8998
8999 switch ((forced_retval = __kmp_force_reduction_method)) {
9000 case critical_reduce_block:
9001 KMP_ASSERT(lck); // lck should be != 0
9002 break;
9003
9004 case atomic_reduce_block:
9005 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9006 if (!atomic_available) {
9007 KMP_WARNING(RedMethodNotSupported, "atomic");
9008 forced_retval = critical_reduce_block;
9009 }
9010 break;
9011
9012 case tree_reduce_block:
9013 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9014 if (!tree_available) {
9015 KMP_WARNING(RedMethodNotSupported, "tree");
9016 forced_retval = critical_reduce_block;
9017 } else {
9018#if KMP_FAST_REDUCTION_BARRIER
9019 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9020#endif
9021 }
9022 break;
9023
9024 default:
9025 KMP_ASSERT(0); // "unsupported method specified"
9026 }
9027
9028 retval = forced_retval;
9029 }
9030
9031 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9032
9033#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9034#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9035
9036 return (retval);
9037}
9038// this function is for testing set/get/determine reduce method
9039kmp_int32 __kmp_get_reduce_method(void) {
9040 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9041}
9042
9043// Soft pause sets up threads to ignore blocktime and just go to sleep.
9044// Spin-wait code checks __kmp_pause_status and reacts accordingly.
9045void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9046
9047// Hard pause shuts down the runtime completely. Resume happens naturally when
9048// OpenMP is used subsequently.
9049void __kmp_hard_pause() {
9050 __kmp_pause_status = kmp_hard_paused;
9051 __kmp_internal_end_thread(-1);
9052}
9053
9054// Soft resume sets __kmp_pause_status, and wakes up all threads.
9055void __kmp_resume_if_soft_paused() {
9056 if (__kmp_pause_status == kmp_soft_paused) {
9057 __kmp_pause_status = kmp_not_paused;
9058
9059 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9060 kmp_info_t *thread = __kmp_threads[gtid];
9061 if (thread) { // Wake it if sleeping
9062 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9063 thread);
9064 if (fl.is_sleeping())
9065 fl.resume(gtid);
9066 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9067 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9068 } else { // thread holds the lock and may sleep soon
9069 do { // until either the thread sleeps, or we can get the lock
9070 if (fl.is_sleeping()) {
9071 fl.resume(gtid);
9072 break;
9073 } else if (__kmp_try_suspend_mx(thread)) {
9074 __kmp_unlock_suspend_mx(thread);
9075 break;
9076 }
9077 } while (1);
9078 }
9079 }
9080 }
9081 }
9082}
9083
9084// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9085// TODO: add warning messages
9086int __kmp_pause_resource(kmp_pause_status_t level) {
9087 if (level == kmp_not_paused) { // requesting resume
9088 if (__kmp_pause_status == kmp_not_paused) {
9089 // error message about runtime not being paused, so can't resume
9090 return 1;
9091 } else {
9092 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9093 __kmp_pause_status == kmp_hard_paused);
9094 __kmp_pause_status = kmp_not_paused;
9095 return 0;
9096 }
9097 } else if (level == kmp_soft_paused) { // requesting soft pause
9098 if (__kmp_pause_status != kmp_not_paused) {
9099 // error message about already being paused
9100 return 1;
9101 } else {
9102 __kmp_soft_pause();
9103 return 0;
9104 }
9105 } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9106 // requesting hard pause or stop_tool pause
9107 if (__kmp_pause_status != kmp_not_paused) {
9108 // error message about already being paused
9109 return 1;
9110 } else {
9111 __kmp_hard_pause();
9112 return 0;
9113 }
9114 } else {
9115 // error message about invalid level
9116 return 1;
9117 }
9118}
9119
9120void __kmp_omp_display_env(int verbose) {
9121 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9122 if (__kmp_init_serial == 0)
9123 __kmp_do_serial_initialize();
9124 __kmp_display_env_impl(!verbose, verbose);
9125 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9126}
9127
9128// The team size is changing, so distributed barrier must be modified
9129void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9130 int new_nthreads) {
9131 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9132 bp_dist_bar);
9133 kmp_info_t **other_threads = team->t.t_threads;
9134
9135 // We want all the workers to stop waiting on the barrier while we adjust the
9136 // size of the team.
9137 for (int f = 1; f < old_nthreads; ++f) {
9138 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9139 // Ignore threads that are already inactive or not present in the team
9140 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9141 // teams construct causes thread_limit to get passed in, and some of
9142 // those could be inactive; just ignore them
9143 continue;
9144 }
9145 // If thread is transitioning still to in_use state, wait for it
9146 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9147 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9148 KMP_CPU_PAUSE();
9149 }
9150 // The thread should be in_use now
9151 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9152 // Transition to unused state
9153 team->t.t_threads[f]->th.th_used_in_team.store(2);
9154 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9155 }
9156 // Release all the workers
9157 team->t.b->go_release();
9158
9159 KMP_MFENCE();
9160
9161 // Workers should see transition status 2 and move to 0; but may need to be
9162 // woken up first
9163 int count = old_nthreads - 1;
9164 while (count > 0) {
9165 count = old_nthreads - 1;
9166 for (int f = 1; f < old_nthreads; ++f) {
9167 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9168 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9169 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9170 void *, other_threads[f]->th.th_sleep_loc);
9171 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9172 }
9173 } else {
9174 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9175 count--;
9176 }
9177 }
9178 }
9179 // Now update the barrier size
9180 team->t.b->update_num_threads(new_nthreads);
9181 team->t.b->go_reset();
9182}
9183
9184void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9185 // Add the threads back to the team
9186 KMP_DEBUG_ASSERT(team);
9187 // Threads were paused and pointed at th_used_in_team temporarily during a
9188 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9189 // the thread that it should transition itself back into the team. Then, if
9190 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9191 // to wake it up.
9192 for (int f = 1; f < new_nthreads; ++f) {
9193 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9194 (void)KMP_COMPARE_AND_STORE_ACQ32(
9195 &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
9196 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9197 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9198 (kmp_flag_32<false, false> *)NULL);
9199 }
9200 }
9201 // The threads should be transitioning to the team; when they are done, they
9202 // should have set th_used_in_team to 1. This loop forces master to wait until
9203 // all threads have moved into the team and are waiting in the barrier.
9204 int count = new_nthreads - 1;
9205 while (count > 0) {
9206 count = new_nthreads - 1;
9207 for (int f = 1; f < new_nthreads; ++f) {
9208 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9209 count--;
9210 }
9211 }
9212 }
9213}
9214
9215// Globals and functions for hidden helper task
9216kmp_info_t **__kmp_hidden_helper_threads;
9217kmp_info_t *__kmp_hidden_helper_main_thread;
9218std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9219#if KMP_OS_LINUX
9220kmp_int32 __kmp_hidden_helper_threads_num = 8;
9221kmp_int32 __kmp_enable_hidden_helper = TRUE;
9222#else
9223kmp_int32 __kmp_hidden_helper_threads_num = 0;
9224kmp_int32 __kmp_enable_hidden_helper = FALSE;
9225#endif
9226
9227namespace {
9228std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9229
9230void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9231 // This is an explicit synchronization on all hidden helper threads in case
9232 // that when a regular thread pushes a hidden helper task to one hidden
9233 // helper thread, the thread has not been awaken once since they're released
9234 // by the main thread after creating the team.
9235 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9236 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9237 __kmp_hidden_helper_threads_num)
9238 ;
9239
9240 // If main thread, then wait for signal
9241 if (__kmpc_master(nullptr, *gtid)) {
9242 // First, unset the initial state and release the initial thread
9243 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9244 __kmp_hidden_helper_initz_release();
9245 __kmp_hidden_helper_main_thread_wait();
9246 // Now wake up all worker threads
9247 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9248 __kmp_hidden_helper_worker_thread_signal();
9249 }
9250 }
9251}
9252} // namespace
9253
9254void __kmp_hidden_helper_threads_initz_routine() {
9255 // Create a new root for hidden helper team/threads
9256 const int gtid = __kmp_register_root(TRUE);
9257 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9258 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9259 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9260 __kmp_hidden_helper_threads_num;
9261
9262 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9263
9264 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9265
9266 // Set the initialization flag to FALSE
9267 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9268
9269 __kmp_hidden_helper_threads_deinitz_release();
9270}
9271
9272/* Nesting Mode:
9273 Set via KMP_NESTING_MODE, which takes an integer.
9274 Note: we skip duplicate topology levels, and skip levels with only
9275 one entity.
9276 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9277 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9278 in the topology, and initializes the number of threads at each of those
9279 levels to the number of entities at each level, respectively, below the
9280 entity at the parent level.
9281 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9282 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9283 the user to turn nesting on explicitly. This is an even more experimental
9284 option to this experimental feature, and may change or go away in the
9285 future.
9286*/
9287
9288// Allocate space to store nesting levels
9289void __kmp_init_nesting_mode() {
9290 int levels = KMP_HW_LAST;
9291 __kmp_nesting_mode_nlevels = levels;
9292 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9293 for (int i = 0; i < levels; ++i)
9294 __kmp_nesting_nth_level[i] = 0;
9295 if (__kmp_nested_nth.size < levels) {
9296 __kmp_nested_nth.nth =
9297 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9298 __kmp_nested_nth.size = levels;
9299 }
9300}
9301
9302// Set # threads for top levels of nesting; must be called after topology set
9303void __kmp_set_nesting_mode_threads() {
9304 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9305
9306 if (__kmp_nesting_mode == 1)
9307 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9308 else if (__kmp_nesting_mode > 1)
9309 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9310
9311 if (__kmp_topology) { // use topology info
9312 int loc, hw_level;
9313 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9314 loc < __kmp_nesting_mode_nlevels;
9315 loc++, hw_level++) {
9316 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9317 if (__kmp_nesting_nth_level[loc] == 1)
9318 loc--;
9319 }
9320 // Make sure all cores are used
9321 if (__kmp_nesting_mode > 1 && loc > 1) {
9322 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9323 int num_cores = __kmp_topology->get_count(core_level);
9324 int upper_levels = 1;
9325 for (int level = 0; level < loc - 1; ++level)
9326 upper_levels *= __kmp_nesting_nth_level[level];
9327 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9328 __kmp_nesting_nth_level[loc - 1] =
9329 num_cores / __kmp_nesting_nth_level[loc - 2];
9330 }
9331 __kmp_nesting_mode_nlevels = loc;
9332 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9333 } else { // no topology info available; provide a reasonable guesstimation
9334 if (__kmp_avail_proc >= 4) {
9335 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9336 __kmp_nesting_nth_level[1] = 2;
9337 __kmp_nesting_mode_nlevels = 2;
9338 } else {
9339 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9340 __kmp_nesting_mode_nlevels = 1;
9341 }
9342 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9343 }
9344 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9345 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9346 }
9347 set__nproc(thread, __kmp_nesting_nth_level[0]);
9348 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9349 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9350 if (get__max_active_levels(thread) > 1) {
9351 // if max levels was set, set nesting mode levels to same
9352 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9353 }
9354 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9355 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9356}
9357
9358// Empty symbols to export (see exports_so.txt) when feature is disabled
9359extern "C" {
9360#if !KMP_STATS_ENABLED
9361void __kmp_reset_stats() {}
9362#endif
9363#if !USE_DEBUGGER
9364int __kmp_omp_debug_struct_info = FALSE;
9365int __kmp_debugging = FALSE;
9366#endif
9367#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9368void __kmp_itt_fini_ittlib() {}
9369void __kmp_itt_init_ittlib() {}
9370#endif
9371}
9372
9373// end of file
@ KMP_IDENT_AUTOPAR
Definition kmp.h:192
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs, kmpc_micro microtask,...)
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
#define KMP_INIT_PARTITIONED_TIMERS(name)
Initializes the partitioned timers to begin with name.
Definition kmp_stats.h:940
#define KMP_COUNT_VALUE(name, value)
Adds value to specified timer (name).
Definition kmp_stats.h:898
stats_state_e
the states which a thread can be in
Definition kmp_stats.h:63
sched_type
Definition kmp.h:350
KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid)
@ kmp_sch_auto
Definition kmp.h:357
@ kmp_sch_static
Definition kmp.h:353
@ kmp_sch_guided_chunked
Definition kmp.h:355
Definition kmp.h:227
kmp_int32 flags
Definition kmp.h:229