1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51 // Windows does not need these include files since it doesn't use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113
114/* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get a unique identifier for the executing
116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
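// Editor's note: an illustrative, hedged sketch of the stack-window test the
// function below relies on, using hypothetical simplified names (not part of
// the runtime and not built): a local variable's address is matched against
// each registered thread's [stack_base - stack_size, stack_base] range.
#if 0
#include <cstddef>

struct thread_stack_info { // hypothetical stand-in for ds_stackbase/ds_stacksize
  char *stack_base;        // highest address of the thread's stack
  size_t stack_size;       // number of bytes reserved below stack_base
};

// Return the index whose stack window contains 'addr', or -1 if none matches.
static int find_thread_by_stack_addr(const thread_stack_info *threads, int n,
                                     const char *addr) {
  for (int i = 0; i < n; ++i) {
    // The stack grows down, so a live automatic variable sits at or below the
    // base and no farther away than the stack size.
    if (addr <= threads[i].stack_base &&
        (size_t)(threads[i].stack_base - addr) <= threads[i].stack_size)
      return i;
  }
  return -1;
}
#endif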
117int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
124
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
129
130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131 a parallel region, this was made to return KMP_GTID_DNE to force the caller
132 to run serial_initialize. Every call site must handle KMP_GTID_DNE, or else
133 __kmp_init_gtid must be guaranteed, for this to work. */
134
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
137
138#ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
142 }
143#endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
147 }
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
152
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
163
164 for (i = 0; i < __kmp_threads_capacity; i++) {
165
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
169
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172
173 /* stack grows down -- search through all of the active threads */
174
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
177
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182 return i;
183 }
184 }
185 }
186
187 /* use the thread-specific (TLS) value to try to determine our gtid */
188 KA_TRACE(1000,
189 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190 "thread, using TLS\n"));
191 i = __kmp_gtid_get_specific();
192
193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194
195 /* if we haven't been assigned a gtid, then return the (negative) code */
196 if (i < 0)
197 return i;
198
199 /* dynamically updated stack window for uber threads to avoid get_specific
200 call */
201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202 KMP_FATAL(StackOverflow, i);
203 }
204
205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206 if (stack_addr > stack_base) {
207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210 stack_base);
211 } else {
212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213 stack_base - stack_addr);
214 }
215
216 /* Reprint stack bounds for ubermaster since they have been refined */
217 if (__kmp_storage_map) {
218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221 other_threads[i]->th.th_info.ds.ds_stacksize,
222 "th_%d stack (refinement)", i);
223 }
224 return i;
225}
226
227int __kmp_get_global_thread_id_reg() {
228 int gtid;
229
230 if (!__kmp_init_serial) {
231 gtid = KMP_GTID_DNE;
232 } else
233#ifdef KMP_TDATA_GTID
234 if (TCR_4(__kmp_gtid_mode) >= 3) {
235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236 gtid = __kmp_gtid;
237 } else
238#endif
239 if (TCR_4(__kmp_gtid_mode) >= 2) {
240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241 gtid = __kmp_gtid_get_specific();
242 } else {
243 KA_TRACE(1000,
244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245 gtid = __kmp_get_global_thread_id();
246 }
247
248 /* we must be a new uber master sibling thread */
249 if (gtid == KMP_GTID_DNE) {
250 KA_TRACE(10,
251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252 "Registering a new gtid.\n"));
253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254 if (!__kmp_init_serial) {
255 __kmp_do_serial_initialize();
256 gtid = __kmp_gtid_get_specific();
257 } else {
258 gtid = __kmp_register_root(FALSE);
259 }
260 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262 }
263
264 KMP_DEBUG_ASSERT(gtid >= 0);
265
266 return gtid;
267}
268
269/* caller must hold forkjoin_lock */
270void __kmp_check_stack_overlap(kmp_info_t *th) {
271 int f;
272 char *stack_beg = NULL;
273 char *stack_end = NULL;
274 int gtid;
275
276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277 if (__kmp_storage_map) {
278 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280
281 gtid = __kmp_gtid_from_thread(th);
282
283 if (gtid == KMP_GTID_MONITOR) {
284 __kmp_print_storage_map_gtid(
285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286 "th_%s stack (%s)", "mon",
287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288 } else {
289 __kmp_print_storage_map_gtid(
290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291 "th_%d stack (%s)", gtid,
292 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293 }
294 }
295
296 /* No point in checking ubermaster threads since they use refinement and
297 * cannot overlap */
298 gtid = __kmp_gtid_from_thread(th);
299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300 KA_TRACE(10,
301 ("__kmp_check_stack_overlap: performing extensive checking\n"));
302 if (stack_beg == NULL) {
303 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305 }
306
307 for (f = 0; f < __kmp_threads_capacity; f++) {
308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309
310 if (f_th && f_th != th) {
311 char *other_stack_end =
312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313 char *other_stack_beg =
314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317
318 /* Print the other stack values before the abort */
319 if (__kmp_storage_map)
320 __kmp_print_storage_map_gtid(
321 -1, other_stack_beg, other_stack_end,
322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324
325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326 __kmp_msg_null);
327 }
328 }
329 }
330 }
331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332}
333
334/* ------------------------------------------------------------------------ */
335
336void __kmp_infinite_loop(void) {
337 static int done = FALSE;
338
339 while (!done) {
340 KMP_YIELD(TRUE);
341 }
342}
343
344#define MAX_MESSAGE 512
345
346void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347 char const *format, ...) {
348 char buffer[MAX_MESSAGE];
349 va_list ap;
350
351 va_start(ap, format);
352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353 p2, (unsigned long)size, format);
354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355 __kmp_vprintf(kmp_err, buffer, ap);
356#if KMP_PRINT_DATA_PLACEMENT
357 int node;
358 if (gtid >= 0) {
359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360 if (__kmp_storage_map_verbose) {
361 node = __kmp_get_host_node(p1);
362 if (node < 0) /* doesn't work, so don't try this next time */
363 __kmp_storage_map_verbose = FALSE;
364 else {
365 char *last;
366 int lastNode;
367 int localProc = __kmp_get_cpu_from_gtid(gtid);
368
369 const int page_size = KMP_GET_PAGE_SIZE();
370
371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373 if (localProc >= 0)
374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375 localProc >> 1);
376 else
377 __kmp_printf_no_lock(" GTID %d\n", gtid);
378#if KMP_USE_PRCTL
379 /* The more elaborate format is disabled for now because of the prctl
380 * hanging bug. */
381 do {
382 last = p1;
383 lastNode = node;
384 /* This loop collates adjacent pages with the same host node. */
385 do {
386 (char *)p1 += page_size;
387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389 lastNode);
390 } while (p1 <= p2);
391#else
392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393 (char *)p1 + (page_size - 1),
394 __kmp_get_host_node(p1));
395 if (p1 < p2) {
396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397 (char *)p2 + (page_size - 1),
398 __kmp_get_host_node(p2));
399 }
400#endif
401 }
402 }
403 } else
404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405 }
406#endif /* KMP_PRINT_DATA_PLACEMENT */
407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408
409 va_end(ap);
410}
411
412void __kmp_warn(char const *format, ...) {
413 char buffer[MAX_MESSAGE];
414 va_list ap;
415
416 if (__kmp_generate_warnings == kmp_warnings_off) {
417 return;
418 }
419
420 va_start(ap, format);
421
422 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
423 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
424 __kmp_vprintf(kmp_err, buffer, ap);
425 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
426
427 va_end(ap);
428}
429
430void __kmp_abort_process() {
431 // Later threads may stall here, but that's ok because abort() will kill them.
432 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
433
434 if (__kmp_debug_buf) {
435 __kmp_dump_debug_buffer();
436 }
437
438 if (KMP_OS_WINDOWS) {
439 // Let other threads know of abnormal termination and prevent deadlock
440 // if abort happened during library initialization or shutdown
441 __kmp_global.g.g_abort = SIGABRT;
442
443 /* On Windows* OS by default abort() causes pop-up error box, which stalls
444 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
445 boxes. _set_abort_behavior() works well, but this function is not
446 available in VS7 (this is not a problem for the DLL, but it is a problem for a
447 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
448 help, at least in some versions of MS C RTL.
449
450 It seems the following sequence is the only way to simulate abort() and
451 avoid pop-up error box. */
452 raise(SIGABRT);
453 _exit(3); // Just in case, if signal ignored, exit anyway.
454 } else {
455 __kmp_unregister_library();
456 abort();
457 }
458
459 __kmp_infinite_loop();
460 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
461
462} // __kmp_abort_process
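// Editor's note: a minimal, hedged sketch (standalone, not built with the
// runtime) of the Windows abort sequence described above: raise SIGABRT so the
// termination is still reported as abnormal, then force an exit in case the
// signal is ignored. std::_Exit is used here as a portable stand-in for the
// _exit(3) call above.
#if 0
#include <csignal>
#include <cstdlib>

static void abort_without_popup() {
  std::raise(SIGABRT); // simulate abort() without the pop-up error box
  std::_Exit(3);       // just in case the signal was ignored, exit anyway
}
#endif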
463
464void __kmp_abort_thread(void) {
465 // TODO: Eliminate g_abort global variable and this function.
466 // In case of abort just call abort(), it will kill all the threads.
467 __kmp_infinite_loop();
468} // __kmp_abort_thread
469
470/* Print out the storage map for the major kmp_info_t thread data structures
471 that are allocated together. */
472
473static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
474 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
475 gtid);
476
477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
478 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
479
480 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
481 sizeof(kmp_local_t), "th_%d.th_local", gtid);
482
483 __kmp_print_storage_map_gtid(
484 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
485 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
486
487 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
488 &thr->th.th_bar[bs_plain_barrier + 1],
489 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
490 gtid);
491
492 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
493 &thr->th.th_bar[bs_forkjoin_barrier + 1],
494 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
495 gtid);
496
497#if KMP_FAST_REDUCTION_BARRIER
498 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
499 &thr->th.th_bar[bs_reduction_barrier + 1],
500 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
501 gtid);
502#endif // KMP_FAST_REDUCTION_BARRIER
503}
504
505/* Print out the storage map for the major kmp_team_t team data structures
506 that are allocated together. */
507
508static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
509 int team_id, int num_thr) {
510 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
511 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
512 header, team_id);
513
514 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
515 &team->t.t_bar[bs_last_barrier],
516 sizeof(kmp_balign_team_t) * bs_last_barrier,
517 "%s_%d.t_bar", header, team_id);
518
519 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
520 &team->t.t_bar[bs_plain_barrier + 1],
521 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
522 header, team_id);
523
524 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
525 &team->t.t_bar[bs_forkjoin_barrier + 1],
526 sizeof(kmp_balign_team_t),
527 "%s_%d.t_bar[forkjoin]", header, team_id);
528
529#if KMP_FAST_REDUCTION_BARRIER
530 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
531 &team->t.t_bar[bs_reduction_barrier + 1],
532 sizeof(kmp_balign_team_t),
533 "%s_%d.t_bar[reduction]", header, team_id);
534#endif // KMP_FAST_REDUCTION_BARRIER
535
536 __kmp_print_storage_map_gtid(
537 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
538 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
539
540 __kmp_print_storage_map_gtid(
541 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
542 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
543
544 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
545 &team->t.t_disp_buffer[num_disp_buff],
546 sizeof(dispatch_shared_info_t) * num_disp_buff,
547 "%s_%d.t_disp_buffer", header, team_id);
548}
549
550static void __kmp_init_allocator() {
551 __kmp_init_memkind();
552 __kmp_init_target_mem();
553}
554static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
555
556/* ------------------------------------------------------------------------ */
557
558#if ENABLE_LIBOMPTARGET
559static void __kmp_init_omptarget() {
560 __kmp_init_target_task();
561}
562#endif
563
564/* ------------------------------------------------------------------------ */
565
566#if KMP_DYNAMIC_LIB
567#if KMP_OS_WINDOWS
568
569BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
570 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
571
572 switch (fdwReason) {
573
574 case DLL_PROCESS_ATTACH:
575 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
576
577 return TRUE;
578
579 case DLL_PROCESS_DETACH:
580 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
581
582 // According to Windows* documentation for DllMain entry point:
583 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
584 // lpReserved == NULL when FreeLibrary() is called,
585 // lpReserved != NULL when the process is terminated.
586 // When FreeLibrary() is called, worker threads remain alive. So the
587 // runtime's state is consistent and executing proper shutdown is OK.
588 // When the process is terminated, worker threads have exited or been
589 // forcefully terminated by the OS and only the shutdown thread remains.
590 // This can leave the runtime in an inconsistent state.
591 // Hence, only attempt proper cleanup when FreeLibrary() is called.
592 // Otherwise, rely on OS to reclaim resources.
593 if (lpReserved == NULL)
594 __kmp_internal_end_library(__kmp_gtid_get_specific());
595
596 return TRUE;
597
598 case DLL_THREAD_ATTACH:
599 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
600
601 /* if we want to register new siblings all the time, call
602 * __kmp_get_gtid() here */
603 return TRUE;
604
605 case DLL_THREAD_DETACH:
606 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
607
608 __kmp_internal_end_thread(__kmp_gtid_get_specific());
609 return TRUE;
610 }
611
612 return TRUE;
613}
614
615#endif /* KMP_OS_WINDOWS */
616#endif /* KMP_DYNAMIC_LIB */
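// Editor's note: a hedged, minimal skeleton (not part of the runtime) showing
// only the lpReserved convention documented in the DllMain comment above:
// NULL means FreeLibrary() (safe to shut down), non-NULL means process
// termination (leave cleanup to the OS).
#if 0
#include <windows.h>

BOOL WINAPI ExampleDllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved) {
  if (fdwReason == DLL_PROCESS_DETACH) {
    if (lpReserved == NULL) {
      // FreeLibrary(): worker threads are still alive, so a proper shutdown
      // can be performed here.
    } else {
      // Process termination: rely on the OS to reclaim resources.
    }
  }
  return TRUE;
}
#endif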
617
618/* __kmp_parallel_deo -- Wait until it's our turn. */
619void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
620 int gtid = *gtid_ref;
621#ifdef BUILD_PARALLEL_ORDERED
622 kmp_team_t *team = __kmp_team_from_gtid(gtid);
623#endif /* BUILD_PARALLEL_ORDERED */
624
625 if (__kmp_env_consistency_check) {
626 if (__kmp_threads[gtid]->th.th_root->r.r_active)
627#if KMP_USE_DYNAMIC_LOCK
628 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
629#else
630 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
631#endif
632 }
633#ifdef BUILD_PARALLEL_ORDERED
634 if (!team->t.t_serialized) {
635 KMP_MB();
636 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
637 NULL);
638 KMP_MB();
639 }
640#endif /* BUILD_PARALLEL_ORDERED */
641}
642
643/* __kmp_parallel_dxo -- Signal the next task. */
644void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
645 int gtid = *gtid_ref;
646#ifdef BUILD_PARALLEL_ORDERED
647 int tid = __kmp_tid_from_gtid(gtid);
648 kmp_team_t *team = __kmp_team_from_gtid(gtid);
649#endif /* BUILD_PARALLEL_ORDERED */
650
651 if (__kmp_env_consistency_check) {
652 if (__kmp_threads[gtid]->th.th_root->r.r_active)
653 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
654 }
655#ifdef BUILD_PARALLEL_ORDERED
656 if (!team->t.t_serialized) {
657 KMP_MB(); /* Flush all pending memory write invalidates. */
658
659 /* use the tid of the next thread in this team */
660 /* TODO replace with general release procedure */
661 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
662
663 KMP_MB(); /* Flush all pending memory write invalidates. */
664 }
665#endif /* BUILD_PARALLEL_ORDERED */
666}
667
668/* ------------------------------------------------------------------------ */
669/* The BARRIER for a SINGLE process section is always explicit */
670
671int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
672 int status;
673 kmp_info_t *th;
674 kmp_team_t *team;
675
676 if (!TCR_4(__kmp_init_parallel))
677 __kmp_parallel_initialize();
678 __kmp_resume_if_soft_paused();
679
680 th = __kmp_threads[gtid];
681 team = th->th.th_team;
682 status = 0;
683
684 th->th.th_ident = id_ref;
685
686 if (team->t.t_serialized) {
687 status = 1;
688 } else {
689 kmp_int32 old_this = th->th.th_local.this_construct;
690
691 ++th->th.th_local.this_construct;
692 /* try to set team count to thread count--success means thread got the
693 single block */
694 /* TODO: Should this be acquire or release? */
695 if (team->t.t_construct == old_this) {
696 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
697 th->th.th_local.this_construct);
698 }
699#if USE_ITT_BUILD
700 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
701 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
702 team->t.t_active_level == 1) {
703 // Only report metadata by primary thread of active team at level 1
704 __kmp_itt_metadata_single(id_ref);
705 }
706#endif /* USE_ITT_BUILD */
707 }
708
709 if (__kmp_env_consistency_check) {
710 if (status && push_ws) {
711 __kmp_push_workshare(gtid, ct_psingle, id_ref);
712 } else {
713 __kmp_check_workshare(gtid, ct_psingle, id_ref);
714 }
715 }
716#if USE_ITT_BUILD
717 if (status) {
718 __kmp_itt_single_start(gtid);
719 }
720#endif /* USE_ITT_BUILD */
721 return status;
722}
723
724void __kmp_exit_single(int gtid) {
725#if USE_ITT_BUILD
726 __kmp_itt_single_end(gtid);
727#endif /* USE_ITT_BUILD */
728 if (__kmp_env_consistency_check)
729 __kmp_pop_workshare(gtid, ct_psingle, NULL);
730}
731
732 /* determine if we can go parallel or must use a serialized parallel region and
733 * how many threads we can use
734 * set_nthreads is the number of threads requested for the team
735 * returns 1 if we should serialize or only use one thread,
736 * otherwise the number of threads to use
737 * The forkjoin lock is held by the caller. */
738static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
739 int master_tid, int set_nthreads,
740 int enter_teams) {
741 int capacity;
742 int new_nthreads;
743 KMP_DEBUG_ASSERT(__kmp_init_serial);
744 KMP_DEBUG_ASSERT(root && parent_team);
745 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
746
747 // If dyn-var is set, dynamically adjust the number of desired threads,
748 // according to the method specified by dynamic_mode.
749 new_nthreads = set_nthreads;
750 if (!get__dynamic_2(parent_team, master_tid)) {
751 ;
752 }
753#ifdef USE_LOAD_BALANCE
754 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
755 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
756 if (new_nthreads == 1) {
757 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
758 "reservation to 1 thread\n",
759 master_tid));
760 return 1;
761 }
762 if (new_nthreads < set_nthreads) {
763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
764 "reservation to %d threads\n",
765 master_tid, new_nthreads));
766 }
767 }
768#endif /* USE_LOAD_BALANCE */
769 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
770 new_nthreads = __kmp_avail_proc - __kmp_nth +
771 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
772 if (new_nthreads <= 1) {
773 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
774 "reservation to 1 thread\n",
775 master_tid));
776 return 1;
777 }
778 if (new_nthreads < set_nthreads) {
779 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
780 "reservation to %d threads\n",
781 master_tid, new_nthreads));
782 } else {
783 new_nthreads = set_nthreads;
784 }
785 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
786 if (set_nthreads > 2) {
787 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
788 new_nthreads = (new_nthreads % set_nthreads) + 1;
789 if (new_nthreads == 1) {
790 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
791 "reservation to 1 thread\n",
792 master_tid));
793 return 1;
794 }
795 if (new_nthreads < set_nthreads) {
796 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
797 "reservation to %d threads\n",
798 master_tid, new_nthreads));
799 }
800 }
801 } else {
802 KMP_ASSERT(0);
803 }
804
805 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
806 if (__kmp_nth + new_nthreads -
807 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
808 __kmp_max_nth) {
809 int tl_nthreads = __kmp_max_nth - __kmp_nth +
810 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
811 if (tl_nthreads <= 0) {
812 tl_nthreads = 1;
813 }
814
815 // If dyn-var is false, emit a 1-time warning.
816 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
817 __kmp_reserve_warn = 1;
818 __kmp_msg(kmp_ms_warning,
819 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
820 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
821 }
822 if (tl_nthreads == 1) {
823 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
824 "reduced reservation to 1 thread\n",
825 master_tid));
826 return 1;
827 }
828 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
829 "reservation to %d threads\n",
830 master_tid, tl_nthreads));
831 new_nthreads = tl_nthreads;
832 }
833
834 // Respect OMP_THREAD_LIMIT
835 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
836 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
837 if (cg_nthreads + new_nthreads -
838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839 max_cg_threads) {
840 int tl_nthreads = max_cg_threads - cg_nthreads +
841 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842 if (tl_nthreads <= 0) {
843 tl_nthreads = 1;
844 }
845
846 // If dyn-var is false, emit a 1-time warning.
847 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848 __kmp_reserve_warn = 1;
849 __kmp_msg(kmp_ms_warning,
850 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852 }
853 if (tl_nthreads == 1) {
854 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
855 "reduced reservation to 1 thread\n",
856 master_tid));
857 return 1;
858 }
859 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
860 "reservation to %d threads\n",
861 master_tid, tl_nthreads));
862 new_nthreads = tl_nthreads;
863 }
864
865 // Check if the threads array is large enough, or needs expanding.
866 // See comment in __kmp_register_root() about the adjustment if
867 // __kmp_threads[0] == NULL.
868 capacity = __kmp_threads_capacity;
869 if (TCR_PTR(__kmp_threads[0]) == NULL) {
870 --capacity;
871 }
872 // If it is not for initializing the hidden helper team, we need to take
873 // __kmp_hidden_helper_threads_num out of the capacity because it is included
874 // in __kmp_threads_capacity.
875 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
876 capacity -= __kmp_hidden_helper_threads_num;
877 }
878 if (__kmp_nth + new_nthreads -
879 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
880 capacity) {
881 // Expand the threads array.
882 int slotsRequired = __kmp_nth + new_nthreads -
883 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
884 capacity;
885 int slotsAdded = __kmp_expand_threads(slotsRequired);
886 if (slotsAdded < slotsRequired) {
887 // The threads array was not expanded enough.
888 new_nthreads -= (slotsRequired - slotsAdded);
889 KMP_ASSERT(new_nthreads >= 1);
890
891 // If dyn-var is false, emit a 1-time warning.
892 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
893 __kmp_reserve_warn = 1;
894 if (__kmp_tp_cached) {
895 __kmp_msg(kmp_ms_warning,
896 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
897 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
898 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
899 } else {
900 __kmp_msg(kmp_ms_warning,
901 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
902 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
903 }
904 }
905 }
906 }
907
908#ifdef KMP_DEBUG
909 if (new_nthreads == 1) {
910 KC_TRACE(10,
911 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
912 "dead roots and rechecking; requested %d threads\n",
913 __kmp_get_gtid(), set_nthreads));
914 } else {
915 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
916 " %d threads\n",
917 __kmp_get_gtid(), new_nthreads, set_nthreads));
918 }
919#endif // KMP_DEBUG
920 return new_nthreads;
921}
922
923 /* Allocate threads from the thread pool and assign them to the new team. We are
924 assured that there are enough threads available, because we checked for that
925 earlier while holding the forkjoin lock. */
926static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
927 kmp_info_t *master_th, int master_gtid,
928 int fork_teams_workers) {
929 int i;
930 int use_hot_team;
931
932 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
933 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
934 KMP_MB();
935
936 /* first, let's setup the primary thread */
937 master_th->th.th_info.ds.ds_tid = 0;
938 master_th->th.th_team = team;
939 master_th->th.th_team_nproc = team->t.t_nproc;
940 master_th->th.th_team_master = master_th;
941 master_th->th.th_team_serialized = FALSE;
942 master_th->th.th_dispatch = &team->t.t_dispatch[0];
943
944/* make sure we are not the optimized hot team */
945#if KMP_NESTED_HOT_TEAMS
946 use_hot_team = 0;
947 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
948 if (hot_teams) { // hot teams array is not allocated if
949 // KMP_HOT_TEAMS_MAX_LEVEL=0
950 int level = team->t.t_active_level - 1; // index in array of hot teams
951 if (master_th->th.th_teams_microtask) { // are we inside the teams?
952 if (master_th->th.th_teams_size.nteams > 1) {
953 ++level; // level was not increased in teams construct for
954 // team_of_masters
955 }
956 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
957 master_th->th.th_teams_level == team->t.t_level) {
958 ++level; // level was not increased in teams construct for
959 // team_of_workers before the parallel
960 } // team->t.t_level will be increased inside parallel
961 }
962 if (level < __kmp_hot_teams_max_level) {
963 if (hot_teams[level].hot_team) {
964 // hot team has already been allocated for given level
965 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
966 use_hot_team = 1; // the team is ready to use
967 } else {
968 use_hot_team = 0; // AC: threads are not allocated yet
969 hot_teams[level].hot_team = team; // remember new hot team
970 hot_teams[level].hot_team_nth = team->t.t_nproc;
971 }
972 } else {
973 use_hot_team = 0;
974 }
975 }
976#else
977 use_hot_team = team == root->r.r_hot_team;
978#endif
979 if (!use_hot_team) {
980
981 /* install the primary thread */
982 team->t.t_threads[0] = master_th;
983 __kmp_initialize_info(master_th, team, 0, master_gtid);
984
985 /* now, install the worker threads */
986 for (i = 1; i < team->t.t_nproc; i++) {
987
988 /* fork or reallocate a new thread and install it in team */
989 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
990 team->t.t_threads[i] = thr;
991 KMP_DEBUG_ASSERT(thr);
992 KMP_DEBUG_ASSERT(thr->th.th_team == team);
993 /* align team and thread arrived states */
994 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
995 "T#%d(%d:%d) join =%llu, plain=%llu\n",
996 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
997 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
998 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
999 team->t.t_bar[bs_plain_barrier].b_arrived));
1000 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1001 thr->th.th_teams_level = master_th->th.th_teams_level;
1002 thr->th.th_teams_size = master_th->th.th_teams_size;
1003 { // Initialize threads' barrier data.
1004 int b;
1005 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1006 for (b = 0; b < bs_last_barrier; ++b) {
1007 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1008 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1009#if USE_DEBUGGER
1010 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1011#endif
1012 }
1013 }
1014 }
1015
1016#if KMP_AFFINITY_SUPPORTED
1017 // Do not partition the places list for teams construct workers who
1018 // haven't actually been forked to do real work yet. This partitioning
1019 // will take place in the parallel region nested within the teams construct.
1020 if (!fork_teams_workers) {
1021 __kmp_partition_places(team);
1022 }
1023#endif
1024
1025 if (team->t.t_nproc > 1 &&
1026 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1027 team->t.b->update_num_threads(team->t.t_nproc);
1028 __kmp_add_threads_to_team(team, team->t.t_nproc);
1029 }
1030 }
1031
1032 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1033 for (i = 0; i < team->t.t_nproc; i++) {
1034 kmp_info_t *thr = team->t.t_threads[i];
1035 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1036 thr->th.th_prev_level != team->t.t_level) {
1037 team->t.t_display_affinity = 1;
1038 break;
1039 }
1040 }
1041 }
1042
1043 KMP_MB();
1044}
1045
1046#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1047// Propagate any changes to the floating point control registers out to the team
1048// We try to avoid unnecessary writes to the relevant cache line in the team
1049// structure, so we don't make changes unless they are needed.
1050inline static void propagateFPControl(kmp_team_t *team) {
1051 if (__kmp_inherit_fp_control) {
1052 kmp_int16 x87_fpu_control_word;
1053 kmp_uint32 mxcsr;
1054
1055 // Get primary thread's values of FPU control flags (both X87 and vector)
1056 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1057 __kmp_store_mxcsr(&mxcsr);
1058 mxcsr &= KMP_X86_MXCSR_MASK;
1059
1060 // There is no point looking at t_fp_control_saved here.
1061 // If it is TRUE, we still have to update the values if they are different
1062 // from those we now have. If it is FALSE we didn't save anything yet, but
1063 // our objective is the same. We have to ensure that the values in the team
1064 // are the same as those we have.
1065 // So, this code achieves what we need whether or not t_fp_control_saved is
1066 // true. By checking whether the value needs updating we avoid unnecessary
1067 // writes that would put the cache-line into a written state, causing all
1068 // threads in the team to have to read it again.
1069 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1070 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1071 // Although we don't use this value, other code in the runtime wants to know
1072 // whether it should restore them. So we must ensure it is correct.
1073 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1074 } else {
1075 // Similarly here. Don't write to this cache-line in the team structure
1076 // unless we have to.
1077 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1078 }
1079}
1080
1081// Do the opposite, setting the hardware registers to the updated values from
1082// the team.
1083inline static void updateHWFPControl(kmp_team_t *team) {
1084 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1085 // Only reset the fp control regs if they have been changed in the team
1086 // by the parallel region that we are exiting.
1087 kmp_int16 x87_fpu_control_word;
1088 kmp_uint32 mxcsr;
1089 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090 __kmp_store_mxcsr(&mxcsr);
1091 mxcsr &= KMP_X86_MXCSR_MASK;
1092
1093 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1094 __kmp_clear_x87_fpu_status_word();
1095 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1096 }
1097
1098 if (team->t.t_mxcsr != mxcsr) {
1099 __kmp_load_mxcsr(&team->t.t_mxcsr);
1100 }
1101 }
1102}
1103#else
1104#define propagateFPControl(x) ((void)0)
1105#define updateHWFPControl(x) ((void)0)
1106#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
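// Editor's note: a hedged sketch (illustrative only, not part of the runtime)
// of the check-before-write idiom that KMP_CHECK_UPDATE expresses above:
// storing an unchanged value would still dirty the team's cache line and force
// every worker to re-read it, so compare first and write only on change.
#if 0
template <typename T> static void check_update(T &dst, const T &src) {
  if (dst != src) // avoid putting the cache line into a written state
    dst = src;
}
#endif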
1107
1108static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1109 int realloc); // forward declaration
1110
1111 /* Run a parallel region that has been serialized, so it runs only in a team of
1112 the single primary thread. */
1113void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1114 kmp_info_t *this_thr;
1115 kmp_team_t *serial_team;
1116
1117 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1118
1119 /* Skip all this code for autopar serialized loops since it results in
1120 unacceptable overhead */
1121 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1122 return;
1123
1124 if (!TCR_4(__kmp_init_parallel))
1125 __kmp_parallel_initialize();
1126 __kmp_resume_if_soft_paused();
1127
1128 this_thr = __kmp_threads[global_tid];
1129 serial_team = this_thr->th.th_serial_team;
1130
1131 /* utilize the serialized team held by this thread */
1132 KMP_DEBUG_ASSERT(serial_team);
1133 KMP_MB();
1134
1135 if (__kmp_tasking_mode != tskm_immediate_exec) {
1136 KMP_DEBUG_ASSERT(
1137 this_thr->th.th_task_team ==
1138 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1139 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1140 NULL);
1141 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1142 "team %p, new task_team = NULL\n",
1143 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1144 this_thr->th.th_task_team = NULL;
1145 }
1146
1147 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1148 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1149 proc_bind = proc_bind_false;
1150 } else if (proc_bind == proc_bind_default) {
1151 // No proc_bind clause was specified, so use the current value
1152 // of proc-bind-var for this parallel region.
1153 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1154 }
1155 // Reset for next parallel region
1156 this_thr->th.th_set_proc_bind = proc_bind_default;
1157
1158 // Reset num_threads for next parallel region
1159 this_thr->th.th_set_nproc = 0;
1160
1161#if OMPT_SUPPORT
1162 ompt_data_t ompt_parallel_data = ompt_data_none;
1163 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1164 if (ompt_enabled.enabled &&
1165 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1166
1167 ompt_task_info_t *parent_task_info;
1168 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1169
1170 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1171 if (ompt_enabled.ompt_callback_parallel_begin) {
1172 int team_size = 1;
1173
1174 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1175 &(parent_task_info->task_data), &(parent_task_info->frame),
1176 &ompt_parallel_data, team_size,
1177 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1178 }
1179 }
1180#endif // OMPT_SUPPORT
1181
1182 if (this_thr->th.th_team != serial_team) {
1183 // Nested level will be an index in the nested nthreads array
1184 int level = this_thr->th.th_team->t.t_level;
1185
1186 if (serial_team->t.t_serialized) {
1187 /* this serial team was already used
1188 TODO: increase performance by making these locks more specific */
1189 kmp_team_t *new_team;
1190
1191 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1192
1193 new_team =
1194 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1195#if OMPT_SUPPORT
1196 ompt_parallel_data,
1197#endif
1198 proc_bind, &this_thr->th.th_current_task->td_icvs,
1199 0 USE_NESTED_HOT_ARG(NULL));
1200 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1201 KMP_ASSERT(new_team);
1202
1203 /* setup new serialized team and install it */
1204 new_team->t.t_threads[0] = this_thr;
1205 new_team->t.t_parent = this_thr->th.th_team;
1206 serial_team = new_team;
1207 this_thr->th.th_serial_team = serial_team;
1208
1209 KF_TRACE(
1210 10,
1211 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1212 global_tid, serial_team));
1213
1214 /* TODO the above breaks the requirement that if we run out of resources,
1215 then we can still guarantee that serialized teams are ok, since we may
1216 need to allocate a new one */
1217 } else {
1218 KF_TRACE(
1219 10,
1220 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1221 global_tid, serial_team));
1222 }
1223
1224 /* we have to initialize this serial team */
1225 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1226 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1227 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1228 serial_team->t.t_ident = loc;
1229 serial_team->t.t_serialized = 1;
1230 serial_team->t.t_nproc = 1;
1231 serial_team->t.t_parent = this_thr->th.th_team;
1232 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1233 this_thr->th.th_team = serial_team;
1234 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1235
1236 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1237 this_thr->th.th_current_task));
1238 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1239 this_thr->th.th_current_task->td_flags.executing = 0;
1240
1241 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1242
1243 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1244 implicit task for each serialized task represented by
1245 team->t.t_serialized? */
1246 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1247 &this_thr->th.th_current_task->td_parent->td_icvs);
1248
1249 // Thread value exists in the nested nthreads array for the next nested
1250 // level
1251 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1252 this_thr->th.th_current_task->td_icvs.nproc =
1253 __kmp_nested_nth.nth[level + 1];
1254 }
1255
1256 if (__kmp_nested_proc_bind.used &&
1257 (level + 1 < __kmp_nested_proc_bind.used)) {
1258 this_thr->th.th_current_task->td_icvs.proc_bind =
1259 __kmp_nested_proc_bind.bind_types[level + 1];
1260 }
1261
1262#if USE_DEBUGGER
1263 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1264#endif
1265 this_thr->th.th_info.ds.ds_tid = 0;
1266
1267 /* set thread cache values */
1268 this_thr->th.th_team_nproc = 1;
1269 this_thr->th.th_team_master = this_thr;
1270 this_thr->th.th_team_serialized = 1;
1271
1272 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1273 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1274 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1275
1276 propagateFPControl(serial_team);
1277
1278 /* check if we need to allocate dispatch buffers stack */
1279 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1280 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1281 serial_team->t.t_dispatch->th_disp_buffer =
1282 (dispatch_private_info_t *)__kmp_allocate(
1283 sizeof(dispatch_private_info_t));
1284 }
1285 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1286
1287 KMP_MB();
1288
1289 } else {
1290 /* this serialized team is already being used,
1291 * that's fine, just add another nested level */
1292 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1293 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1294 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1295 ++serial_team->t.t_serialized;
1296 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1297
1298 // Nested level will be an index in the nested nthreads array
1299 int level = this_thr->th.th_team->t.t_level;
1300 // Thread value exists in the nested nthreads array for the next nested
1301 // level
1302 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1303 this_thr->th.th_current_task->td_icvs.nproc =
1304 __kmp_nested_nth.nth[level + 1];
1305 }
1306 serial_team->t.t_level++;
1307 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1308 "of serial team %p to %d\n",
1309 global_tid, serial_team, serial_team->t.t_level));
1310
1311 /* allocate/push dispatch buffers stack */
1312 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1313 {
1314 dispatch_private_info_t *disp_buffer =
1315 (dispatch_private_info_t *)__kmp_allocate(
1316 sizeof(dispatch_private_info_t));
1317 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1318 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1319 }
1320 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1321
1322 KMP_MB();
1323 }
1324 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1325
1326 // Perform the display affinity functionality for
1327 // serialized parallel regions
1328 if (__kmp_display_affinity) {
1329 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1330 this_thr->th.th_prev_num_threads != 1) {
1331 // NULL means use the affinity-format-var ICV
1332 __kmp_aux_display_affinity(global_tid, NULL);
1333 this_thr->th.th_prev_level = serial_team->t.t_level;
1334 this_thr->th.th_prev_num_threads = 1;
1335 }
1336 }
1337
1338 if (__kmp_env_consistency_check)
1339 __kmp_push_parallel(global_tid, NULL);
1340#if OMPT_SUPPORT
1341 serial_team->t.ompt_team_info.master_return_address = codeptr;
1342 if (ompt_enabled.enabled &&
1343 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1344 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1345 OMPT_GET_FRAME_ADDRESS(0);
1346
1347 ompt_lw_taskteam_t lw_taskteam;
1348 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1349 &ompt_parallel_data, codeptr);
1350
1351 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1352 // don't use lw_taskteam after linking. Content was swapped.
1353
1354 /* OMPT implicit task begin */
1355 if (ompt_enabled.ompt_callback_implicit_task) {
1356 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1357 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1358 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1359 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1360 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1361 __kmp_tid_from_gtid(global_tid);
1362 }
1363
1364 /* OMPT state */
1365 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1366 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1367 OMPT_GET_FRAME_ADDRESS(0);
1368 }
1369#endif
1370}
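// Editor's note: an illustrative, hedged sketch (hypothetical simplified types,
// not part of the runtime) of the per-level dispatch-buffer stack maintained
// above: each additional nesting level of a serialized parallel pushes a fresh
// buffer onto a singly-linked list headed by th_disp_buffer.
#if 0
struct disp_buffer {   // hypothetical stand-in for dispatch_private_info_t
  disp_buffer *next;
};

static void push_disp_buffer(disp_buffer *&head, disp_buffer *buf) {
  buf->next = head; // new buffer links to the previous top of the stack
  head = buf;       // and becomes the current dispatch buffer
}
#endif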
1371
1372// Test if this fork is for a team closely nested in a teams construct
1373static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1374 microtask_t microtask, int level,
1375 int teams_level, kmp_va_list ap) {
1376 return (master_th->th.th_teams_microtask && ap &&
1377 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1378}
1379
1380// Test if this fork is for the teams construct, i.e. to form the outer league
1381// of teams
1382static inline bool __kmp_is_entering_teams(int active_level, int level,
1383 int teams_level, kmp_va_list ap) {
1384 return ((ap == NULL && active_level == 0) ||
1385 (ap && teams_level > 0 && teams_level == level));
1386}
1387
1388 // AC: This is the start of a parallel region nested inside a teams construct.
1389 // The team is actual (hot); all workers are ready at the fork barrier.
1390 // No lock is needed to initialize the team a bit and then free the workers.
1391static inline int
1392__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1393 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1394 enum fork_context_e call_context, microtask_t microtask,
1395 launch_t invoker, int master_set_numthreads, int level,
1396#if OMPT_SUPPORT
1397 ompt_data_t ompt_parallel_data, void *return_address,
1398#endif
1399 kmp_va_list ap) {
1400 void **argv;
1401 int i;
1402
1403 parent_team->t.t_ident = loc;
1404 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1405 parent_team->t.t_argc = argc;
1406 argv = (void **)parent_team->t.t_argv;
1407 for (i = argc - 1; i >= 0; --i) {
1408 *argv++ = va_arg(kmp_va_deref(ap), void *);
1409 }
1410 // Increment our nested depth levels, but do not increase the serialization
1411 if (parent_team == master_th->th.th_serial_team) {
1412 // AC: we are in serialized parallel
1413 __kmpc_serialized_parallel(loc, gtid);
1414 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1415
1416 if (call_context == fork_context_gnu) {
1417 // AC: need to decrement t_serialized for enquiry functions to work
1418 // correctly, will restore at join time
1419 parent_team->t.t_serialized--;
1420 return TRUE;
1421 }
1422
1423#if OMPD_SUPPORT
1424 parent_team->t.t_pkfn = microtask;
1425#endif
1426
1427#if OMPT_SUPPORT
1428 void *dummy;
1429 void **exit_frame_p;
1430 ompt_data_t *implicit_task_data;
1431 ompt_lw_taskteam_t lw_taskteam;
1432
1433 if (ompt_enabled.enabled) {
1434 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1435 &ompt_parallel_data, return_address);
1436 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1437
1438 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1439 // Don't use lw_taskteam after linking. Content was swapped.
1440
1441 /* OMPT implicit task begin */
1442 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1443 if (ompt_enabled.ompt_callback_implicit_task) {
1444 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1445 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1446 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1447 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1448 }
1449
1450 /* OMPT state */
1451 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1452 } else {
1453 exit_frame_p = &dummy;
1454 }
1455#endif
1456
1457 // AC: need to decrement t_serialized for enquiry functions to work
1458 // correctly, will restore at join time
1459 parent_team->t.t_serialized--;
1460
1461 {
1462 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1463 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1464 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1465#if OMPT_SUPPORT
1466 ,
1467 exit_frame_p
1468#endif
1469 );
1470 }
1471
1472#if OMPT_SUPPORT
1473 if (ompt_enabled.enabled) {
1474 *exit_frame_p = NULL;
1475 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1476 if (ompt_enabled.ompt_callback_implicit_task) {
1477 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1478 ompt_scope_end, NULL, implicit_task_data, 1,
1479 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1480 }
1481 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1482 __ompt_lw_taskteam_unlink(master_th);
1483 if (ompt_enabled.ompt_callback_parallel_end) {
1484 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1485 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1486 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1487 }
1488 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489 }
1490#endif
1491 return TRUE;
1492 }
1493
1494 parent_team->t.t_pkfn = microtask;
1495 parent_team->t.t_invoke = invoker;
1496 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1497 parent_team->t.t_active_level++;
1498 parent_team->t.t_level++;
1499 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1500
1501 // If the threads allocated to the team are less than the thread limit, update
1502 // the thread limit here. th_teams_size.nth is specific to this team nested
1503 // in a teams construct, the team is fully created, and we're about to do
1504 // the actual fork. Best to do this here so that the subsequent uses below
1505 // and in the join have the correct value.
1506 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1507
1508#if OMPT_SUPPORT
1509 if (ompt_enabled.enabled) {
1510 ompt_lw_taskteam_t lw_taskteam;
1511 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1512 return_address);
1513 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1514 }
1515#endif
1516
1517 /* Change number of threads in the team if requested */
1518 if (master_set_numthreads) { // The parallel has num_threads clause
1519 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1520 // AC: only can reduce number of threads dynamically, can't increase
1521 kmp_info_t **other_threads = parent_team->t.t_threads;
1522 // NOTE: if using distributed barrier, we need to run this code block
1523 // even when the team size appears not to have changed from the max.
1524 int old_proc = master_th->th.th_teams_size.nth;
1525 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1526 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1527 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1528 }
1529 parent_team->t.t_nproc = master_set_numthreads;
1530 for (i = 0; i < master_set_numthreads; ++i) {
1531 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1532 }
1533 }
1534 // Keep extra threads hot in the team for possible next parallels
1535 master_th->th.th_set_nproc = 0;
1536 }
1537
1538#if USE_DEBUGGER
1539 if (__kmp_debugging) { // Let debugger override number of threads.
1540 int nth = __kmp_omp_num_threads(loc);
1541 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1542 master_set_numthreads = nth;
1543 }
1544 }
1545#endif
1546
1547 // Figure out the proc_bind policy for the nested parallel within teams
1548 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1549 // proc_bind_default means don't update
1550 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1551 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1552 proc_bind = proc_bind_false;
1553 } else {
1554 // No proc_bind clause specified; use current proc-bind-var
1555 if (proc_bind == proc_bind_default) {
1556 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1557 }
1558 /* else: The proc_bind policy was specified explicitly on the parallel clause.
1559 This overrides proc-bind-var for this parallel region, but does not
1560 change proc-bind-var. */
1561 // Figure the value of proc-bind-var for the child threads.
1562 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1563 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1564 master_th->th.th_current_task->td_icvs.proc_bind)) {
1565 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1566 }
1567 }
1568 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1569 // Need to change the bind-var ICV to correct value for each implicit task
1570 if (proc_bind_icv != proc_bind_default &&
1571 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1572 kmp_info_t **other_threads = parent_team->t.t_threads;
1573 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1574 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1575 }
1576 }
1577 // Reset for next parallel region
1578 master_th->th.th_set_proc_bind = proc_bind_default;
1579
1580#if USE_ITT_BUILD && USE_ITT_NOTIFY
1581 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1582 KMP_ITT_DEBUG) &&
1583 __kmp_forkjoin_frames_mode == 3 &&
1584 parent_team->t.t_active_level == 1 // only report frames at level 1
1585 && master_th->th.th_teams_size.nteams == 1) {
1586 kmp_uint64 tmp_time = __itt_get_timestamp();
1587 master_th->th.th_frame_time = tmp_time;
1588 parent_team->t.t_region_time = tmp_time;
1589 }
1590 if (__itt_stack_caller_create_ptr) {
1591 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1592 // create new stack stitching id before entering fork barrier
1593 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1594 }
1595#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1596#if KMP_AFFINITY_SUPPORTED
1597 __kmp_partition_places(parent_team);
1598#endif
1599
1600 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1601 "master_th=%p, gtid=%d\n",
1602 root, parent_team, master_th, gtid));
1603 __kmp_internal_fork(loc, gtid, parent_team);
1604 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1605 "master_th=%p, gtid=%d\n",
1606 root, parent_team, master_th, gtid));
1607
1608 if (call_context == fork_context_gnu)
1609 return TRUE;
1610
1611 /* Invoke microtask for PRIMARY thread */
1612 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1613 parent_team->t.t_id, parent_team->t.t_pkfn));
1614
1615 if (!parent_team->t.t_invoke(gtid)) {
1616 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1617 }
1618 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1619 parent_team->t.t_id, parent_team->t.t_pkfn));
1620 KMP_MB(); /* Flush all pending memory write invalidates. */
1621
1622 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1623
1624 return TRUE;
1625}
1626
1627// Create a serialized parallel region
1628static inline int
1629__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1630 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1631 kmp_info_t *master_th, kmp_team_t *parent_team,
1632#if OMPT_SUPPORT
1633 ompt_data_t *ompt_parallel_data, void **return_address,
1634 ompt_data_t **parent_task_data,
1635#endif
1636 kmp_va_list ap) {
1637 kmp_team_t *team;
1638 int i;
1639 void **argv;
1640
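  // Scratch storage for gathering the microtask's varargs before the
  // serialized invocation: a C99 VLA where it is known to work, otherwise
  // memory obtained via KMP_ALLOCA.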
1641/* josh todo: hypothetical question: what do we do for OS X*? */
1642#if KMP_OS_LINUX && \
1643 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1644 void *args[argc];
1645#else
1646 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1647#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1648 KMP_ARCH_AARCH64) */
1649
1650 KA_TRACE(
1651 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1652
1653 __kmpc_serialized_parallel(loc, gtid);
1654
1655#if OMPD_SUPPORT
1656 master_th->th.th_serial_team->t.t_pkfn = microtask;
1657#endif
1658
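  // Intel/LLVM entry point. Three cases follow: a parallel nested inside a
  // teams construct (no va_list; args come from the parent team), the outer
  // "parallel" of a teams construct (run through the special invoker), and
  // the common case where args are read from the va_list.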
1659 if (call_context == fork_context_intel) {
1660 /* TODO this sucks, use the compiler itself to pass args! :) */
1661 master_th->th.th_serial_team->t.t_ident = loc;
1662 if (!ap) {
1663 // revert change made in __kmpc_serialized_parallel()
1664 master_th->th.th_serial_team->t.t_level--;
1665// Get args from parent team for teams construct
1666
1667#if OMPT_SUPPORT
1668 void *dummy;
1669 void **exit_frame_p;
1670 ompt_task_info_t *task_info;
1671 ompt_lw_taskteam_t lw_taskteam;
1672
1673 if (ompt_enabled.enabled) {
1674 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1675 ompt_parallel_data, *return_address);
1676
1677 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1678      // don't use lw_taskteam after linking; its content was swapped
1679 task_info = OMPT_CUR_TASK_INFO(master_th);
1680 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1681 if (ompt_enabled.ompt_callback_implicit_task) {
1682 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1683 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1684 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1685 &(task_info->task_data), 1,
1686 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1687 }
1688
1689 /* OMPT state */
1690 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1691 } else {
1692 exit_frame_p = &dummy;
1693 }
1694#endif
1695
1696 {
1697 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1698 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1699 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1700#if OMPT_SUPPORT
1701 ,
1702 exit_frame_p
1703#endif
1704 );
1705 }
1706
1707#if OMPT_SUPPORT
1708 if (ompt_enabled.enabled) {
1709 *exit_frame_p = NULL;
1710 if (ompt_enabled.ompt_callback_implicit_task) {
1711 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1712 ompt_scope_end, NULL, &(task_info->task_data), 1,
1713 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1714 }
1715 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1716 __ompt_lw_taskteam_unlink(master_th);
1717 if (ompt_enabled.ompt_callback_parallel_end) {
1718 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1719 ompt_parallel_data, *parent_task_data,
1720 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1721 }
1722 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1723 }
1724#endif
1725 } else if (microtask == (microtask_t)__kmp_teams_master) {
1726 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1727 team = master_th->th.th_team;
1728 // team->t.t_pkfn = microtask;
1729 team->t.t_invoke = invoker;
1730 __kmp_alloc_argv_entries(argc, team, TRUE);
1731 team->t.t_argc = argc;
1732 argv = (void **)team->t.t_argv;
1733 if (ap) {
1734 for (i = argc - 1; i >= 0; --i)
1735 *argv++ = va_arg(kmp_va_deref(ap), void *);
1736 } else {
1737 for (i = 0; i < argc; ++i)
1738 // Get args from parent team for teams construct
1739 argv[i] = parent_team->t.t_argv[i];
1740 }
1741 // AC: revert change made in __kmpc_serialized_parallel()
1742 // because initial code in teams should have level=0
1743 team->t.t_level--;
1744 // AC: call special invoker for outer "parallel" of teams construct
1745 invoker(gtid);
1746#if OMPT_SUPPORT
1747 if (ompt_enabled.enabled) {
1748 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1749 if (ompt_enabled.ompt_callback_implicit_task) {
1750 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1751 ompt_scope_end, NULL, &(task_info->task_data), 0,
1752 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1753 }
1754 if (ompt_enabled.ompt_callback_parallel_end) {
1755 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1756 ompt_parallel_data, *parent_task_data,
1757 OMPT_INVOKER(call_context) | ompt_parallel_league,
1758 *return_address);
1759 }
1760 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1761 }
1762#endif
1763 } else {
1764 argv = args;
1765 for (i = argc - 1; i >= 0; --i)
1766 *argv++ = va_arg(kmp_va_deref(ap), void *);
1767 KMP_MB();
1768
1769#if OMPT_SUPPORT
1770 void *dummy;
1771 void **exit_frame_p;
1772 ompt_task_info_t *task_info;
1773 ompt_lw_taskteam_t lw_taskteam;
1774 ompt_data_t *implicit_task_data;
1775
1776 if (ompt_enabled.enabled) {
1777 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1778 ompt_parallel_data, *return_address);
1779 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1780      // don't use lw_taskteam after linking; its content was swapped
1781 task_info = OMPT_CUR_TASK_INFO(master_th);
1782 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1783
1784 /* OMPT implicit task begin */
1785 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1786 if (ompt_enabled.ompt_callback_implicit_task) {
1787 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1788 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1789 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1790 ompt_task_implicit);
1791 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1792 }
1793
1794 /* OMPT state */
1795 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1796 } else {
1797 exit_frame_p = &dummy;
1798 }
1799#endif
1800
1801 {
1802 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1803 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1804 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1805#if OMPT_SUPPORT
1806 ,
1807 exit_frame_p
1808#endif
1809 );
1810 }
1811
1812#if OMPT_SUPPORT
1813 if (ompt_enabled.enabled) {
1814 *exit_frame_p = NULL;
1815 if (ompt_enabled.ompt_callback_implicit_task) {
1816 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1817 ompt_scope_end, NULL, &(task_info->task_data), 1,
1818 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1819 }
1820
1821 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1822 __ompt_lw_taskteam_unlink(master_th);
1823 if (ompt_enabled.ompt_callback_parallel_end) {
1824 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1825 ompt_parallel_data, *parent_task_data,
1826 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1827 }
1828 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1829 }
1830#endif
1831 }
1832 } else if (call_context == fork_context_gnu) {
1833#if OMPT_SUPPORT
1834 if (ompt_enabled.enabled) {
1835 ompt_lw_taskteam_t lwt;
1836 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1837 *return_address);
1838
1839 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1840 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1841 }
1842// don't use lw_taskteam after linking; its content was swapped
1843#endif
1844
1845 // we were called from GNU native code
1846 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1847 return FALSE;
1848 } else {
1849 KMP_ASSERT2(call_context < fork_context_last,
1850 "__kmp_serial_fork_call: unknown fork_context parameter");
1851 }
1852
1853 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1854 KMP_MB();
1855 return FALSE;
1856}
1857
1858/* most of the work for a fork */
1859/* return true if we really went parallel, false if serialized */
1860int __kmp_fork_call(ident_t *loc, int gtid,
1861 enum fork_context_e call_context, // Intel, GNU, ...
1862 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1863 kmp_va_list ap) {
1864 void **argv;
1865 int i;
1866 int master_tid;
1867 int master_this_cons;
1868 kmp_team_t *team;
1869 kmp_team_t *parent_team;
1870 kmp_info_t *master_th;
1871 kmp_root_t *root;
1872 int nthreads;
1873 int master_active;
1874 int master_set_numthreads;
1875 int level;
1876 int active_level;
1877 int teams_level;
1878#if KMP_NESTED_HOT_TEAMS
1879 kmp_hot_team_ptr_t **p_hot_teams;
1880#endif
1881 { // KMP_TIME_BLOCK
1882 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1883 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1884
1885 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1886 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1887 /* Some systems prefer the stack for the root thread(s) to start with */
1888 /* some gap from the parent stack to prevent false sharing. */
1889 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1890 /* These 2 lines below are so this does not get optimized out */
1891 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1892 __kmp_stkpadding += (short)((kmp_int64)dummy);
1893 }
1894
1895 /* initialize if needed */
1896 KMP_DEBUG_ASSERT(
1897 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1898 if (!TCR_4(__kmp_init_parallel))
1899 __kmp_parallel_initialize();
1900 __kmp_resume_if_soft_paused();
1901
1902 /* setup current data */
1903 // AC: potentially unsafe, not in sync with library shutdown,
1904 // __kmp_threads can be freed
1905 master_th = __kmp_threads[gtid];
1906
1907 parent_team = master_th->th.th_team;
1908 master_tid = master_th->th.th_info.ds.ds_tid;
1909 master_this_cons = master_th->th.th_local.this_construct;
1910 root = master_th->th.th_root;
1911 master_active = root->r.r_active;
1912 master_set_numthreads = master_th->th.th_set_nproc;
1913
1914#if OMPT_SUPPORT
1915 ompt_data_t ompt_parallel_data = ompt_data_none;
1916 ompt_data_t *parent_task_data;
1917 ompt_frame_t *ompt_frame;
1918 void *return_address = NULL;
1919
1920 if (ompt_enabled.enabled) {
1921 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1922 NULL, NULL);
1923 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1924 }
1925#endif
1926
1927 // Assign affinity to root thread if it hasn't happened yet
1928 __kmp_assign_root_init_mask();
1929
1930 // Nested level will be an index in the nested nthreads array
1931 level = parent_team->t.t_level;
1932 // used to launch non-serial teams even if nested is not allowed
1933 active_level = parent_team->t.t_active_level;
1934 // needed to check nesting inside the teams
1935 teams_level = master_th->th.th_teams_level;
1936#if KMP_NESTED_HOT_TEAMS
1937 p_hot_teams = &master_th->th.th_hot_teams;
1938 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1939 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1940 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1941 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1942 // it is either actual or not needed (when active_level > 0)
1943 (*p_hot_teams)[0].hot_team_nth = 1;
1944 }
1945#endif
1946
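    // Notify any OMPT tool of parallel-begin before deciding whether the
    // region will actually run in parallel or be serialized.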
1947#if OMPT_SUPPORT
1948 if (ompt_enabled.enabled) {
1949 if (ompt_enabled.ompt_callback_parallel_begin) {
1950 int team_size = master_set_numthreads
1951 ? master_set_numthreads
1952 : get__nproc_2(parent_team, master_tid);
1953 int flags = OMPT_INVOKER(call_context) |
1954 ((microtask == (microtask_t)__kmp_teams_master)
1955 ? ompt_parallel_league
1956 : ompt_parallel_team);
1957 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1958 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1959 return_address);
1960 }
1961 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1962 }
1963#endif
1964
1965 master_th->th.th_ident = loc;
1966
1967 // Parallel closely nested in teams construct:
1968 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1969 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1970 call_context, microtask, invoker,
1971 master_set_numthreads, level,
1972#if OMPT_SUPPORT
1973 ompt_parallel_data, return_address,
1974#endif
1975 ap);
1976 } // End parallel closely nested in teams construct
1977
1978#if KMP_DEBUG
1979 if (__kmp_tasking_mode != tskm_immediate_exec) {
1980 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1981 parent_team->t.t_task_team[master_th->th.th_task_state]);
1982 }
1983#endif
1984
1985 // Need this to happen before we determine the number of threads, not while
1986 // we are allocating the team
1987 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1988
1989 // Determine the number of threads
1990 int enter_teams =
1991 __kmp_is_entering_teams(active_level, level, teams_level, ap);
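    // Serialize the region if the active nesting depth has already reached
    // max-active-levels (unless we are entering a teams construct) or if the
    // library was forced into serial mode.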
1992 if ((!enter_teams &&
1993 (parent_team->t.t_active_level >=
1994 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1995 (__kmp_library == library_serial)) {
1996 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1997 nthreads = 1;
1998 } else {
1999 nthreads = master_set_numthreads
2000 ? master_set_numthreads
2001 // TODO: get nproc directly from current task
2002 : get__nproc_2(parent_team, master_tid);
2003      // Check whether we need to take the forkjoin lock (no need for a
2004      // serialized parallel outside of a teams construct).
2005 if (nthreads > 1) {
2006 /* determine how many new threads we can use */
2007 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2008 /* AC: If we execute teams from parallel region (on host), then teams
2009 should be created but each can only have 1 thread if nesting is
2010 disabled. If teams called from serial region, then teams and their
2011 threads should be created regardless of the nesting setting. */
2012 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2013 nthreads, enter_teams);
2014 if (nthreads == 1) {
2015 // Free lock for single thread execution here; for multi-thread
2016 // execution it will be freed later after team of threads created
2017 // and initialized
2018 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2019 }
2020 }
2021 }
2022 KMP_DEBUG_ASSERT(nthreads > 0);
2023
2024 // If we temporarily changed the set number of threads then restore it now
2025 master_th->th.th_set_nproc = 0;
2026
2027 if (nthreads == 1) {
2028 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2029 invoker, master_th, parent_team,
2030#if OMPT_SUPPORT
2031 &ompt_parallel_data, &return_address,
2032 &parent_task_data,
2033#endif
2034 ap);
2035 } // if (nthreads == 1)
2036
2037 // GEH: only modify the executing flag in the case when not serialized
2038 // serialized case is handled in kmpc_serialized_parallel
2039 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2040 "curtask=%p, curtask_max_aclevel=%d\n",
2041 parent_team->t.t_active_level, master_th,
2042 master_th->th.th_current_task,
2043 master_th->th.th_current_task->td_icvs.max_active_levels));
2044 // TODO: GEH - cannot do this assertion because root thread not set up as
2045 // executing
2046 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2047 master_th->th.th_current_task->td_flags.executing = 0;
2048
2049 if (!master_th->th.th_teams_microtask || level > teams_level) {
2050 /* Increment our nested depth level */
2051 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2052 }
2053
2054 // See if we need to make a copy of the ICVs.
2055 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2056 if ((level + 1 < __kmp_nested_nth.used) &&
2057 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2058 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2059 } else {
2060 nthreads_icv = 0; // don't update
2061 }
2062
2063 // Figure out the proc_bind_policy for the new team.
2064 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2065 // proc_bind_default means don't update
2066 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2067 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2068 proc_bind = proc_bind_false;
2069 } else {
2070 // No proc_bind clause specified; use current proc-bind-var for this
2071 // parallel region
2072 if (proc_bind == proc_bind_default) {
2073 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2074 }
2075 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2076 if (master_th->th.th_teams_microtask &&
2077 microtask == (microtask_t)__kmp_teams_master) {
2078 proc_bind = __kmp_teams_proc_bind;
2079 }
2080 /* else: The proc_bind policy was specified explicitly on parallel clause.
2081 This overrides proc-bind-var for this parallel region, but does not
2082 change proc-bind-var. */
2083 // Figure the value of proc-bind-var for the child threads.
2084 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2085 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2086 master_th->th.th_current_task->td_icvs.proc_bind)) {
2087 // Do not modify the proc bind icv for the two teams construct forks
2088 // They just let the proc bind icv pass through
2089 if (!master_th->th.th_teams_microtask ||
2090 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2091 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2092 }
2093 }
2094
2095 // Reset for next parallel region
2096 master_th->th.th_set_proc_bind = proc_bind_default;
2097
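    // Allocate the new team. If either the nproc or proc-bind ICV must be
    // overridden for the children, build a fresh ICV block first; otherwise
    // the team inherits the primary thread's current ICVs.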
2098 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2099 kmp_internal_control_t new_icvs;
2100 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2101 new_icvs.next = NULL;
2102 if (nthreads_icv > 0) {
2103 new_icvs.nproc = nthreads_icv;
2104 }
2105 if (proc_bind_icv != proc_bind_default) {
2106 new_icvs.proc_bind = proc_bind_icv;
2107 }
2108
2109 /* allocate a new parallel team */
2110 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2111 team = __kmp_allocate_team(root, nthreads, nthreads,
2112#if OMPT_SUPPORT
2113 ompt_parallel_data,
2114#endif
2115 proc_bind, &new_icvs,
2116 argc USE_NESTED_HOT_ARG(master_th));
2117 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2118 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2119 } else {
2120 /* allocate a new parallel team */
2121 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2122 team = __kmp_allocate_team(root, nthreads, nthreads,
2123#if OMPT_SUPPORT
2124 ompt_parallel_data,
2125#endif
2126 proc_bind,
2127 &master_th->th.th_current_task->td_icvs,
2128 argc USE_NESTED_HOT_ARG(master_th));
2129 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2130 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2131 &master_th->th.th_current_task->td_icvs);
2132 }
2133 KF_TRACE(
2134 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2135
2136 /* setup the new team */
2137 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2138 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2139 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2140 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2141 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2142#if OMPT_SUPPORT
2143 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2144 return_address);
2145#endif
2146 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2147 // TODO: parent_team->t.t_level == INT_MAX ???
2148 if (!master_th->th.th_teams_microtask || level > teams_level) {
2149 int new_level = parent_team->t.t_level + 1;
2150 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2151 new_level = parent_team->t.t_active_level + 1;
2152 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2153 } else {
2154 // AC: Do not increase parallel level at start of the teams construct
2155 int new_level = parent_team->t.t_level;
2156 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2157 new_level = parent_team->t.t_active_level;
2158 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2159 }
2160 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2161 // set primary thread's schedule as new run-time schedule
2162 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2163
2164 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2165 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2166
2167 // Update the floating point rounding in the team if required.
2168 propagateFPControl(team);
2169#if OMPD_SUPPORT
2170 if (ompd_state & OMPD_ENABLE_BP)
2171 ompd_bp_parallel_begin();
2172#endif
2173
2174 if (__kmp_tasking_mode != tskm_immediate_exec) {
2175      // Set the primary thread's task team to the team's task team. Unless this
2176      // is a hot team, it should be NULL.
2177 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2178 parent_team->t.t_task_team[master_th->th.th_task_state]);
2179 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2180 "%p, new task_team %p / team %p\n",
2181 __kmp_gtid_from_thread(master_th),
2182 master_th->th.th_task_team, parent_team,
2183 team->t.t_task_team[master_th->th.th_task_state], team));
2184
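      // Preserve the primary thread's task_state by pushing it onto the memo
      // stack (growing the stack if needed); __kmp_join_call pops it again.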
2185 if (active_level || master_th->th.th_task_team) {
2186 // Take a memo of primary thread's task_state
2187 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2188 if (master_th->th.th_task_state_top >=
2189 master_th->th.th_task_state_stack_sz) { // increase size
2190 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2191 kmp_uint8 *old_stack, *new_stack;
2192 kmp_uint32 i;
2193 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2194 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2195 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2196 }
2197 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2198 ++i) { // zero-init rest of stack
2199 new_stack[i] = 0;
2200 }
2201 old_stack = master_th->th.th_task_state_memo_stack;
2202 master_th->th.th_task_state_memo_stack = new_stack;
2203 master_th->th.th_task_state_stack_sz = new_size;
2204 __kmp_free(old_stack);
2205 }
2206 // Store primary thread's task_state on stack
2207 master_th->th
2208 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2209 master_th->th.th_task_state;
2210 master_th->th.th_task_state_top++;
2211#if KMP_NESTED_HOT_TEAMS
2212 if (master_th->th.th_hot_teams &&
2213 active_level < __kmp_hot_teams_max_level &&
2214 team == master_th->th.th_hot_teams[active_level].hot_team) {
2215 // Restore primary thread's nested state if nested hot team
2216 master_th->th.th_task_state =
2217 master_th->th
2218 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2219 } else {
2220#endif
2221 master_th->th.th_task_state = 0;
2222#if KMP_NESTED_HOT_TEAMS
2223 }
2224#endif
2225 }
2226#if !KMP_NESTED_HOT_TEAMS
2227 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2228 (team == root->r.r_hot_team));
2229#endif
2230 }
2231
2232 KA_TRACE(
2233 20,
2234 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2235 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2236 team->t.t_nproc));
2237 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2238 (team->t.t_master_tid == 0 &&
2239 (team->t.t_parent == root->r.r_root_team ||
2240 team->t.t_parent->t.t_serialized)));
2241 KMP_MB();
2242
2243 /* now, setup the arguments */
2244 argv = (void **)team->t.t_argv;
2245 if (ap) {
2246 for (i = argc - 1; i >= 0; --i) {
2247 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2248 KMP_CHECK_UPDATE(*argv, new_argv);
2249 argv++;
2250 }
2251 } else {
2252 for (i = 0; i < argc; ++i) {
2253 // Get args from parent team for teams construct
2254 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2255 }
2256 }
2257
2258 /* now actually fork the threads */
2259 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2260 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2261 root->r.r_active = TRUE;
2262
2263 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2264 __kmp_setup_icv_copy(team, nthreads,
2265 &master_th->th.th_current_task->td_icvs, loc);
2266
2267#if OMPT_SUPPORT
2268 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2269#endif
2270
2271 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2272
2273#if USE_ITT_BUILD
2274 if (team->t.t_active_level == 1 // only report frames at level 1
2275 && !master_th->th.th_teams_microtask) { // not in teams construct
2276#if USE_ITT_NOTIFY
2277 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2278 (__kmp_forkjoin_frames_mode == 3 ||
2279 __kmp_forkjoin_frames_mode == 1)) {
2280 kmp_uint64 tmp_time = 0;
2281 if (__itt_get_timestamp_ptr)
2282 tmp_time = __itt_get_timestamp();
2283 // Internal fork - report frame begin
2284 master_th->th.th_frame_time = tmp_time;
2285 if (__kmp_forkjoin_frames_mode == 3)
2286 team->t.t_region_time = tmp_time;
2287 } else
2288// only one notification scheme (either "submit" or "forking/joined", not both)
2289#endif /* USE_ITT_NOTIFY */
2290 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2291 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2292 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2293 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2294 }
2295 }
2296#endif /* USE_ITT_BUILD */
2297
2298 /* now go on and do the work */
2299 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2300 KMP_MB();
2301 KF_TRACE(10,
2302 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2303 root, team, master_th, gtid));
2304
2305#if USE_ITT_BUILD
2306 if (__itt_stack_caller_create_ptr) {
2307 // create new stack stitching id before entering fork barrier
2308 if (!enter_teams) {
2309 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2310 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2311 } else if (parent_team->t.t_serialized) {
2312 // keep stack stitching id in the serialized parent_team;
2313 // current team will be used for parallel inside the teams;
2314 // if parent_team is active, then it already keeps stack stitching id
2315 // for the league of teams
2316 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2317 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2318 }
2319 }
2320#endif /* USE_ITT_BUILD */
2321
2322 // AC: skip __kmp_internal_fork at teams construct, let only primary
2323 // threads execute
2324 if (ap) {
2325 __kmp_internal_fork(loc, gtid, team);
2326 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2327 "master_th=%p, gtid=%d\n",
2328 root, team, master_th, gtid));
2329 }
2330
2331 if (call_context == fork_context_gnu) {
2332 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2333 return TRUE;
2334 }
2335
2336 /* Invoke microtask for PRIMARY thread */
2337 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2338 team->t.t_id, team->t.t_pkfn));
2339 } // END of timer KMP_fork_call block
2340
2341#if KMP_STATS_ENABLED
2342 // If beginning a teams construct, then change thread state
2343 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2344 if (!ap) {
2345 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2346 }
2347#endif
2348
2349 if (!team->t.t_invoke(gtid)) {
2350 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2351 }
2352
2353#if KMP_STATS_ENABLED
2354 // If was beginning of a teams construct, then reset thread state
2355 if (!ap) {
2356 KMP_SET_THREAD_STATE(previous_state);
2357 }
2358#endif
2359
2360 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2361 team->t.t_id, team->t.t_pkfn));
2362 KMP_MB(); /* Flush all pending memory write invalidates. */
2363
2364 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2365#if OMPT_SUPPORT
2366 if (ompt_enabled.enabled) {
2367 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2368 }
2369#endif
2370
2371 return TRUE;
2372}
2373
2374#if OMPT_SUPPORT
2375static inline void __kmp_join_restore_state(kmp_info_t *thread,
2376 kmp_team_t *team) {
2377 // restore state outside the region
2378 thread->th.ompt_thread_info.state =
2379 ((team->t.t_serialized) ? ompt_state_work_serial
2380 : ompt_state_work_parallel);
2381}
2382
2383static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2384 kmp_team_t *team, ompt_data_t *parallel_data,
2385 int flags, void *codeptr) {
2386 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2387 if (ompt_enabled.ompt_callback_parallel_end) {
2388 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2389 parallel_data, &(task_info->task_data), flags, codeptr);
2390 }
2391
2392 task_info->frame.enter_frame = ompt_data_none;
2393 __kmp_join_restore_state(thread, team);
2394}
2395#endif
2396
2397void __kmp_join_call(ident_t *loc, int gtid
2398#if OMPT_SUPPORT
2399 ,
2400 enum fork_context_e fork_context
2401#endif
2402 ,
2403 int exit_teams) {
2404 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2405 kmp_team_t *team;
2406 kmp_team_t *parent_team;
2407 kmp_info_t *master_th;
2408 kmp_root_t *root;
2409 int master_active;
2410
2411 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2412
2413 /* setup current data */
2414 master_th = __kmp_threads[gtid];
2415 root = master_th->th.th_root;
2416 team = master_th->th.th_team;
2417 parent_team = team->t.t_parent;
2418
2419 master_th->th.th_ident = loc;
2420
2421#if OMPT_SUPPORT
2422 void *team_microtask = (void *)team->t.t_pkfn;
2423 // For GOMP interface with serialized parallel, need the
2424 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2425 // and end-parallel events.
2426 if (ompt_enabled.enabled &&
2427 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2428 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2429 }
2430#endif
2431
2432#if KMP_DEBUG
2433 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2434 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2435 "th_task_team = %p\n",
2436 __kmp_gtid_from_thread(master_th), team,
2437 team->t.t_task_team[master_th->th.th_task_state],
2438 master_th->th.th_task_team));
2439 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2440 team->t.t_task_team[master_th->th.th_task_state]);
2441 }
2442#endif
2443
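  // Serialized region (including a serialized parallel inside a teams
  // construct): adjust nesting counters, end the serialized parallel, and
  // return; there is no join barrier to wait at.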
2444 if (team->t.t_serialized) {
2445 if (master_th->th.th_teams_microtask) {
2446 // We are in teams construct
2447 int level = team->t.t_level;
2448 int tlevel = master_th->th.th_teams_level;
2449 if (level == tlevel) {
2450 // AC: we haven't incremented it earlier at start of teams construct,
2451 // so do it here - at the end of teams construct
2452 team->t.t_level++;
2453 } else if (level == tlevel + 1) {
2454 // AC: we are exiting parallel inside teams, need to increment
2455 // serialization in order to restore it in the next call to
2456 // __kmpc_end_serialized_parallel
2457 team->t.t_serialized++;
2458 }
2459 }
2460    __kmpc_end_serialized_parallel(loc, gtid);
2461
2462#if OMPT_SUPPORT
2463 if (ompt_enabled.enabled) {
2464 if (fork_context == fork_context_gnu) {
2465 __ompt_lw_taskteam_unlink(master_th);
2466 }
2467 __kmp_join_restore_state(master_th, parent_team);
2468 }
2469#endif
2470
2471 return;
2472 }
2473
2474 master_active = team->t.t_master_active;
2475
2476 if (!exit_teams) {
2477 // AC: No barrier for internal teams at exit from teams construct.
2478 // But there is barrier for external team (league).
2479 __kmp_internal_join(loc, gtid, team);
2480#if USE_ITT_BUILD
2481 if (__itt_stack_caller_create_ptr) {
2482 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2483 // destroy the stack stitching id after join barrier
2484 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2485 team->t.t_stack_id = NULL;
2486 }
2487#endif
2488 } else {
2489 master_th->th.th_task_state =
2490 0; // AC: no tasking in teams (out of any parallel)
2491#if USE_ITT_BUILD
2492 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2493 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2494 // destroy the stack stitching id on exit from the teams construct
2495 // if parent_team is active, then the id will be destroyed later on
2496 // by master of the league of teams
2497 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2498 parent_team->t.t_stack_id = NULL;
2499 }
2500#endif
2501 }
2502
2503 KMP_MB();
2504
2505#if OMPT_SUPPORT
2506 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2507 void *codeptr = team->t.ompt_team_info.master_return_address;
2508#endif
2509
2510#if USE_ITT_BUILD
2511 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2512 if (team->t.t_active_level == 1 &&
2513 (!master_th->th.th_teams_microtask || /* not in teams construct */
2514 master_th->th.th_teams_size.nteams == 1)) {
2515 master_th->th.th_ident = loc;
2516 // only one notification scheme (either "submit" or "forking/joined", not
2517 // both)
2518 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2519 __kmp_forkjoin_frames_mode == 3)
2520 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2521 master_th->th.th_frame_time, 0, loc,
2522 master_th->th.th_team_nproc, 1);
2523 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2524 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2525 __kmp_itt_region_joined(gtid);
2526 } // active_level == 1
2527#endif /* USE_ITT_BUILD */
2528
2529#if KMP_AFFINITY_SUPPORTED
2530 if (!exit_teams) {
2531 // Restore master thread's partition.
2532 master_th->th.th_first_place = team->t.t_first_place;
2533 master_th->th.th_last_place = team->t.t_last_place;
2534 }
2535#endif // KMP_AFFINITY_SUPPORTED
2536
2537 if (master_th->th.th_teams_microtask && !exit_teams &&
2538 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2539 team->t.t_level == master_th->th.th_teams_level + 1) {
2540// AC: We need to leave the team structure intact at the end of parallel
2541// inside the teams construct, so that at the next parallel same (hot) team
2542// works, only adjust nesting levels
2543#if OMPT_SUPPORT
2544 ompt_data_t ompt_parallel_data = ompt_data_none;
2545 if (ompt_enabled.enabled) {
2546 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2547 if (ompt_enabled.ompt_callback_implicit_task) {
2548 int ompt_team_size = team->t.t_nproc;
2549 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2550 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2551 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2552 }
2553 task_info->frame.exit_frame = ompt_data_none;
2554 task_info->task_data = ompt_data_none;
2555 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2556 __ompt_lw_taskteam_unlink(master_th);
2557 }
2558#endif
2559 /* Decrement our nested depth level */
2560 team->t.t_level--;
2561 team->t.t_active_level--;
2562 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2563
2564 // Restore number of threads in the team if needed. This code relies on
2565 // the proper adjustment of th_teams_size.nth after the fork in
2566 // __kmp_teams_master on each teams primary thread in the case that
2567 // __kmp_reserve_threads reduced it.
2568 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2569 int old_num = master_th->th.th_team_nproc;
2570 int new_num = master_th->th.th_teams_size.nth;
2571 kmp_info_t **other_threads = team->t.t_threads;
2572 team->t.t_nproc = new_num;
2573 for (int i = 0; i < old_num; ++i) {
2574 other_threads[i]->th.th_team_nproc = new_num;
2575 }
2576 // Adjust states of non-used threads of the team
2577 for (int i = old_num; i < new_num; ++i) {
2578 // Re-initialize thread's barrier data.
2579 KMP_DEBUG_ASSERT(other_threads[i]);
2580 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2581 for (int b = 0; b < bs_last_barrier; ++b) {
2582 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2583 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2584#if USE_DEBUGGER
2585 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2586#endif
2587 }
2588 if (__kmp_tasking_mode != tskm_immediate_exec) {
2589 // Synchronize thread's task state
2590 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2591 }
2592 }
2593 }
2594
2595#if OMPT_SUPPORT
2596 if (ompt_enabled.enabled) {
2597 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2598 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2599 }
2600#endif
2601
2602 return;
2603 }
2604
2605 /* do cleanup and restore the parent team */
2606 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2607 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2608
2609 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2610
2611 /* jc: The following lock has instructions with REL and ACQ semantics,
2612 separating the parallel user code called in this parallel region
2613 from the serial user code called after this function returns. */
2614 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2615
2616 if (!master_th->th.th_teams_microtask ||
2617 team->t.t_level > master_th->th.th_teams_level) {
2618 /* Decrement our nested depth level */
2619 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2620 }
2621 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2622
2623#if OMPT_SUPPORT
2624 if (ompt_enabled.enabled) {
2625 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2626 if (ompt_enabled.ompt_callback_implicit_task) {
2627 int flags = (team_microtask == (void *)__kmp_teams_master)
2628 ? ompt_task_initial
2629 : ompt_task_implicit;
2630 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2631 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2632 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2633 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2634 }
2635 task_info->frame.exit_frame = ompt_data_none;
2636 task_info->task_data = ompt_data_none;
2637 }
2638#endif
2639
2640 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2641 master_th, team));
2642 __kmp_pop_current_task_from_thread(master_th);
2643
2644 master_th->th.th_def_allocator = team->t.t_def_allocator;
2645
2646#if OMPD_SUPPORT
2647 if (ompd_state & OMPD_ENABLE_BP)
2648 ompd_bp_parallel_end();
2649#endif
2650 updateHWFPControl(team);
2651
2652 if (root->r.r_active != master_active)
2653 root->r.r_active = master_active;
2654
2655 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2656 master_th)); // this will free worker threads
2657
2658 /* this race was fun to find. make sure the following is in the critical
2659 region otherwise assertions may fail occasionally since the old team may be
2660 reallocated and the hierarchy appears inconsistent. it is actually safe to
2661 run and won't cause any bugs, but will cause those assertion failures. it's
2662 only one deref&assign so might as well put this in the critical region */
2663 master_th->th.th_team = parent_team;
2664 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2665 master_th->th.th_team_master = parent_team->t.t_threads[0];
2666 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2667
2668 /* restore serialized team, if need be */
2669 if (parent_team->t.t_serialized &&
2670 parent_team != master_th->th.th_serial_team &&
2671 parent_team != root->r.r_root_team) {
2672 __kmp_free_team(root,
2673 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2674 master_th->th.th_serial_team = parent_team;
2675 }
2676
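  // Undo the task_state push done at fork time (if any) and re-attach the
  // primary thread to the parent team's task team.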
2677 if (__kmp_tasking_mode != tskm_immediate_exec) {
2678 if (master_th->th.th_task_state_top >
2679 0) { // Restore task state from memo stack
2680 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2681 // Remember primary thread's state if we re-use this nested hot team
2682 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2683 master_th->th.th_task_state;
2684 --master_th->th.th_task_state_top; // pop
2685 // Now restore state at this level
2686 master_th->th.th_task_state =
2687 master_th->th
2688 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2689 } else if (team != root->r.r_hot_team) {
2690      // Reset the primary thread's task state if this is not the hot team: in
2691      // that case all the worker threads will be freed and their task state
2692      // reset. If the primary's is not reset as well, the task state becomes
2693      // inconsistent.
2694 master_th->th.th_task_state = 0;
2695 }
2696 // Copy the task team from the parent team to the primary thread
2697 master_th->th.th_task_team =
2698 parent_team->t.t_task_team[master_th->th.th_task_state];
2699 KA_TRACE(20,
2700 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2701 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2702 parent_team));
2703 }
2704
2705 // TODO: GEH - cannot do this assertion because root thread not set up as
2706 // executing
2707 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2708 master_th->th.th_current_task->td_flags.executing = 1;
2709
2710 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2711
2712#if KMP_AFFINITY_SUPPORTED
2713 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2714 __kmp_reset_root_init_mask(gtid);
2715 }
2716#endif
2717#if OMPT_SUPPORT
2718 int flags =
2719 OMPT_INVOKER(fork_context) |
2720 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2721 : ompt_parallel_team);
2722 if (ompt_enabled.enabled) {
2723 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2724 codeptr);
2725 }
2726#endif
2727
2728 KMP_MB();
2729 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2730}
2731
2732/* Check whether we should push an internal control record onto the
2733 serial team stack. If so, do it. */
2734void __kmp_save_internal_controls(kmp_info_t *thread) {
2735
2736 if (thread->th.th_team != thread->th.th_serial_team) {
2737 return;
2738 }
2739 if (thread->th.th_team->t.t_serialized > 1) {
2740 int push = 0;
2741
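    // Push at most one record per serialized nesting level: only when no
    // record exists yet, or when the top record was pushed at a different
    // serialization level.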
2742 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2743 push = 1;
2744 } else {
2745 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2746 thread->th.th_team->t.t_serialized) {
2747 push = 1;
2748 }
2749 }
2750 if (push) { /* push a record on the serial team's stack */
2751 kmp_internal_control_t *control =
2752 (kmp_internal_control_t *)__kmp_allocate(
2753 sizeof(kmp_internal_control_t));
2754
2755 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2756
2757 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2758
2759 control->next = thread->th.th_team->t.t_control_stack_top;
2760 thread->th.th_team->t.t_control_stack_top = control;
2761 }
2762 }
2763}
2764
2765/* Changes set_nproc */
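// Backs omp_set_num_threads(). Illustrative user code (not from this file):
//   omp_set_num_threads(4);
//   #pragma omp parallel // subsequent regions now request 4 threads
// The requested value is clamped to [1, __kmp_max_nth] and stored in the
// calling thread's nproc ICV.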
2766void __kmp_set_num_threads(int new_nth, int gtid) {
2767 kmp_info_t *thread;
2768 kmp_root_t *root;
2769
2770 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2771 KMP_DEBUG_ASSERT(__kmp_init_serial);
2772
2773 if (new_nth < 1)
2774 new_nth = 1;
2775 else if (new_nth > __kmp_max_nth)
2776 new_nth = __kmp_max_nth;
2777
2778 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2779 thread = __kmp_threads[gtid];
2780 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2781 return; // nothing to do
2782
2783 __kmp_save_internal_controls(thread);
2784
2785 set__nproc(thread, new_nth);
2786
2787 // If this omp_set_num_threads() call will cause the hot team size to be
2788 // reduced (in the absence of a num_threads clause), then reduce it now,
2789 // rather than waiting for the next parallel region.
2790 root = thread->th.th_root;
2791 if (__kmp_init_parallel && (!root->r.r_active) &&
2792 (root->r.r_hot_team->t.t_nproc > new_nth)
2793#if KMP_NESTED_HOT_TEAMS
2794 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2795#endif
2796 ) {
2797 kmp_team_t *hot_team = root->r.r_hot_team;
2798 int f;
2799
2800 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2801
2802 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2803 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2804 }
2805 // Release the extra threads we don't need any more.
2806 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2807 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2808 if (__kmp_tasking_mode != tskm_immediate_exec) {
2809 // When decreasing team size, threads no longer in the team should unref
2810 // task team.
2811 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2812 }
2813 __kmp_free_thread(hot_team->t.t_threads[f]);
2814 hot_team->t.t_threads[f] = NULL;
2815 }
2816 hot_team->t.t_nproc = new_nth;
2817#if KMP_NESTED_HOT_TEAMS
2818 if (thread->th.th_hot_teams) {
2819 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2820 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2821 }
2822#endif
2823
2824 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2825 hot_team->t.b->update_num_threads(new_nth);
2826 __kmp_add_threads_to_team(hot_team, new_nth);
2827 }
2828
2829 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2830
2831 // Update the t_nproc field in the threads that are still active.
2832 for (f = 0; f < new_nth; f++) {
2833 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2834 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2835 }
2836    // Special flag to indicate that omp_set_num_threads() was called
2837 hot_team->t.t_size_changed = -1;
2838 }
2839}
2840
2841/* Changes max_active_levels */
2842void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2843 kmp_info_t *thread;
2844
2845 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2846 "%d = (%d)\n",
2847 gtid, max_active_levels));
2848 KMP_DEBUG_ASSERT(__kmp_init_serial);
2849
2850 // validate max_active_levels
2851 if (max_active_levels < 0) {
2852 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2853 // We ignore this call if the user has specified a negative value.
2854 // The current setting won't be changed. The last valid setting will be
2855 // used. A warning will be issued (if warnings are allowed as controlled by
2856 // the KMP_WARNINGS env var).
2857 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2858 "max_active_levels for thread %d = (%d)\n",
2859 gtid, max_active_levels));
2860 return;
2861 }
2862 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2863 // it's OK, the max_active_levels is within the valid range: [ 0;
2864 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2865 // We allow a zero value. (implementation defined behavior)
2866 } else {
2867 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2868 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2869 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2870 // Current upper limit is MAX_INT. (implementation defined behavior)
2871 // If the input exceeds the upper limit, we correct the input to be the
2872 // upper limit. (implementation defined behavior)
2873    // In practice, the flow should never get here unless the limit is MAX_INT.
2874 }
2875 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2876 "max_active_levels for thread %d = (%d)\n",
2877 gtid, max_active_levels));
2878
2879 thread = __kmp_threads[gtid];
2880
2881 __kmp_save_internal_controls(thread);
2882
2883 set__max_active_levels(thread, max_active_levels);
2884}
2885
2886/* Gets max_active_levels */
2887int __kmp_get_max_active_levels(int gtid) {
2888 kmp_info_t *thread;
2889
2890 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2891 KMP_DEBUG_ASSERT(__kmp_init_serial);
2892
2893 thread = __kmp_threads[gtid];
2894 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2895 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2896 "curtask_maxaclevel=%d\n",
2897 gtid, thread->th.th_current_task,
2898 thread->th.th_current_task->td_icvs.max_active_levels));
2899 return thread->th.th_current_task->td_icvs.max_active_levels;
2900}
2901
2902// nteams-var per-device ICV
2903void __kmp_set_num_teams(int num_teams) {
2904 if (num_teams > 0)
2905 __kmp_nteams = num_teams;
2906}
2907int __kmp_get_max_teams(void) { return __kmp_nteams; }
2908// teams-thread-limit-var per-device ICV
2909void __kmp_set_teams_thread_limit(int limit) {
2910 if (limit > 0)
2911 __kmp_teams_thread_limit = limit;
2912}
2913int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2914
2915KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2916KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2917
2918/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
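// Backs omp_set_schedule(). Illustrative user code (not from this file):
//   omp_set_schedule(omp_sched_dynamic, 4); // dynamic schedule, chunk of 4
// Standard kinds are translated to internal sched_type values via
// __kmp_sch_map below.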
2919void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2920 kmp_info_t *thread;
2921 kmp_sched_t orig_kind;
2922 // kmp_team_t *team;
2923
2924 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2925 gtid, (int)kind, chunk));
2926 KMP_DEBUG_ASSERT(__kmp_init_serial);
2927
2928 // Check if the kind parameter is valid, correct if needed.
2929 // Valid parameters should fit in one of two intervals - standard or extended:
2930 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2931 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2932 orig_kind = kind;
2933 kind = __kmp_sched_without_mods(kind);
2934
2935 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2936 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2937 // TODO: Hint needs attention in case we change the default schedule.
2938 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2939 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2940 __kmp_msg_null);
2941 kind = kmp_sched_default;
2942 chunk = 0; // ignore chunk value in case of bad kind
2943 }
2944
2945 thread = __kmp_threads[gtid];
2946
2947 __kmp_save_internal_controls(thread);
2948
2949 if (kind < kmp_sched_upper_std) {
2950 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2951      // differentiate static chunked vs. unchunked: chunk should be invalid to
2952      // indicate an unchunked schedule (which is the default)
2953 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2954 } else {
2955 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2956 __kmp_sch_map[kind - kmp_sched_lower - 1];
2957 }
2958 } else {
2959 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2960 // kmp_sched_lower - 2 ];
2961 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2962 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2963 kmp_sched_lower - 2];
2964 }
2965 __kmp_sched_apply_mods_intkind(
2966 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2967 if (kind == kmp_sched_auto || chunk < 1) {
2968 // ignore parameter chunk for schedule auto
2969 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2970 } else {
2971 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2972 }
2973}
2974
2975/* Gets def_sched_var ICV values */
2976void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2977 kmp_info_t *thread;
2978 enum sched_type th_type;
2979
2980 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2981 KMP_DEBUG_ASSERT(__kmp_init_serial);
2982
2983 thread = __kmp_threads[gtid];
2984
2985 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2986 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2987 case kmp_sch_static:
2988 case kmp_sch_static_greedy:
2989 case kmp_sch_static_balanced:
2990 *kind = kmp_sched_static;
2991 __kmp_sched_apply_mods_stdkind(kind, th_type);
2992 *chunk = 0; // chunk was not set, try to show this fact via zero value
2993 return;
2994 case kmp_sch_static_chunked:
2995 *kind = kmp_sched_static;
2996 break;
2997 case kmp_sch_dynamic_chunked:
2998 *kind = kmp_sched_dynamic;
2999 break;
3000  case kmp_sch_guided_chunked:
3001  case kmp_sch_guided_iterative_chunked:
3002 case kmp_sch_guided_analytical_chunked:
3003 *kind = kmp_sched_guided;
3004 break;
3005 case kmp_sch_auto:
3006 *kind = kmp_sched_auto;
3007 break;
3008 case kmp_sch_trapezoidal:
3009 *kind = kmp_sched_trapezoidal;
3010 break;
3011#if KMP_STATIC_STEAL_ENABLED
3012 case kmp_sch_static_steal:
3013 *kind = kmp_sched_static_steal;
3014 break;
3015#endif
3016 default:
3017 KMP_FATAL(UnknownSchedulingType, th_type);
3018 }
3019
3020 __kmp_sched_apply_mods_stdkind(kind, th_type);
3021 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3022}
3023
3024int __kmp_get_ancestor_thread_num(int gtid, int level) {
3025
3026 int ii, dd;
3027 kmp_team_t *team;
3028 kmp_info_t *thr;
3029
3030 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3031 KMP_DEBUG_ASSERT(__kmp_init_serial);
3032
3033 // validate level
3034 if (level == 0)
3035 return 0;
3036 if (level < 0)
3037 return -1;
3038 thr = __kmp_threads[gtid];
3039 team = thr->th.th_team;
3040 ii = team->t.t_level;
3041 if (level > ii)
3042 return -1;
3043
3044 if (thr->th.th_teams_microtask) {
3045 // AC: we are in teams region where multiple nested teams have same level
3046 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3047 if (level <=
3048 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3049 KMP_DEBUG_ASSERT(ii >= tlevel);
3050 // AC: As we need to pass by the teams league, we need to artificially
3051 // increase ii
3052 if (ii == tlevel) {
3053 ii += 2; // three teams have same level
3054 } else {
3055 ii++; // two teams have same level
3056 }
3057 }
3058 }
3059
3060 if (ii == level)
3061 return __kmp_tid_from_gtid(gtid);
3062
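  // Walk up the team hierarchy toward the requested level, first consuming a
  // team's serialized nesting levels and then stepping to its parent team.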
3063 dd = team->t.t_serialized;
3064 level++;
3065 while (ii > level) {
3066 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3067 }
3068 if ((team->t.t_serialized) && (!dd)) {
3069 team = team->t.t_parent;
3070 continue;
3071 }
3072 if (ii > level) {
3073 team = team->t.t_parent;
3074 dd = team->t.t_serialized;
3075 ii--;
3076 }
3077 }
3078
3079 return (dd > 1) ? (0) : (team->t.t_master_tid);
3080}
3081
3082int __kmp_get_team_size(int gtid, int level) {
3083
3084 int ii, dd;
3085 kmp_team_t *team;
3086 kmp_info_t *thr;
3087
3088 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3089 KMP_DEBUG_ASSERT(__kmp_init_serial);
3090
3091 // validate level
3092 if (level == 0)
3093 return 1;
3094 if (level < 0)
3095 return -1;
3096 thr = __kmp_threads[gtid];
3097 team = thr->th.th_team;
3098 ii = team->t.t_level;
3099 if (level > ii)
3100 return -1;
3101
3102 if (thr->th.th_teams_microtask) {
3103 // AC: we are in teams region where multiple nested teams have same level
3104 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3105 if (level <=
3106 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3107 KMP_DEBUG_ASSERT(ii >= tlevel);
3108 // AC: As we need to pass by the teams league, we need to artificially
3109 // increase ii
3110 if (ii == tlevel) {
3111 ii += 2; // three teams have same level
3112 } else {
3113 ii++; // two teams have same level
3114 }
3115 }
3116 }
3117
3118 while (ii > level) {
3119 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3120 }
3121 if (team->t.t_serialized && (!dd)) {
3122 team = team->t.t_parent;
3123 continue;
3124 }
3125 if (ii > level) {
3126 team = team->t.t_parent;
3127 ii--;
3128 }
3129 }
3130
3131 return team->t.t_nproc;
3132}
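// Illustrative sketch (not part of the runtime): __kmp_get_ancestor_thread_num
// and __kmp_get_team_size back the standard omp_get_ancestor_thread_num() and
// omp_get_team_size() queries. A nested-parallel example, assuming nested
// parallelism is enabled via omp_set_max_active_levels():
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(2);
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(3)
  {
    int lvl = omp_get_level(); // 2 inside the inner region
    printf("level=%d outer-tid=%d outer-size=%d inner-tid=%d inner-size=%d\n",
           lvl, omp_get_ancestor_thread_num(1), omp_get_team_size(1),
           omp_get_ancestor_thread_num(lvl), omp_get_team_size(lvl));
  }
  return 0;
}
#endif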
3133
3134kmp_r_sched_t __kmp_get_schedule_global() {
3135 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3136 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3137 // independently, so the updated schedule can be obtained here.
3138
3139 kmp_r_sched_t r_sched;
3140
3141 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3142 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3143 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3144 // different roots (even in OMP 2.5)
3145 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3146 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3147 if (s == kmp_sch_static) {
3148 // replace STATIC with more detailed schedule (balanced or greedy)
3149 r_sched.r_sched_type = __kmp_static;
3150 } else if (s == kmp_sch_guided_chunked) {
3151 // replace GUIDED with more detailed schedule (iterative or analytical)
3152 r_sched.r_sched_type = __kmp_guided;
3153 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3154 r_sched.r_sched_type = __kmp_sched;
3155 }
3156 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3157
3158 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3159 // __kmp_chunk may be wrong here (if it was never set)
3160 r_sched.chunk = KMP_DEFAULT_CHUNK;
3161 } else {
3162 r_sched.chunk = __kmp_chunk;
3163 }
3164
3165 return r_sched;
3166}
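// Illustrative sketch (not part of the runtime): the {sched,chunk} pair built
// here is what a schedule(runtime) loop ends up using, so running a program
// like the one below with OMP_SCHEDULE="guided,8" in the environment should
// select a guided schedule with chunk 8 (a plausible usage, not a test of
// this file).
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  double sum = 0.0;
#pragma omp parallel for schedule(runtime) reduction(+ : sum)
  for (int i = 0; i < 1000000; ++i)
    sum += 1.0 / (i + 1);
  printf("sum=%f\n", sum);
  return 0;
}
#endif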
3167
3168/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3169 at least argc entries in *t_argv for the requested team. */
3170static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3171
3172 KMP_DEBUG_ASSERT(team);
3173 if (!realloc || argc > team->t.t_max_argc) {
3174
3175 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3176 "current entries=%d\n",
3177 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3178 /* if previously allocated heap space for args, free them */
3179 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3180 __kmp_free((void *)team->t.t_argv);
3181
3182 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3183 /* use unused space in the cache line for arguments */
3184 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3185 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3186 "argv entries\n",
3187 team->t.t_id, team->t.t_max_argc));
3188 team->t.t_argv = &team->t.t_inline_argv[0];
3189 if (__kmp_storage_map) {
3190 __kmp_print_storage_map_gtid(
3191 -1, &team->t.t_inline_argv[0],
3192 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3193 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3194 team->t.t_id);
3195 }
3196 } else {
3197 /* allocate space for arguments in the heap */
3198 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3199 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3200 : 2 * argc;
3201 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3202 "argv entries\n",
3203 team->t.t_id, team->t.t_max_argc));
3204 team->t.t_argv =
3205 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3206 if (__kmp_storage_map) {
3207 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3208 &team->t.t_argv[team->t.t_max_argc],
3209 sizeof(void *) * team->t.t_max_argc,
3210 "team_%d.t_argv", team->t.t_id);
3211 }
3212 }
3213 }
3214}
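// Minimal sketch of the sizing policy above (not part of the runtime): small
// argument vectors live in the inline cache-line space, otherwise capacity is
// at least a malloc minimum and grows to 2*argc. The constants here are
// stand-ins for KMP_INLINE_ARGV_ENTRIES / KMP_MIN_MALLOC_ARGV_ENTRIES, whose
// real values are configuration dependent.
#if 0
enum { INLINE_ENTRIES = 4, MIN_MALLOC_ENTRIES = 100 }; // illustrative values

static int argv_capacity_for(int argc) {
  if (argc <= INLINE_ENTRIES)
    return INLINE_ENTRIES; // use the inline t_inline_argv storage
  return (argc <= (MIN_MALLOC_ENTRIES >> 1)) ? MIN_MALLOC_ENTRIES : 2 * argc;
}
#endif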
3215
3216static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3217 int i;
3218 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3219 team->t.t_threads =
3220 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3221 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3222 sizeof(dispatch_shared_info_t) * num_disp_buff);
3223 team->t.t_dispatch =
3224 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3225 team->t.t_implicit_task_taskdata =
3226 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3227 team->t.t_max_nproc = max_nth;
3228
3229 /* setup dispatch buffers */
3230 for (i = 0; i < num_disp_buff; ++i) {
3231 team->t.t_disp_buffer[i].buffer_index = i;
3232 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3233 }
3234}
3235
3236static void __kmp_free_team_arrays(kmp_team_t *team) {
3237 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3238 int i;
3239 for (i = 0; i < team->t.t_max_nproc; ++i) {
3240 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3241 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3242 team->t.t_dispatch[i].th_disp_buffer = NULL;
3243 }
3244 }
3245#if KMP_USE_HIER_SCHED
3246 __kmp_dispatch_free_hierarchies(team);
3247#endif
3248 __kmp_free(team->t.t_threads);
3249 __kmp_free(team->t.t_disp_buffer);
3250 __kmp_free(team->t.t_dispatch);
3251 __kmp_free(team->t.t_implicit_task_taskdata);
3252 team->t.t_threads = NULL;
3253 team->t.t_disp_buffer = NULL;
3254 team->t.t_dispatch = NULL;
3255 team->t.t_implicit_task_taskdata = 0;
3256}
3257
3258static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3259 kmp_info_t **oldThreads = team->t.t_threads;
3260
3261 __kmp_free(team->t.t_disp_buffer);
3262 __kmp_free(team->t.t_dispatch);
3263 __kmp_free(team->t.t_implicit_task_taskdata);
3264 __kmp_allocate_team_arrays(team, max_nth);
3265
3266 KMP_MEMCPY(team->t.t_threads, oldThreads,
3267 team->t.t_nproc * sizeof(kmp_info_t *));
3268
3269 __kmp_free(oldThreads);
3270}
3271
3272static kmp_internal_control_t __kmp_get_global_icvs(void) {
3273
3274 kmp_r_sched_t r_sched =
3275 __kmp_get_schedule_global(); // get current state of scheduling globals
3276
3277 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3278
3279 kmp_internal_control_t g_icvs = {
3280 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3281 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3282 // adjustment of threads (per thread)
3283 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3284 // whether blocktime is explicitly set
3285 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3286#if KMP_USE_MONITOR
3287 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3288// intervals
3289#endif
3290 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3291 // next parallel region (per thread)
3292 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3293 __kmp_cg_max_nth, // int thread_limit;
3294 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3295 // for max_active_levels
3296 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3297 // {sched,chunk} pair
3298 __kmp_nested_proc_bind.bind_types[0],
3299 __kmp_default_device,
3300 NULL // struct kmp_internal_control *next;
3301 };
3302
3303 return g_icvs;
3304}
3305
3306static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3307
3308 kmp_internal_control_t gx_icvs;
3309 gx_icvs.serial_nesting_level =
3310 0; // probably =team->t.t_serial like in save_inter_controls
3311 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3312 gx_icvs.next = NULL;
3313
3314 return gx_icvs;
3315}
3316
3317static void __kmp_initialize_root(kmp_root_t *root) {
3318 int f;
3319 kmp_team_t *root_team;
3320 kmp_team_t *hot_team;
3321 int hot_team_max_nth;
3322 kmp_r_sched_t r_sched =
3323 __kmp_get_schedule_global(); // get current state of scheduling globals
3324 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3325 KMP_DEBUG_ASSERT(root);
3326 KMP_ASSERT(!root->r.r_begin);
3327
3328 /* setup the root state structure */
3329 __kmp_init_lock(&root->r.r_begin_lock);
3330 root->r.r_begin = FALSE;
3331 root->r.r_active = FALSE;
3332 root->r.r_in_parallel = 0;
3333 root->r.r_blocktime = __kmp_dflt_blocktime;
3334#if KMP_AFFINITY_SUPPORTED
3335 root->r.r_affinity_assigned = FALSE;
3336#endif
3337
3338 /* setup the root team for this task */
3339 /* allocate the root team structure */
3340 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3341
3342 root_team =
3343 __kmp_allocate_team(root,
3344 1, // new_nproc
3345 1, // max_nproc
3346#if OMPT_SUPPORT
3347 ompt_data_none, // root parallel id
3348#endif
3349 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3350 0 // argc
3351 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3352 );
3353#if USE_DEBUGGER
3354 // Non-NULL value should be assigned to make the debugger display the root
3355 // team.
3356 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3357#endif
3358
3359 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3360
3361 root->r.r_root_team = root_team;
3362 root_team->t.t_control_stack_top = NULL;
3363
3364 /* initialize root team */
3365 root_team->t.t_threads[0] = NULL;
3366 root_team->t.t_nproc = 1;
3367 root_team->t.t_serialized = 1;
3368 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3369 root_team->t.t_sched.sched = r_sched.sched;
3370 KA_TRACE(
3371 20,
3372 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3373 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3374
3375 /* setup the hot team for this task */
3376 /* allocate the hot team structure */
3377 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3378
3379 hot_team =
3380 __kmp_allocate_team(root,
3381 1, // new_nproc
3382 __kmp_dflt_team_nth_ub * 2, // max_nproc
3383#if OMPT_SUPPORT
3384 ompt_data_none, // root parallel id
3385#endif
3386 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3387 0 // argc
3388 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3389 );
3390 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3391
3392 root->r.r_hot_team = hot_team;
3393 root_team->t.t_control_stack_top = NULL;
3394
3395 /* first-time initialization */
3396 hot_team->t.t_parent = root_team;
3397
3398 /* initialize hot team */
3399 hot_team_max_nth = hot_team->t.t_max_nproc;
3400 for (f = 0; f < hot_team_max_nth; ++f) {
3401 hot_team->t.t_threads[f] = NULL;
3402 }
3403 hot_team->t.t_nproc = 1;
3404 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3405 hot_team->t.t_sched.sched = r_sched.sched;
3406 hot_team->t.t_size_changed = 0;
3407}
3408
3409#ifdef KMP_DEBUG
3410
3411typedef struct kmp_team_list_item {
3412 kmp_team_p const *entry;
3413 struct kmp_team_list_item *next;
3414} kmp_team_list_item_t;
3415typedef kmp_team_list_item_t *kmp_team_list_t;
3416
3417static void __kmp_print_structure_team_accum( // Add team to list of teams.
3418 kmp_team_list_t list, // List of teams.
3419 kmp_team_p const *team // Team to add.
3420) {
3421
3422 // List must terminate with item where both entry and next are NULL.
3423 // Team is added to the list only once.
3424 // List is sorted in ascending order by team id.
3425 // Team id is *not* a key.
3426
3427 kmp_team_list_t l;
3428
3429 KMP_DEBUG_ASSERT(list != NULL);
3430 if (team == NULL) {
3431 return;
3432 }
3433
3434 __kmp_print_structure_team_accum(list, team->t.t_parent);
3435 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3436
3437 // Search list for the team.
3438 l = list;
3439 while (l->next != NULL && l->entry != team) {
3440 l = l->next;
3441 }
3442 if (l->next != NULL) {
3443 return; // Team has been added before, exit.
3444 }
3445
3446 // Team is not found. Search list again for insertion point.
3447 l = list;
3448 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3449 l = l->next;
3450 }
3451
3452 // Insert team.
3453 {
3454 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3455 sizeof(kmp_team_list_item_t));
3456 *item = *l;
3457 l->entry = team;
3458 l->next = item;
3459 }
3460}
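// A minimal standalone sketch of the insertion trick used above (not part of
// the runtime): the list keeps a sentinel tail node whose fields are unset,
// and a new element is inserted *before* node l by copying l into a fresh
// node and then overwriting l in place, so no "previous" pointer is needed.
#if 0
#include <stdlib.h>

typedef struct node {
  int value; // unset in the sentinel tail of this sketch
  struct node *next;
} node_t;

static void sorted_insert(node_t *list, int value) {
  node_t *l = list;
  while (l->next != NULL && l->value <= value)
    l = l->next;
  node_t *item = (node_t *)malloc(sizeof(node_t));
  *item = *l; // old contents of l move into the new node
  l->value = value; // l itself becomes the inserted element
  l->next = item;
}
#endif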
3461
3462static void __kmp_print_structure_team(char const *title,
3463 kmp_team_p const *team) {
3464
3465 __kmp_printf("%s", title);
3466 if (team != NULL) {
3467 __kmp_printf("%2x %p\n", team->t.t_id, team);
3468 } else {
3469 __kmp_printf(" - (nil)\n");
3470 }
3471}
3472
3473static void __kmp_print_structure_thread(char const *title,
3474 kmp_info_p const *thread) {
3475 __kmp_printf("%s", title);
3476 if (thread != NULL) {
3477 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3478 } else {
3479 __kmp_printf(" - (nil)\n");
3480 }
3481}
3482
3483void __kmp_print_structure(void) {
3484
3485 kmp_team_list_t list;
3486
3487 // Initialize list of teams.
3488 list =
3489 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3490 list->entry = NULL;
3491 list->next = NULL;
3492
3493 __kmp_printf("\n------------------------------\nGlobal Thread "
3494 "Table\n------------------------------\n");
3495 {
3496 int gtid;
3497 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3498 __kmp_printf("%2d", gtid);
3499 if (__kmp_threads != NULL) {
3500 __kmp_printf(" %p", __kmp_threads[gtid]);
3501 }
3502 if (__kmp_root != NULL) {
3503 __kmp_printf(" %p", __kmp_root[gtid]);
3504 }
3505 __kmp_printf("\n");
3506 }
3507 }
3508
3509 // Print out __kmp_threads array.
3510 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3511 "----------\n");
3512 if (__kmp_threads != NULL) {
3513 int gtid;
3514 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3515 kmp_info_t const *thread = __kmp_threads[gtid];
3516 if (thread != NULL) {
3517 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3518 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3519 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3520 __kmp_print_structure_team(" Serial Team: ",
3521 thread->th.th_serial_team);
3522 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3523 __kmp_print_structure_thread(" Primary: ",
3524 thread->th.th_team_master);
3525 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3526 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3527 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3528 __kmp_print_structure_thread(" Next in pool: ",
3529 thread->th.th_next_pool);
3530 __kmp_printf("\n");
3531 __kmp_print_structure_team_accum(list, thread->th.th_team);
3532 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3533 }
3534 }
3535 } else {
3536 __kmp_printf("Threads array is not allocated.\n");
3537 }
3538
3539 // Print out __kmp_root array.
3540 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3541 "--------\n");
3542 if (__kmp_root != NULL) {
3543 int gtid;
3544 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3545 kmp_root_t const *root = __kmp_root[gtid];
3546 if (root != NULL) {
3547 __kmp_printf("GTID %2d %p:\n", gtid, root);
3548 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3549 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3550 __kmp_print_structure_thread(" Uber Thread: ",
3551 root->r.r_uber_thread);
3552 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3553 __kmp_printf(" In Parallel: %2d\n",
3554 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3555 __kmp_printf("\n");
3556 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3557 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3558 }
3559 }
3560 } else {
3561 __kmp_printf("Ubers array is not allocated.\n");
3562 }
3563
3564 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3565 "--------\n");
3566 while (list->next != NULL) {
3567 kmp_team_p const *team = list->entry;
3568 int i;
3569 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3570 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3571 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3572 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3573 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3574 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3575 for (i = 0; i < team->t.t_nproc; ++i) {
3576 __kmp_printf(" Thread %2d: ", i);
3577 __kmp_print_structure_thread("", team->t.t_threads[i]);
3578 }
3579 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3580 __kmp_printf("\n");
3581 list = list->next;
3582 }
3583
3584 // Print out __kmp_thread_pool and __kmp_team_pool.
3585 __kmp_printf("\n------------------------------\nPools\n----------------------"
3586 "--------\n");
3587 __kmp_print_structure_thread("Thread pool: ",
3588 CCAST(kmp_info_t *, __kmp_thread_pool));
3589 __kmp_print_structure_team("Team pool: ",
3590 CCAST(kmp_team_t *, __kmp_team_pool));
3591 __kmp_printf("\n");
3592
3593 // Free team list.
3594 while (list != NULL) {
3595 kmp_team_list_item_t *item = list;
3596 list = list->next;
3597 KMP_INTERNAL_FREE(item);
3598 }
3599}
3600
3601#endif
3602
3603//---------------------------------------------------------------------------
3604// Stuff for per-thread fast random number generator
3605// Table of primes
3606static const unsigned __kmp_primes[] = {
3607 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3608 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3609 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3610 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3611 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3612 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3613 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3614 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3615 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3616 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3617 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3618
3619//---------------------------------------------------------------------------
3620// __kmp_get_random: Get a random number using a linear congruential method.
3621unsigned short __kmp_get_random(kmp_info_t *thread) {
3622 unsigned x = thread->th.th_x;
3623 unsigned short r = (unsigned short)(x >> 16);
3624
3625 thread->th.th_x = x * thread->th.th_a + 1;
3626
3627 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3628 thread->th.th_info.ds.ds_tid, r));
3629
3630 return r;
3631}
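// Minimal sketch of the generator above (not part of the runtime): a linear
// congruential recurrence x(n+1) = a*x(n) + 1 (mod 2^32) with a per-thread
// prime multiplier, returning the high 16 bits of the state, which are better
// mixed than the low bits.
#if 0
typedef struct {
  unsigned a; // per-thread multiplier, picked from a prime table
  unsigned x; // current state
} tiny_lcg_t;

static unsigned short tiny_lcg_next(tiny_lcg_t *g) {
  unsigned short r = (unsigned short)(g->x >> 16); // high half of the state
  g->x = g->x * g->a + 1; // wraps mod 2^32 for a 32-bit unsigned
  return r;
}
#endif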
3632//--------------------------------------------------------
3633// __kmp_init_random: Initialize a random number generator
3634void __kmp_init_random(kmp_info_t *thread) {
3635 unsigned seed = thread->th.th_info.ds.ds_tid;
3636
3637 thread->th.th_a =
3638 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3639 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3640 KA_TRACE(30,
3641 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3642}
3643
3644#if KMP_OS_WINDOWS
3645/* reclaim array entries for root threads that are already dead, returns number
3646 * reclaimed */
3647static int __kmp_reclaim_dead_roots(void) {
3648 int i, r = 0;
3649
3650 for (i = 0; i < __kmp_threads_capacity; ++i) {
3651 if (KMP_UBER_GTID(i) &&
3652 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3653 !__kmp_root[i]
3654 ->r.r_active) { // AC: reclaim only roots died in non-active state
3655 r += __kmp_unregister_root_other_thread(i);
3656 }
3657 }
3658 return r;
3659}
3660#endif
3661
3662/* This function attempts to create free entries in __kmp_threads and
3663 __kmp_root, and returns the number of free entries generated.
3664
3665 For Windows* OS static library, the first mechanism used is to reclaim array
3666 entries for root threads that are already dead.
3667
3668 On all platforms, expansion is attempted on the arrays __kmp_threads and
3669 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3670 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3671 threadprivate cache array has been created. Synchronization with
3672 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3673
3674 After any dead root reclamation, if the clipping value allows array expansion
3675 to result in the generation of a total of nNeed free slots, the function does
3676 that expansion. If not, nothing is done beyond the possible initial root
3677 thread reclamation.
3678
3679 If any argument is negative, the behavior is undefined. */
3680static int __kmp_expand_threads(int nNeed) {
3681 int added = 0;
3682 int minimumRequiredCapacity;
3683 int newCapacity;
3684 kmp_info_t **newThreads;
3685 kmp_root_t **newRoot;
3686
3687 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3688 // resizing __kmp_threads does not need additional protection if foreign
3689 // threads are present
3690
3691#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3692 /* only for Windows static library */
3693 /* reclaim array entries for root threads that are already dead */
3694 added = __kmp_reclaim_dead_roots();
3695
3696 if (nNeed) {
3697 nNeed -= added;
3698 if (nNeed < 0)
3699 nNeed = 0;
3700 }
3701#endif
3702 if (nNeed <= 0)
3703 return added;
3704
3705 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3706 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3707 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3708 // > __kmp_max_nth in one of two ways:
3709 //
3710 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3711 // may not be reused by another thread, so we may need to increase
3712 // __kmp_threads_capacity to __kmp_max_nth + 1.
3713 //
3714 // 2) New foreign root(s) are encountered. We always register new foreign
3715 // roots. This may cause a smaller # of threads to be allocated at
3716 // subsequent parallel regions, but the worker threads hang around (and
3717 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3718 //
3719 // Anyway, that is the reason for moving the check to see if
3720 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3721 // instead of having it performed here. -BB
3722
3723 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3724
3725 /* compute expansion headroom to check if we can expand */
3726 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3727 /* possible expansion too small -- give up */
3728 return added;
3729 }
3730 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3731
3732 newCapacity = __kmp_threads_capacity;
3733 do {
3734 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3735 : __kmp_sys_max_nth;
3736 } while (newCapacity < minimumRequiredCapacity);
3737 newThreads = (kmp_info_t **)__kmp_allocate(
3738 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3739 newRoot =
3740 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3741 KMP_MEMCPY(newThreads, __kmp_threads,
3742 __kmp_threads_capacity * sizeof(kmp_info_t *));
3743 KMP_MEMCPY(newRoot, __kmp_root,
3744 __kmp_threads_capacity * sizeof(kmp_root_t *));
3745 // Put old __kmp_threads array on a list. Any ongoing references to the old
3746 // list will be valid. This list is cleaned up at library shutdown.
3747 kmp_old_threads_list_t *node =
3748 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3749 node->threads = __kmp_threads;
3750 node->next = __kmp_old_threads_list;
3751 __kmp_old_threads_list = node;
3752
3753 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3754 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3755 added += newCapacity - __kmp_threads_capacity;
3756 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3757
3758 if (newCapacity > __kmp_tp_capacity) {
3759 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3760 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3761 __kmp_threadprivate_resize_cache(newCapacity);
3762 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3763 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3764 }
3765 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3766 }
3767
3768 return added;
3769}
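// Minimal sketch of the growth computation above (not part of the runtime):
// capacity is doubled until it covers the requested headroom, clipping at the
// system maximum so the shift cannot overflow.
#if 0
static int grow_capacity(int current, int needed, int sys_max) {
  int minimum_required = current + needed;
  int capacity = current;
  do {
    capacity = (capacity <= (sys_max >> 1)) ? (capacity << 1) : sys_max;
  } while (capacity < minimum_required);
  return capacity; // caller must have checked sys_max - current >= needed
}
#endif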
3770
3771/* Register the current thread as a root thread and obtain our gtid. We must
3772 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3773 thread that calls from __kmp_do_serial_initialize() */
3774int __kmp_register_root(int initial_thread) {
3775 kmp_info_t *root_thread;
3776 kmp_root_t *root;
3777 int gtid;
3778 int capacity;
3779 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3780 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3781 KMP_MB();
3782
3783 /* 2007-03-02:
3784 If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3785 not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3786 does not work as expected -- it may return false (meaning there is at least
3787 one empty slot in the __kmp_threads array), but it is possible that the only
3788 free slot is #0, which is reserved for the initial thread and so cannot be
3789 used for this one. The following code works around this bug.
3790
3791 However, the right solution seems to be not reserving slot #0 for the
3792 initial thread because:
3793 (1) there is no magic in slot #0,
3794 (2) we cannot reliably detect the initial thread (the first thread that
3795 performs serial initialization may not be the real initial thread).
3796 */
3797 capacity = __kmp_threads_capacity;
3798 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3799 --capacity;
3800 }
3801
3802 // If it is not for initializing the hidden helper team, we need to take
3803 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3804 // in __kmp_threads_capacity.
3805 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3806 capacity -= __kmp_hidden_helper_threads_num;
3807 }
3808
3809 /* see if there are too many threads */
3810 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3811 if (__kmp_tp_cached) {
3812 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3813 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3814 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3815 } else {
3816 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3817 __kmp_msg_null);
3818 }
3819 }
3820
3821 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3822 // 0: initial thread, also a regular OpenMP thread.
3823 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3824 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3825 // regular OpenMP threads.
3826 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3827 // Find an available thread slot for hidden helper thread. Slots for hidden
3828 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3829 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3830 gtid <= __kmp_hidden_helper_threads_num;
3831 gtid++)
3832 ;
3833 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3834 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3835 "hidden helper thread: T#%d\n",
3836 gtid));
3837 } else {
3838 /* find an available thread slot */
3839 // Don't reassign the zero slot since we need that to only be used by
3840 // initial thread. Slots for hidden helper threads should also be skipped.
3841 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3842 gtid = 0;
3843 } else {
3844 for (gtid = __kmp_hidden_helper_threads_num + 1;
3845 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3846 ;
3847 }
3848 KA_TRACE(
3849 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3850 KMP_ASSERT(gtid < __kmp_threads_capacity);
3851 }
3852
3853 /* update global accounting */
3854 __kmp_all_nth++;
3855 TCW_4(__kmp_nth, __kmp_nth + 1);
3856
3857 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3858 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3859 if (__kmp_adjust_gtid_mode) {
3860 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3861 if (TCR_4(__kmp_gtid_mode) != 2) {
3862 TCW_4(__kmp_gtid_mode, 2);
3863 }
3864 } else {
3865 if (TCR_4(__kmp_gtid_mode) != 1) {
3866 TCW_4(__kmp_gtid_mode, 1);
3867 }
3868 }
3869 }
3870
3871#ifdef KMP_ADJUST_BLOCKTIME
3872 /* Adjust blocktime to zero if necessary */
3873 /* Middle initialization might not have occurred yet */
3874 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3875 if (__kmp_nth > __kmp_avail_proc) {
3876 __kmp_zero_bt = TRUE;
3877 }
3878 }
3879#endif /* KMP_ADJUST_BLOCKTIME */
3880
3881 /* setup this new hierarchy */
3882 if (!(root = __kmp_root[gtid])) {
3883 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3884 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3885 }
3886
3887#if KMP_STATS_ENABLED
3888 // Initialize stats as soon as possible (right after gtid assignment).
3889 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3890 __kmp_stats_thread_ptr->startLife();
3891 KMP_SET_THREAD_STATE(SERIAL_REGION);
3892 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3893#endif
3894 __kmp_initialize_root(root);
3895
3896 /* setup new root thread structure */
3897 if (root->r.r_uber_thread) {
3898 root_thread = root->r.r_uber_thread;
3899 } else {
3900 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3901 if (__kmp_storage_map) {
3902 __kmp_print_thread_storage_map(root_thread, gtid);
3903 }
3904 root_thread->th.th_info.ds.ds_gtid = gtid;
3905#if OMPT_SUPPORT
3906 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3907#endif
3908 root_thread->th.th_root = root;
3909 if (__kmp_env_consistency_check) {
3910 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3911 }
3912#if USE_FAST_MEMORY
3913 __kmp_initialize_fast_memory(root_thread);
3914#endif /* USE_FAST_MEMORY */
3915
3916#if KMP_USE_BGET
3917 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3918 __kmp_initialize_bget(root_thread);
3919#endif
3920 __kmp_init_random(root_thread); // Initialize random number generator
3921 }
3922
3923 /* setup the serial team held in reserve by the root thread */
3924 if (!root_thread->th.th_serial_team) {
3925 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3926 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3927 root_thread->th.th_serial_team = __kmp_allocate_team(
3928 root, 1, 1,
3929#if OMPT_SUPPORT
3930 ompt_data_none, // root parallel id
3931#endif
3932 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3933 }
3934 KMP_ASSERT(root_thread->th.th_serial_team);
3935 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3936 root_thread->th.th_serial_team));
3937
3938 /* drop root_thread into place */
3939 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3940
3941 root->r.r_root_team->t.t_threads[0] = root_thread;
3942 root->r.r_hot_team->t.t_threads[0] = root_thread;
3943 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3944 // AC: the team created in reserve, not for execution (it is unused for now).
3945 root_thread->th.th_serial_team->t.t_serialized = 0;
3946 root->r.r_uber_thread = root_thread;
3947
3948 /* initialize the thread, get it ready to go */
3949 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3950 TCW_4(__kmp_init_gtid, TRUE);
3951
3952 /* prepare the primary thread for get_gtid() */
3953 __kmp_gtid_set_specific(gtid);
3954
3955#if USE_ITT_BUILD
3956 __kmp_itt_thread_name(gtid);
3957#endif /* USE_ITT_BUILD */
3958
3959#ifdef KMP_TDATA_GTID
3960 __kmp_gtid = gtid;
3961#endif
3962 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3963 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3964
3965 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3966 "plain=%u\n",
3967 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3968 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3969 KMP_INIT_BARRIER_STATE));
3970 { // Initialize barrier data.
3971 int b;
3972 for (b = 0; b < bs_last_barrier; ++b) {
3973 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3974#if USE_DEBUGGER
3975 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3976#endif
3977 }
3978 }
3979 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3980 KMP_INIT_BARRIER_STATE);
3981
3982#if KMP_AFFINITY_SUPPORTED
3983 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3984 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3985 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3986 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3987#endif /* KMP_AFFINITY_SUPPORTED */
3988 root_thread->th.th_def_allocator = __kmp_def_allocator;
3989 root_thread->th.th_prev_level = 0;
3990 root_thread->th.th_prev_num_threads = 1;
3991
3992 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3993 tmp->cg_root = root_thread;
3994 tmp->cg_thread_limit = __kmp_cg_max_nth;
3995 tmp->cg_nthreads = 1;
3996 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3997 " cg_nthreads init to 1\n",
3998 root_thread, tmp));
3999 tmp->up = NULL;
4000 root_thread->th.th_cg_roots = tmp;
4001
4002 __kmp_root_counter++;
4003
4004#if OMPT_SUPPORT
4005 if (!initial_thread && ompt_enabled.enabled) {
4006
4007 kmp_info_t *root_thread = ompt_get_thread();
4008
4009 ompt_set_thread_state(root_thread, ompt_state_overhead);
4010
4011 if (ompt_enabled.ompt_callback_thread_begin) {
4012 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4013 ompt_thread_initial, __ompt_get_thread_data_internal());
4014 }
4015 ompt_data_t *task_data;
4016 ompt_data_t *parallel_data;
4017 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4018 NULL);
4019 if (ompt_enabled.ompt_callback_implicit_task) {
4020 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4021 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4022 }
4023
4024 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4025 }
4026#endif
4027#if OMPD_SUPPORT
4028 if (ompd_state & OMPD_ENABLE_BP)
4029 ompd_bp_thread_begin();
4030#endif
4031
4032 KMP_MB();
4033 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4034
4035 return gtid;
4036}
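// Illustrative sketch (not part of the runtime): any "foreign" thread that
// first touches the OpenMP API becomes a new root and passes through
// __kmp_register_root() above. A plausible host program, assuming pthreads:
#if 0
#include <omp.h>
#include <pthread.h>
#include <stdio.h>

static void *foreign_thread(void *arg) {
  (void)arg;
  // First OpenMP use on this native thread registers it as a root.
#pragma omp parallel num_threads(2)
  printf("tid %d in team of foreign root\n", omp_get_thread_num());
  return NULL;
}

int main(void) {
  pthread_t t;
  pthread_create(&t, NULL, foreign_thread, NULL);
  pthread_join(t, NULL);
  return 0;
}
#endif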
4037
4038#if KMP_NESTED_HOT_TEAMS
4039static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4040 const int max_level) {
4041 int i, n, nth;
4042 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4043 if (!hot_teams || !hot_teams[level].hot_team) {
4044 return 0;
4045 }
4046 KMP_DEBUG_ASSERT(level < max_level);
4047 kmp_team_t *team = hot_teams[level].hot_team;
4048 nth = hot_teams[level].hot_team_nth;
4049 n = nth - 1; // primary thread is not freed
4050 if (level < max_level - 1) {
4051 for (i = 0; i < nth; ++i) {
4052 kmp_info_t *th = team->t.t_threads[i];
4053 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4054 if (i > 0 && th->th.th_hot_teams) {
4055 __kmp_free(th->th.th_hot_teams);
4056 th->th.th_hot_teams = NULL;
4057 }
4058 }
4059 }
4060 __kmp_free_team(root, team, NULL);
4061 return n;
4062}
4063#endif
4064
4065// Resets a root thread and clears its root and hot teams.
4066// Returns the number of __kmp_threads entries directly and indirectly freed.
4067static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4068 kmp_team_t *root_team = root->r.r_root_team;
4069 kmp_team_t *hot_team = root->r.r_hot_team;
4070 int n = hot_team->t.t_nproc;
4071 int i;
4072
4073 KMP_DEBUG_ASSERT(!root->r.r_active);
4074
4075 root->r.r_root_team = NULL;
4076 root->r.r_hot_team = NULL;
4077 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4078 // before call to __kmp_free_team().
4079 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4080#if KMP_NESTED_HOT_TEAMS
4081 if (__kmp_hot_teams_max_level >
4082 0) { // need to free nested hot teams and their threads if any
4083 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4084 kmp_info_t *th = hot_team->t.t_threads[i];
4085 if (__kmp_hot_teams_max_level > 1) {
4086 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4087 }
4088 if (th->th.th_hot_teams) {
4089 __kmp_free(th->th.th_hot_teams);
4090 th->th.th_hot_teams = NULL;
4091 }
4092 }
4093 }
4094#endif
4095 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4096
4097 // Before we can reap the thread, we need to make certain that all other
4098 // threads in the teams that had this root as ancestor have stopped trying to
4099 // steal tasks.
4100 if (__kmp_tasking_mode != tskm_immediate_exec) {
4101 __kmp_wait_to_unref_task_teams();
4102 }
4103
4104#if KMP_OS_WINDOWS
4105 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4106 KA_TRACE(
4107 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4108 "\n",
4109 (LPVOID) & (root->r.r_uber_thread->th),
4110 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4111 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4112#endif /* KMP_OS_WINDOWS */
4113
4114#if OMPD_SUPPORT
4115 if (ompd_state & OMPD_ENABLE_BP)
4116 ompd_bp_thread_end();
4117#endif
4118
4119#if OMPT_SUPPORT
4120 ompt_data_t *task_data;
4121 ompt_data_t *parallel_data;
4122 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4123 NULL);
4124 if (ompt_enabled.ompt_callback_implicit_task) {
4125 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4126 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4127 }
4128 if (ompt_enabled.ompt_callback_thread_end) {
4129 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4130 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4131 }
4132#endif
4133
4134 TCW_4(__kmp_nth,
4135 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4136 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4137 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4138 " to %d\n",
4139 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4140 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4141 if (i == 1) {
4142 // need to free contention group structure
4143 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4144 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4145 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4146 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4147 root->r.r_uber_thread->th.th_cg_roots = NULL;
4148 }
4149 __kmp_reap_thread(root->r.r_uber_thread, 1);
4150
4151 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4152 // it instead of freeing it.
4153 root->r.r_uber_thread = NULL;
4154 /* mark root as no longer in use */
4155 root->r.r_begin = FALSE;
4156
4157 return n;
4158}
4159
4160void __kmp_unregister_root_current_thread(int gtid) {
4161 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4162 /* this lock should be ok, since unregister_root_current_thread is never
4163 called during an abort, only during a normal close. furthermore, if you
4164 have the forkjoin lock, you should never try to get the initz lock */
4165 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4166 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4167 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4168 "exiting T#%d\n",
4169 gtid));
4170 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4171 return;
4172 }
4173 kmp_root_t *root = __kmp_root[gtid];
4174
4175 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4176 KMP_ASSERT(KMP_UBER_GTID(gtid));
4177 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4178 KMP_ASSERT(root->r.r_active == FALSE);
4179
4180 KMP_MB();
4181
4182 kmp_info_t *thread = __kmp_threads[gtid];
4183 kmp_team_t *team = thread->th.th_team;
4184 kmp_task_team_t *task_team = thread->th.th_task_team;
4185
4186 // we need to wait for the proxy tasks before finishing the thread
4187 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4188 task_team->tt.tt_hidden_helper_task_encountered)) {
4189#if OMPT_SUPPORT
4190 // the runtime is shutting down so we won't report any events
4191 thread->th.ompt_thread_info.state = ompt_state_undefined;
4192#endif
4193 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4194 }
4195
4196 __kmp_reset_root(gtid, root);
4197
4198 KMP_MB();
4199 KC_TRACE(10,
4200 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4201
4202 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4203}
4204
4205#if KMP_OS_WINDOWS
4206/* __kmp_forkjoin_lock must be already held
4207 Unregisters a root thread that is not the current thread. Returns the number
4208 of __kmp_threads entries freed as a result. */
4209static int __kmp_unregister_root_other_thread(int gtid) {
4210 kmp_root_t *root = __kmp_root[gtid];
4211 int r;
4212
4213 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4214 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4215 KMP_ASSERT(KMP_UBER_GTID(gtid));
4216 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4217 KMP_ASSERT(root->r.r_active == FALSE);
4218
4219 r = __kmp_reset_root(gtid, root);
4220 KC_TRACE(10,
4221 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4222 return r;
4223}
4224#endif
4225
4226#if KMP_DEBUG
4227void __kmp_task_info() {
4228
4229 kmp_int32 gtid = __kmp_entry_gtid();
4230 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4231 kmp_info_t *this_thr = __kmp_threads[gtid];
4232 kmp_team_t *steam = this_thr->th.th_serial_team;
4233 kmp_team_t *team = this_thr->th.th_team;
4234
4235 __kmp_printf(
4236 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4237 "ptask=%p\n",
4238 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4239 team->t.t_implicit_task_taskdata[tid].td_parent);
4240}
4241#endif // KMP_DEBUG
4242
4243/* TODO optimize with one big memclr, take out what isn't needed, split
4244 responsibility to workers as much as possible, and delay initialization of
4245 features as much as possible */
4246static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4247 int tid, int gtid) {
4248 /* this_thr->th.th_info.ds.ds_gtid is setup in
4249 kmp_allocate_thread/create_worker.
4250 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4251 KMP_DEBUG_ASSERT(this_thr != NULL);
4252 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4253 KMP_DEBUG_ASSERT(team);
4254 KMP_DEBUG_ASSERT(team->t.t_threads);
4255 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4256 kmp_info_t *master = team->t.t_threads[0];
4257 KMP_DEBUG_ASSERT(master);
4258 KMP_DEBUG_ASSERT(master->th.th_root);
4259
4260 KMP_MB();
4261
4262 TCW_SYNC_PTR(this_thr->th.th_team, team);
4263
4264 this_thr->th.th_info.ds.ds_tid = tid;
4265 this_thr->th.th_set_nproc = 0;
4266 if (__kmp_tasking_mode != tskm_immediate_exec)
4267 // When tasking is possible, threads are not safe to reap until they are
4268 // done tasking; this will be set when tasking code is exited in wait
4269 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4270 else // no tasking --> always safe to reap
4271 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4272 this_thr->th.th_set_proc_bind = proc_bind_default;
4273#if KMP_AFFINITY_SUPPORTED
4274 this_thr->th.th_new_place = this_thr->th.th_current_place;
4275#endif
4276 this_thr->th.th_root = master->th.th_root;
4277
4278 /* setup the thread's cache of the team structure */
4279 this_thr->th.th_team_nproc = team->t.t_nproc;
4280 this_thr->th.th_team_master = master;
4281 this_thr->th.th_team_serialized = team->t.t_serialized;
4282
4283 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4284
4285 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4286 tid, gtid, this_thr, this_thr->th.th_current_task));
4287
4288 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4289 team, tid, TRUE);
4290
4291 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4292 tid, gtid, this_thr, this_thr->th.th_current_task));
4293 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4294 // __kmp_initialize_team()?
4295
4296 /* TODO no worksharing in speculative threads */
4297 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4298
4299 this_thr->th.th_local.this_construct = 0;
4300
4301 if (!this_thr->th.th_pri_common) {
4302 this_thr->th.th_pri_common =
4303 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4304 if (__kmp_storage_map) {
4305 __kmp_print_storage_map_gtid(
4306 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4307 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4308 }
4309 this_thr->th.th_pri_head = NULL;
4310 }
4311
4312 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4313 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4314 // Make new thread's CG root same as primary thread's
4315 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4316 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4317 if (tmp) {
4318 // worker changes CG, need to check if old CG should be freed
4319 int i = tmp->cg_nthreads--;
4320 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4321 " on node %p of thread %p to %d\n",
4322 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4323 if (i == 1) {
4324 __kmp_free(tmp); // last thread left CG --> free it
4325 }
4326 }
4327 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4328 // Increment new thread's CG root's counter to add the new thread
4329 this_thr->th.th_cg_roots->cg_nthreads++;
4330 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4331 " node %p of thread %p to %d\n",
4332 this_thr, this_thr->th.th_cg_roots,
4333 this_thr->th.th_cg_roots->cg_root,
4334 this_thr->th.th_cg_roots->cg_nthreads));
4335 this_thr->th.th_current_task->td_icvs.thread_limit =
4336 this_thr->th.th_cg_roots->cg_thread_limit;
4337 }
4338
4339 /* Initialize dynamic dispatch */
4340 {
4341 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4342 // Use team max_nproc since this will never change for the team.
4343 size_t disp_size =
4344 sizeof(dispatch_private_info_t) *
4345 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4346 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4347 team->t.t_max_nproc));
4348 KMP_ASSERT(dispatch);
4349 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4350 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4351
4352 dispatch->th_disp_index = 0;
4353 dispatch->th_doacross_buf_idx = 0;
4354 if (!dispatch->th_disp_buffer) {
4355 dispatch->th_disp_buffer =
4356 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4357
4358 if (__kmp_storage_map) {
4359 __kmp_print_storage_map_gtid(
4360 gtid, &dispatch->th_disp_buffer[0],
4361 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4362 ? 1
4363 : __kmp_dispatch_num_buffers],
4364 disp_size,
4365 "th_%d.th_dispatch.th_disp_buffer "
4366 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4367 gtid, team->t.t_id, gtid);
4368 }
4369 } else {
4370 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4371 }
4372
4373 dispatch->th_dispatch_pr_current = 0;
4374 dispatch->th_dispatch_sh_current = 0;
4375
4376 dispatch->th_deo_fcn = 0; /* ORDERED */
4377 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4378 }
4379
4380 this_thr->th.th_next_pool = NULL;
4381
4382 if (!this_thr->th.th_task_state_memo_stack) {
4383 size_t i;
4384 this_thr->th.th_task_state_memo_stack =
4385 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4386 this_thr->th.th_task_state_top = 0;
4387 this_thr->th.th_task_state_stack_sz = 4;
4388 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4389 ++i) // zero init the stack
4390 this_thr->th.th_task_state_memo_stack[i] = 0;
4391 }
4392
4393 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4394 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4395
4396 KMP_MB();
4397}
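// Illustrative sketch (not part of the runtime): the thread_limit ICV copied
// from the contention-group root above is what omp_get_thread_limit()
// reports, so running a program like the one below with OMP_THREAD_LIMIT=4 in
// the environment should print 4 (a plausible usage, stated as an assumption
// rather than a test of this file).
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel
#pragma omp single
  printf("thread limit in this contention group: %d\n",
         omp_get_thread_limit());
  return 0;
}
#endif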
4398
4399/* Allocate a new thread for the requesting team. This is only called from
4400 within a forkjoin critical section. We will first try to get an available
4401 thread from the thread pool. If none is available, we will fork a new one,
4402 assuming we are able to create one; this should be assured, as the
4403 caller should check on this first. */
4404kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4405 int new_tid) {
4406 kmp_team_t *serial_team;
4407 kmp_info_t *new_thr;
4408 int new_gtid;
4409
4410 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4411 KMP_DEBUG_ASSERT(root && team);
4412#if !KMP_NESTED_HOT_TEAMS
4413 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4414#endif
4415 KMP_MB();
4416
4417 /* first, try to get one from the thread pool */
4418 if (__kmp_thread_pool) {
4419 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4420 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4421 if (new_thr == __kmp_thread_pool_insert_pt) {
4422 __kmp_thread_pool_insert_pt = NULL;
4423 }
4424 TCW_4(new_thr->th.th_in_pool, FALSE);
4425 __kmp_suspend_initialize_thread(new_thr);
4426 __kmp_lock_suspend_mx(new_thr);
4427 if (new_thr->th.th_active_in_pool == TRUE) {
4428 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4429 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4430 new_thr->th.th_active_in_pool = FALSE;
4431 }
4432 __kmp_unlock_suspend_mx(new_thr);
4433
4434 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4435 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4436 KMP_ASSERT(!new_thr->th.th_team);
4437 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4438
4439 /* setup the thread structure */
4440 __kmp_initialize_info(new_thr, team, new_tid,
4441 new_thr->th.th_info.ds.ds_gtid);
4442 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4443
4444 TCW_4(__kmp_nth, __kmp_nth + 1);
4445
4446 new_thr->th.th_task_state = 0;
4447 new_thr->th.th_task_state_top = 0;
4448 new_thr->th.th_task_state_stack_sz = 4;
4449
4450 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4451 // Make sure pool thread has transitioned to waiting on own thread struct
4452 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4453 // Thread activated in __kmp_allocate_team when increasing team size
4454 }
4455
4456#ifdef KMP_ADJUST_BLOCKTIME
4457 /* Adjust blocktime back to zero if necessary */
4458 /* Middle initialization might not have occurred yet */
4459 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4460 if (__kmp_nth > __kmp_avail_proc) {
4461 __kmp_zero_bt = TRUE;
4462 }
4463 }
4464#endif /* KMP_ADJUST_BLOCKTIME */
4465
4466#if KMP_DEBUG
4467 // If the thread entered the pool via __kmp_free_thread, wait_flag should
4468 // not be KMP_BARRIER_PARENT_FLAG.
4469 int b;
4470 kmp_balign_t *balign = new_thr->th.th_bar;
4471 for (b = 0; b < bs_last_barrier; ++b)
4472 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4473#endif
4474
4475 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4476 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4477
4478 KMP_MB();
4479 return new_thr;
4480 }
4481
4482 /* no, well fork a new one */
4483 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4484 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4485
4486#if KMP_USE_MONITOR
4487 // If this is the first worker thread the RTL is creating, then also
4488 // launch the monitor thread. We try to do this as early as possible.
4489 if (!TCR_4(__kmp_init_monitor)) {
4490 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4491 if (!TCR_4(__kmp_init_monitor)) {
4492 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4493 TCW_4(__kmp_init_monitor, 1);
4494 __kmp_create_monitor(&__kmp_monitor);
4495 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4496#if KMP_OS_WINDOWS
4497 // AC: wait until monitor has started. This is a fix for CQ232808.
4498 // The reason is that if the library is loaded/unloaded in a loop with
4499 // small (parallel) work in between, then there is a high probability that
4500 // the monitor thread starts only after the library has shut down. At
4501 // shutdown it is too late to cope with the problem, because when the
4502 // primary thread is in DllMain (process detach) the monitor has no chance
4503 // to start (it is blocked), and the primary thread has no way to inform
4504 // the monitor that the library is gone, because all the memory the
4505 // monitor could access is about to be released/reset.
4506 while (TCR_4(__kmp_init_monitor) < 2) {
4507 KMP_YIELD(TRUE);
4508 }
4509 KF_TRACE(10, ("after monitor thread has started\n"));
4510#endif
4511 }
4512 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4513 }
4514#endif
4515
4516 KMP_MB();
4517
4518 {
4519 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4520 ? 1
4521 : __kmp_hidden_helper_threads_num + 1;
4522
4523 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4524 ++new_gtid) {
4525 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4526 }
4527
4528 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4529 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4530 }
4531 }
4532
4533 /* allocate space for it. */
4534 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4535
4536 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4537
4538#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4539 // suppress race conditions detection on synchronization flags in debug mode
4540 // this helps to analyze library internals eliminating false positives
4541 __itt_suppress_mark_range(
4542 __itt_suppress_range, __itt_suppress_threading_errors,
4543 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4544 __itt_suppress_mark_range(
4545 __itt_suppress_range, __itt_suppress_threading_errors,
4546 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4547#if KMP_OS_WINDOWS
4548 __itt_suppress_mark_range(
4549 __itt_suppress_range, __itt_suppress_threading_errors,
4550 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4551#else
4552 __itt_suppress_mark_range(__itt_suppress_range,
4553 __itt_suppress_threading_errors,
4554 &new_thr->th.th_suspend_init_count,
4555 sizeof(new_thr->th.th_suspend_init_count));
4556#endif
4557 // TODO: check if we need to also suppress b_arrived flags
4558 __itt_suppress_mark_range(__itt_suppress_range,
4559 __itt_suppress_threading_errors,
4560 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4561 sizeof(new_thr->th.th_bar[0].bb.b_go));
4562 __itt_suppress_mark_range(__itt_suppress_range,
4563 __itt_suppress_threading_errors,
4564 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4565 sizeof(new_thr->th.th_bar[1].bb.b_go));
4566 __itt_suppress_mark_range(__itt_suppress_range,
4567 __itt_suppress_threading_errors,
4568 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4569 sizeof(new_thr->th.th_bar[2].bb.b_go));
4570#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4571 if (__kmp_storage_map) {
4572 __kmp_print_thread_storage_map(new_thr, new_gtid);
4573 }
4574
4575 // add the reserve serialized team, initialized from the team's primary thread
4576 {
4577 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4578 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4579 new_thr->th.th_serial_team = serial_team =
4580 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4581#if OMPT_SUPPORT
4582 ompt_data_none, // root parallel id
4583#endif
4584 proc_bind_default, &r_icvs,
4585 0 USE_NESTED_HOT_ARG(NULL));
4586 }
4587 KMP_ASSERT(serial_team);
4588 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4589 // execution (it is unused for now).
4590 serial_team->t.t_threads[0] = new_thr;
4591 KF_TRACE(10,
4592 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4593 new_thr));
4594
4595 /* setup the thread structures */
4596 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4597
4598#if USE_FAST_MEMORY
4599 __kmp_initialize_fast_memory(new_thr);
4600#endif /* USE_FAST_MEMORY */
4601
4602#if KMP_USE_BGET
4603 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4604 __kmp_initialize_bget(new_thr);
4605#endif
4606
4607 __kmp_init_random(new_thr); // Initialize random number generator
4608
4609 /* Initialize these only once when thread is grabbed for a team allocation */
4610 KA_TRACE(20,
4611 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4612 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4613
4614 int b;
4615 kmp_balign_t *balign = new_thr->th.th_bar;
4616 for (b = 0; b < bs_last_barrier; ++b) {
4617 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4618 balign[b].bb.team = NULL;
4619 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4620 balign[b].bb.use_oncore_barrier = 0;
4621 }
4622
4623 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4624 new_thr->th.th_sleep_loc_type = flag_unset;
4625
4626 new_thr->th.th_spin_here = FALSE;
4627 new_thr->th.th_next_waiting = 0;
4628#if KMP_OS_UNIX
4629 new_thr->th.th_blocking = false;
4630#endif
4631
4632#if KMP_AFFINITY_SUPPORTED
4633 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4634 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4635 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4636 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4637#endif
4638 new_thr->th.th_def_allocator = __kmp_def_allocator;
4639 new_thr->th.th_prev_level = 0;
4640 new_thr->th.th_prev_num_threads = 1;
4641
4642 TCW_4(new_thr->th.th_in_pool, FALSE);
4643 new_thr->th.th_active_in_pool = FALSE;
4644 TCW_4(new_thr->th.th_active, TRUE);
4645
4646 /* adjust the global counters */
4647 __kmp_all_nth++;
4648 __kmp_nth++;
4649
4650 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for
4651 // low thread counts, and method #2 (keyed API call) for higher thread counts.
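 // Descriptive note (not part of the original source): mode 1 resolves the
 // gtid by matching the current stack address against registered thread
 // stacks, which stays cheap only while few threads exist; mode 2 falls back
 // to the thread-specific-data (TLS key) lookup, which scales better with
 // many threads.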
4652 if (__kmp_adjust_gtid_mode) {
4653 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4654 if (TCR_4(__kmp_gtid_mode) != 2) {
4655 TCW_4(__kmp_gtid_mode, 2);
4656 }
4657 } else {
4658 if (TCR_4(__kmp_gtid_mode) != 1) {
4659 TCW_4(__kmp_gtid_mode, 1);
4660 }
4661 }
4662 }
4663
4664#ifdef KMP_ADJUST_BLOCKTIME
4665 /* Adjust blocktime back to zero if necessary */
4666 /* Middle initialization might not have occurred yet */
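 /* Descriptive note (not part of the original source): if the user did not
 set a blocktime and the machine is now oversubscribed
 (__kmp_nth > __kmp_avail_proc), blocktime is forced to zero so idle
 threads sleep instead of spinning. */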
4667 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4668 if (__kmp_nth > __kmp_avail_proc) {
4669 __kmp_zero_bt = TRUE;
4670 }
4671 }
4672#endif /* KMP_ADJUST_BLOCKTIME */
4673
4674 /* actually fork it and create the new worker thread */
4675 KF_TRACE(
4676 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4677 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4678 KF_TRACE(10,
4679 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4680
4681 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4682 new_gtid));
4683 KMP_MB();
4684 return new_thr;
4685}
4686
4687/* Reinitialize team for reuse.
4688 The hot team code calls this routine at every fork barrier, so the EPCC
4689 barrier tests are extremely sensitive to changes in it, especially writes
4690 to the team struct, which cause a cache invalidation in all threads.
4691 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4692static void __kmp_reinitialize_team(kmp_team_t *team,
4693 kmp_internal_control_t *new_icvs,
4694 ident_t *loc) {
4695 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4696 team->t.t_threads[0], team));
4697 KMP_DEBUG_ASSERT(team && new_icvs);
4698 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4699 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4700
4701 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4702 // Copy ICVs to the primary thread's implicit taskdata
4703 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4704 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4705
4706 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4707 team->t.t_threads[0], team));
4708}
4709
4710/* Initialize the team data structure.
4711 This assumes the t_threads and t_max_nproc are already set.
4712 Also, we don't touch the arguments */
4713static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4714 kmp_internal_control_t *new_icvs,
4715 ident_t *loc) {
4716 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4717
4718 /* verify */
4719 KMP_DEBUG_ASSERT(team);
4720 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4721 KMP_DEBUG_ASSERT(team->t.t_threads);
4722 KMP_MB();
4723
4724 team->t.t_master_tid = 0; /* not needed */
4725 /* team->t.t_master_bar; not needed */
4726 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4727 team->t.t_nproc = new_nproc;
4728
4729 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4730 team->t.t_next_pool = NULL;
4731 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4732 * up hot team */
4733
4734 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4735 team->t.t_invoke = NULL; /* not needed */
4736
4737 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4738 team->t.t_sched.sched = new_icvs->sched.sched;
4739
4740#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4741 team->t.t_fp_control_saved = FALSE; /* not needed */
4742 team->t.t_x87_fpu_control_word = 0; /* not needed */
4743 team->t.t_mxcsr = 0; /* not needed */
4744#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4745
4746 team->t.t_construct = 0;
4747
4748 team->t.t_ordered.dt.t_value = 0;
4749 team->t.t_master_active = FALSE;
4750
4751#ifdef KMP_DEBUG
4752 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4753#endif
4754#if KMP_OS_WINDOWS
4755 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4756#endif
4757
4758 team->t.t_control_stack_top = NULL;
4759
4760 __kmp_reinitialize_team(team, new_icvs, loc);
4761
4762 KMP_MB();
4763 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4764}
4765
4766#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4767/* Sets the full mask for the thread and saves the old mask via old_mask (if non-NULL); no changes to structures. */
4768static void
4769__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4770 if (KMP_AFFINITY_CAPABLE()) {
4771 int status;
4772 if (old_mask != NULL) {
4773 status = __kmp_get_system_affinity(old_mask, TRUE);
4774 int error = errno;
4775 if (status != 0) {
4776 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4777 __kmp_msg_null);
4778 }
4779 }
4780 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4781 }
4782}
4783#endif
4784
4785#if KMP_AFFINITY_SUPPORTED
4786
4787// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4788// It calculates the worker + primary thread's partition based upon the parent
4789// thread's partition, and binds each worker to a place in its partition.
4790// The primary thread's partition should already include its current binding.
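// Illustrative example (not part of the original source): assume the primary
// thread's partition is places [0,3] (num_masks == 4) and it currently sits on
// place 0.
//   proc_bind_primary: every worker gets th_new_place = 0.
//   proc_bind_close, 4 threads: threads 0..3 land on places 0,1,2,3.
//   proc_bind_close, 8 threads: S = 8/4 = 2, so two threads share each place.
//   proc_bind_spread, 2 threads: the partition is split into [0,1] and [2,3],
//     and each thread is bound to the first place of its own sub-partition.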
4791static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4792 // Do not partition places for the hidden helper team
4793 if (KMP_HIDDEN_HELPER_TEAM(team))
4794 return;
4795 // Copy the primary thread's place partition to the team struct
4796 kmp_info_t *master_th = team->t.t_threads[0];
4797 KMP_DEBUG_ASSERT(master_th != NULL);
4798 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4799 int first_place = master_th->th.th_first_place;
4800 int last_place = master_th->th.th_last_place;
4801 int masters_place = master_th->th.th_current_place;
4802 int num_masks = __kmp_affinity.num_masks;
4803 team->t.t_first_place = first_place;
4804 team->t.t_last_place = last_place;
4805
4806 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4807 "bound to place %d partition = [%d,%d]\n",
4808 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4809 team->t.t_id, masters_place, first_place, last_place));
4810
4811 switch (proc_bind) {
4812
4813 case proc_bind_default:
4814 // Serial teams might have the proc_bind policy set to proc_bind_default.
4815 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4816 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4817 break;
4818
4819 case proc_bind_primary: {
4820 int f;
4821 int n_th = team->t.t_nproc;
4822 for (f = 1; f < n_th; f++) {
4823 kmp_info_t *th = team->t.t_threads[f];
4824 KMP_DEBUG_ASSERT(th != NULL);
4825 th->th.th_first_place = first_place;
4826 th->th.th_last_place = last_place;
4827 th->th.th_new_place = masters_place;
4828 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4829 team->t.t_display_affinity != 1) {
4830 team->t.t_display_affinity = 1;
4831 }
4832
4833 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4834 "partition = [%d,%d]\n",
4835 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4836 f, masters_place, first_place, last_place));
4837 }
4838 } break;
4839
4840 case proc_bind_close: {
4841 int f;
4842 int n_th = team->t.t_nproc;
4843 int n_places;
4844 if (first_place <= last_place) {
4845 n_places = last_place - first_place + 1;
4846 } else {
4847 n_places = num_masks - first_place + last_place + 1;
4848 }
4849 if (n_th <= n_places) {
4850 int place = masters_place;
4851 for (f = 1; f < n_th; f++) {
4852 kmp_info_t *th = team->t.t_threads[f];
4853 KMP_DEBUG_ASSERT(th != NULL);
4854
4855 if (place == last_place) {
4856 place = first_place;
4857 } else if (place == (num_masks - 1)) {
4858 place = 0;
4859 } else {
4860 place++;
4861 }
4862 th->th.th_first_place = first_place;
4863 th->th.th_last_place = last_place;
4864 th->th.th_new_place = place;
4865 if (__kmp_display_affinity && place != th->th.th_current_place &&
4866 team->t.t_display_affinity != 1) {
4867 team->t.t_display_affinity = 1;
4868 }
4869
4870 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4871 "partition = [%d,%d]\n",
4872 __kmp_gtid_from_thread(team->t.t_threads[f]),
4873 team->t.t_id, f, place, first_place, last_place));
4874 }
4875 } else {
4876 int S, rem, gap, s_count;
4877 S = n_th / n_places;
4878 s_count = 0;
4879 rem = n_th - (S * n_places);
4880 gap = rem > 0 ? n_places / rem : n_places;
4881 int place = masters_place;
4882 int gap_ct = gap;
4883 for (f = 0; f < n_th; f++) {
4884 kmp_info_t *th = team->t.t_threads[f];
4885 KMP_DEBUG_ASSERT(th != NULL);
4886
4887 th->th.th_first_place = first_place;
4888 th->th.th_last_place = last_place;
4889 th->th.th_new_place = place;
4890 if (__kmp_display_affinity && place != th->th.th_current_place &&
4891 team->t.t_display_affinity != 1) {
4892 team->t.t_display_affinity = 1;
4893 }
4894 s_count++;
4895
4896 if ((s_count == S) && rem && (gap_ct == gap)) {
4897 // do nothing, add an extra thread to place on next iteration
4898 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4899 // we added an extra thread to this place; move to next place
4900 if (place == last_place) {
4901 place = first_place;
4902 } else if (place == (num_masks - 1)) {
4903 place = 0;
4904 } else {
4905 place++;
4906 }
4907 s_count = 0;
4908 gap_ct = 1;
4909 rem--;
4910 } else if (s_count == S) { // place full; don't add extra
4911 if (place == last_place) {
4912 place = first_place;
4913 } else if (place == (num_masks - 1)) {
4914 place = 0;
4915 } else {
4916 place++;
4917 }
4918 gap_ct++;
4919 s_count = 0;
4920 }
4921
4922 KA_TRACE(100,
4923 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4924 "partition = [%d,%d]\n",
4925 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4926 th->th.th_new_place, first_place, last_place));
4927 }
4928 KMP_DEBUG_ASSERT(place == masters_place);
4929 }
4930 } break;
4931
4932 case proc_bind_spread: {
4933 int f;
4934 int n_th = team->t.t_nproc;
4935 int n_places;
4936 int thidx;
4937 if (first_place <= last_place) {
4938 n_places = last_place - first_place + 1;
4939 } else {
4940 n_places = num_masks - first_place + last_place + 1;
4941 }
4942 if (n_th <= n_places) {
4943 int place = -1;
4944
4945 if (n_places != num_masks) {
4946 int S = n_places / n_th;
4947 int s_count, rem, gap, gap_ct;
4948
4949 place = masters_place;
4950 rem = n_places - n_th * S;
4951 gap = rem ? n_th / rem : 1;
4952 gap_ct = gap;
4953 thidx = n_th;
4954 if (update_master_only == 1)
4955 thidx = 1;
4956 for (f = 0; f < thidx; f++) {
4957 kmp_info_t *th = team->t.t_threads[f];
4958 KMP_DEBUG_ASSERT(th != NULL);
4959
4960 th->th.th_first_place = place;
4961 th->th.th_new_place = place;
4962 if (__kmp_display_affinity && place != th->th.th_current_place &&
4963 team->t.t_display_affinity != 1) {
4964 team->t.t_display_affinity = 1;
4965 }
4966 s_count = 1;
4967 while (s_count < S) {
4968 if (place == last_place) {
4969 place = first_place;
4970 } else if (place == (num_masks - 1)) {
4971 place = 0;
4972 } else {
4973 place++;
4974 }
4975 s_count++;
4976 }
4977 if (rem && (gap_ct == gap)) {
4978 if (place == last_place) {
4979 place = first_place;
4980 } else if (place == (num_masks - 1)) {
4981 place = 0;
4982 } else {
4983 place++;
4984 }
4985 rem--;
4986 gap_ct = 0;
4987 }
4988 th->th.th_last_place = place;
4989 gap_ct++;
4990
4991 if (place == last_place) {
4992 place = first_place;
4993 } else if (place == (num_masks - 1)) {
4994 place = 0;
4995 } else {
4996 place++;
4997 }
4998
4999 KA_TRACE(100,
5000 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5001 "partition = [%d,%d], num_masks: %u\n",
5002 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5003 f, th->th.th_new_place, th->th.th_first_place,
5004 th->th.th_last_place, num_masks));
5005 }
5006 } else {
5007 /* With a uniform space of available computation places we can create T
5008 partitions of roughly P/T places each and put each thread into the first
5009 place of its partition. */
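 /* Illustrative example (not part of the original source): with P = 8
 places, T = 3 threads and masters_place = 0, spacing is 9/3 = 3.0, so
 the partitions become [0,2], [3,5] and [6,7] (the last one clamped to
 n_places - 1), and threads 0..2 are bound to places 0, 3 and 6
 respectively. */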
5010 double current = static_cast<double>(masters_place);
5011 double spacing =
5012 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5013 int first, last;
5014 kmp_info_t *th;
5015
5016 thidx = n_th + 1;
5017 if (update_master_only == 1)
5018 thidx = 1;
5019 for (f = 0; f < thidx; f++) {
5020 first = static_cast<int>(current);
5021 last = static_cast<int>(current + spacing) - 1;
5022 KMP_DEBUG_ASSERT(last >= first);
5023 if (first >= n_places) {
5024 if (masters_place) {
5025 first -= n_places;
5026 last -= n_places;
5027 if (first == (masters_place + 1)) {
5028 KMP_DEBUG_ASSERT(f == n_th);
5029 first--;
5030 }
5031 if (last == masters_place) {
5032 KMP_DEBUG_ASSERT(f == (n_th - 1));
5033 last--;
5034 }
5035 } else {
5036 KMP_DEBUG_ASSERT(f == n_th);
5037 first = 0;
5038 last = 0;
5039 }
5040 }
5041 if (last >= n_places) {
5042 last = (n_places - 1);
5043 }
5044 place = first;
5045 current += spacing;
5046 if (f < n_th) {
5047 KMP_DEBUG_ASSERT(0 <= first);
5048 KMP_DEBUG_ASSERT(n_places > first);
5049 KMP_DEBUG_ASSERT(0 <= last);
5050 KMP_DEBUG_ASSERT(n_places > last);
5051 KMP_DEBUG_ASSERT(last_place >= first_place);
5052 th = team->t.t_threads[f];
5053 KMP_DEBUG_ASSERT(th);
5054 th->th.th_first_place = first;
5055 th->th.th_new_place = place;
5056 th->th.th_last_place = last;
5057 if (__kmp_display_affinity && place != th->th.th_current_place &&
5058 team->t.t_display_affinity != 1) {
5059 team->t.t_display_affinity = 1;
5060 }
5061 KA_TRACE(100,
5062 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5063 "partition = [%d,%d], spacing = %.4f\n",
5064 __kmp_gtid_from_thread(team->t.t_threads[f]),
5065 team->t.t_id, f, th->th.th_new_place,
5066 th->th.th_first_place, th->th.th_last_place, spacing));
5067 }
5068 }
5069 }
5070 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5071 } else {
5072 int S, rem, gap, s_count;
5073 S = n_th / n_places;
5074 s_count = 0;
5075 rem = n_th - (S * n_places);
5076 gap = rem > 0 ? n_places / rem : n_places;
5077 int place = masters_place;
5078 int gap_ct = gap;
5079 thidx = n_th;
5080 if (update_master_only == 1)
5081 thidx = 1;
5082 for (f = 0; f < thidx; f++) {
5083 kmp_info_t *th = team->t.t_threads[f];
5084 KMP_DEBUG_ASSERT(th != NULL);
5085
5086 th->th.th_first_place = place;
5087 th->th.th_last_place = place;
5088 th->th.th_new_place = place;
5089 if (__kmp_display_affinity && place != th->th.th_current_place &&
5090 team->t.t_display_affinity != 1) {
5091 team->t.t_display_affinity = 1;
5092 }
5093 s_count++;
5094
5095 if ((s_count == S) && rem && (gap_ct == gap)) {
5096 // do nothing, add an extra thread to place on next iteration
5097 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5098 // we added an extra thread to this place; move on to next place
5099 if (place == last_place) {
5100 place = first_place;
5101 } else if (place == (num_masks - 1)) {
5102 place = 0;
5103 } else {
5104 place++;
5105 }
5106 s_count = 0;
5107 gap_ct = 1;
5108 rem--;
5109 } else if (s_count == S) { // place is full; don't add extra thread
5110 if (place == last_place) {
5111 place = first_place;
5112 } else if (place == (num_masks - 1)) {
5113 place = 0;
5114 } else {
5115 place++;
5116 }
5117 gap_ct++;
5118 s_count = 0;
5119 }
5120
5121 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5122 "partition = [%d,%d]\n",
5123 __kmp_gtid_from_thread(team->t.t_threads[f]),
5124 team->t.t_id, f, th->th.th_new_place,
5125 th->th.th_first_place, th->th.th_last_place));
5126 }
5127 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5128 }
5129 } break;
5130
5131 default:
5132 break;
5133 }
5134
5135 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5136}
5137
5138#endif // KMP_AFFINITY_SUPPORTED
5139
5140/* allocate a new team data structure to use. take one off of the free pool if
5141 available */
5142kmp_team_t *
5143__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5144#if OMPT_SUPPORT
5145 ompt_data_t ompt_parallel_data,
5146#endif
5147 kmp_proc_bind_t new_proc_bind,
5148 kmp_internal_control_t *new_icvs,
5149 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5150 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5151 int f;
5152 kmp_team_t *team;
5153 int use_hot_team = !root->r.r_active;
5154 int level = 0;
5155 int do_place_partition = 1;
5156
5157 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5158 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5159 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5160 KMP_MB();
5161
5162#if KMP_NESTED_HOT_TEAMS
5163 kmp_hot_team_ptr_t *hot_teams;
5164 if (master) {
5165 team = master->th.th_team;
5166 level = team->t.t_active_level;
5167 if (master->th.th_teams_microtask) { // in teams construct?
5168 if (master->th.th_teams_size.nteams > 1 &&
5169 ( // #teams > 1
5170 team->t.t_pkfn ==
5171 (microtask_t)__kmp_teams_master || // inner fork of the teams
5172 master->th.th_teams_level <
5173 team->t.t_level)) { // or nested parallel inside the teams
5174 ++level; // do not increment if #teams==1 or for the outer fork of the
5175 // teams; increment otherwise
5176 }
5177 // Do not perform the place partition for the inner fork of the teams
5178 // construct; wait until a nested parallel region is encountered inside it
5179 if ((master->th.th_teams_size.nteams == 1 &&
5180 master->th.th_teams_level >= team->t.t_level) ||
5181 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5182 do_place_partition = 0;
5183 }
5184 hot_teams = master->th.th_hot_teams;
5185 if (level < __kmp_hot_teams_max_level && hot_teams &&
5186 hot_teams[level].hot_team) {
5187 // hot team has already been allocated for given level
5188 use_hot_team = 1;
5189 } else {
5190 use_hot_team = 0;
5191 }
5192 } else {
5193 // check we won't access uninitialized hot_teams, just in case
5194 KMP_DEBUG_ASSERT(new_nproc == 1);
5195 }
5196#endif
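 // Descriptive note (not part of the original source): at this point
 // use_hot_team says whether the (possibly nested) hot team for this level can
 // be reused; otherwise a team is taken from the team pool or freshly
 // allocated further below.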
5197 // Optimization to use a "hot" team
5198 if (use_hot_team && new_nproc > 1) {
5199 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5200#if KMP_NESTED_HOT_TEAMS
5201 team = hot_teams[level].hot_team;
5202#else
5203 team = root->r.r_hot_team;
5204#endif
5205#if KMP_DEBUG
5206 if (__kmp_tasking_mode != tskm_immediate_exec) {
5207 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5208 "task_team[1] = %p before reinit\n",
5209 team->t.t_task_team[0], team->t.t_task_team[1]));
5210 }
5211#endif
5212
5213 if (team->t.t_nproc != new_nproc &&
5214 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5215 // Distributed barrier may need a resize
5216 int old_nthr = team->t.t_nproc;
5217 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5218 }
5219
5220 // If not doing the place partition, then reset the team's proc bind
5221 // to indicate that partitioning of all threads still needs to take place
5222 if (do_place_partition == 0)
5223 team->t.t_proc_bind = proc_bind_default;
5224 // Has the number of threads changed?
5225 /* Let's assume the most common case is that the number of threads is
5226 unchanged, and put that case first. */
5227 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5228 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5229 // This case can mean that omp_set_num_threads() was called and the hot
5230 // team size was already reduced, so we check the special flag
5231 if (team->t.t_size_changed == -1) {
5232 team->t.t_size_changed = 1;
5233 } else {
5234 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5235 }
5236
5237 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5238 kmp_r_sched_t new_sched = new_icvs->sched;
5239 // set primary thread's schedule as new run-time schedule
5240 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5241
5242 __kmp_reinitialize_team(team, new_icvs,
5243 root->r.r_uber_thread->th.th_ident);
5244
5245 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5246 team->t.t_threads[0], team));
5247 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5248
5249#if KMP_AFFINITY_SUPPORTED
5250 if ((team->t.t_size_changed == 0) &&
5251 (team->t.t_proc_bind == new_proc_bind)) {
5252 if (new_proc_bind == proc_bind_spread) {
5253 if (do_place_partition) {
5254 // add flag to update only master for spread
5255 __kmp_partition_places(team, 1);
5256 }
5257 }
5258 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5259 "proc_bind = %d, partition = [%d,%d]\n",
5260 team->t.t_id, new_proc_bind, team->t.t_first_place,
5261 team->t.t_last_place));
5262 } else {
5263 if (do_place_partition) {
5264 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5265 __kmp_partition_places(team);
5266 }
5267 }
5268#else
5269 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5270#endif /* KMP_AFFINITY_SUPPORTED */
5271 } else if (team->t.t_nproc > new_nproc) {
5272 KA_TRACE(20,
5273 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5274 new_nproc));
5275
5276 team->t.t_size_changed = 1;
5277 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5278 // Barrier size already reduced earlier in this function
5279 // Activate team threads via th_used_in_team
5280 __kmp_add_threads_to_team(team, new_nproc);
5281 }
5282#if KMP_NESTED_HOT_TEAMS
5283 if (__kmp_hot_teams_mode == 0) {
5284 // AC: the saved number of threads should correspond to the team's value in
5285 // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5286 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287 hot_teams[level].hot_team_nth = new_nproc;
5288#endif // KMP_NESTED_HOT_TEAMS
5289 /* release the extra threads we don't need any more */
5290 for (f = new_nproc; f < team->t.t_nproc; f++) {
5291 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292 if (__kmp_tasking_mode != tskm_immediate_exec) {
5293 // When decreasing team size, threads no longer in the team should
5294 // unref task team.
5295 team->t.t_threads[f]->th.th_task_team = NULL;
5296 }
5297 __kmp_free_thread(team->t.t_threads[f]);
5298 team->t.t_threads[f] = NULL;
5299 }
5300#if KMP_NESTED_HOT_TEAMS
5301 } // (__kmp_hot_teams_mode == 0)
5302 else {
5303 // When keeping extra threads in the team, switch them to wait on their own
5304 // b_go flag
5305 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5306 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5307 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5308 for (int b = 0; b < bs_last_barrier; ++b) {
5309 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5310 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5311 }
5312 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5313 }
5314 }
5315 }
5316#endif // KMP_NESTED_HOT_TEAMS
5317 team->t.t_nproc = new_nproc;
5318 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5319 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5320 __kmp_reinitialize_team(team, new_icvs,
5321 root->r.r_uber_thread->th.th_ident);
5322
5323 // Update remaining threads
5324 for (f = 0; f < new_nproc; ++f) {
5325 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5326 }
5327
5328 // restore the current task state of the primary thread: should be the
5329 // implicit task
5330 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5331 team->t.t_threads[0], team));
5332
5333 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5334
5335#ifdef KMP_DEBUG
5336 for (f = 0; f < team->t.t_nproc; f++) {
5337 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5338 team->t.t_threads[f]->th.th_team_nproc ==
5339 team->t.t_nproc);
5340 }
5341#endif
5342
5343 if (do_place_partition) {
5344 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5345#if KMP_AFFINITY_SUPPORTED
5346 __kmp_partition_places(team);
5347#endif
5348 }
5349 } else { // team->t.t_nproc < new_nproc
5350#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5351 kmp_affin_mask_t *old_mask;
5352 if (KMP_AFFINITY_CAPABLE()) {
5353 KMP_CPU_ALLOC(old_mask);
5354 }
5355#endif
5356
5357 KA_TRACE(20,
5358 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5359 new_nproc));
5360 int old_nproc = team->t.t_nproc; // save old value; used to update only the new threads below
5361 team->t.t_size_changed = 1;
5362
5363#if KMP_NESTED_HOT_TEAMS
5364 int avail_threads = hot_teams[level].hot_team_nth;
5365 if (new_nproc < avail_threads)
5366 avail_threads = new_nproc;
5367 kmp_info_t **other_threads = team->t.t_threads;
5368 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5369 // Adjust barrier data of reserved threads (if any) of the team
5370 // Other data will be set in __kmp_initialize_info() below.
5371 int b;
5372 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5373 for (b = 0; b < bs_last_barrier; ++b) {
5374 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5375 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5376#if USE_DEBUGGER
5377 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5378#endif
5379 }
5380 }
5381 if (hot_teams[level].hot_team_nth >= new_nproc) {
5382 // we have all needed threads in reserve, no need to allocate any;
5383 // this is only possible in mode 1, as there can be no reserved threads in mode 0
5384 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5385 team->t.t_nproc = new_nproc; // just get reserved threads involved
5386 } else {
5387 // We may have some threads in reserve, but not enough;
5388 // get reserved threads involved if any.
5389 team->t.t_nproc = hot_teams[level].hot_team_nth;
5390 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5391#endif // KMP_NESTED_HOT_TEAMS
5392 if (team->t.t_max_nproc < new_nproc) {
5393 /* reallocate larger arrays */
5394 __kmp_reallocate_team_arrays(team, new_nproc);
5395 __kmp_reinitialize_team(team, new_icvs, NULL);
5396 }
5397
5398#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5399 /* Temporarily set full mask for primary thread before creation of
5400 workers. The reason is that workers inherit the affinity from the
5401 primary thread, so if a lot of workers are created on a single
5402 core quickly, they don't get a chance to set their own affinity for
5403 a long time. */
5404 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5405#endif
5406
5407 /* allocate new threads for the hot team */
5408 for (f = team->t.t_nproc; f < new_nproc; f++) {
5409 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5410 KMP_DEBUG_ASSERT(new_worker);
5411 team->t.t_threads[f] = new_worker;
5412
5413 KA_TRACE(20,
5414 ("__kmp_allocate_team: team %d init T#%d arrived: "
5415 "join=%llu, plain=%llu\n",
5416 team->t.t_id, __kmp_gtid_from_tid(f, team),
5417 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5418 team->t.t_bar[bs_plain_barrier].b_arrived));
5419
5420 { // Initialize barrier data for new threads.
5421 int b;
5422 kmp_balign_t *balign = new_worker->th.th_bar;
5423 for (b = 0; b < bs_last_barrier; ++b) {
5424 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5425 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5426 KMP_BARRIER_PARENT_FLAG);
5427#if USE_DEBUGGER
5428 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5429#endif
5430 }
5431 }
5432 }
5433
5434#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5435 if (KMP_AFFINITY_CAPABLE()) {
5436 /* Restore initial primary thread's affinity mask */
5437 __kmp_set_system_affinity(old_mask, TRUE);
5438 KMP_CPU_FREE(old_mask);
5439 }
5440#endif
5441#if KMP_NESTED_HOT_TEAMS
5442 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5443#endif // KMP_NESTED_HOT_TEAMS
5444 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5445 // Barrier size already increased earlier in this function
5446 // Activate team threads via th_used_in_team
5447 __kmp_add_threads_to_team(team, new_nproc);
5448 }
5449 /* make sure everyone is synchronized */
5450 // new threads are initialized below
5451 __kmp_initialize_team(team, new_nproc, new_icvs,
5452 root->r.r_uber_thread->th.th_ident);
5453
5454 /* reinitialize the threads */
5455 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5456 for (f = 0; f < team->t.t_nproc; ++f)
5457 __kmp_initialize_info(team->t.t_threads[f], team, f,
5458 __kmp_gtid_from_tid(f, team));
5459
5460 // set th_task_state for the new threads in the hot team to an existing thread's state
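 // Descriptive note (assumption, not part of the original source):
 // th_task_state selects which of the two task teams a thread uses and flips
 // across barrier phases, so newly added workers copy it from a thread that is
 // already in the team to stay in step.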
5461 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5462 for (f = old_nproc; f < team->t.t_nproc; ++f)
5463 team->t.t_threads[f]->th.th_task_state = old_state;
5464
5465#ifdef KMP_DEBUG
5466 for (f = 0; f < team->t.t_nproc; ++f) {
5467 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5468 team->t.t_threads[f]->th.th_team_nproc ==
5469 team->t.t_nproc);
5470 }
5471#endif
5472
5473 if (do_place_partition) {
5474 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5475#if KMP_AFFINITY_SUPPORTED
5476 __kmp_partition_places(team);
5477#endif
5478 }
5479 } // Check changes in number of threads
5480
5481 kmp_info_t *master = team->t.t_threads[0];
5482 if (master->th.th_teams_microtask) {
5483 for (f = 1; f < new_nproc; ++f) {
5484 // propagate teams construct specific info to workers
5485 kmp_info_t *thr = team->t.t_threads[f];
5486 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5487 thr->th.th_teams_level = master->th.th_teams_level;
5488 thr->th.th_teams_size = master->th.th_teams_size;
5489 }
5490 }
5491#if KMP_NESTED_HOT_TEAMS
5492 if (level) {
5493 // Sync barrier state for nested hot teams, not needed for outermost hot
5494 // team.
5495 for (f = 1; f < new_nproc; ++f) {
5496 kmp_info_t *thr = team->t.t_threads[f];
5497 int b;
5498 kmp_balign_t *balign = thr->th.th_bar;
5499 for (b = 0; b < bs_last_barrier; ++b) {
5500 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5501 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5502#if USE_DEBUGGER
5503 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5504#endif
5505 }
5506 }
5507 }
5508#endif // KMP_NESTED_HOT_TEAMS
5509
5510 /* reallocate space for arguments if necessary */
5511 __kmp_alloc_argv_entries(argc, team, TRUE);
5512 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5513 // The hot team re-uses the previous task team,
5514 // if untouched during the previous release->gather phase.
5515
5516 KF_TRACE(10, (" hot_team = %p\n", team));
5517
5518#if KMP_DEBUG
5519 if (__kmp_tasking_mode != tskm_immediate_exec) {
5520 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5521 "task_team[1] = %p after reinit\n",
5522 team->t.t_task_team[0], team->t.t_task_team[1]));
5523 }
5524#endif
5525
5526#if OMPT_SUPPORT
5527 __ompt_team_assign_id(team, ompt_parallel_data);
5528#endif
5529
5530 KMP_MB();
5531
5532 return team;
5533 }
5534
5535 /* next, let's try to take one from the team pool */
5536 KMP_MB();
5537 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5538 /* TODO: consider resizing undersized teams instead of reaping them, now
5539 that we have a resizing mechanism */
5540 if (team->t.t_max_nproc >= max_nproc) {
5541 /* take this team from the team pool */
5542 __kmp_team_pool = team->t.t_next_pool;
5543
5544 if (max_nproc > 1 &&
5545 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5546 if (!team->t.b) { // Allocate barrier structure
5547 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5548 }
5549 }
5550
5551 /* setup the team for fresh use */
5552 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5553
5554 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5555 "task_team[1] %p to NULL\n",
5556 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5557 team->t.t_task_team[0] = NULL;
5558 team->t.t_task_team[1] = NULL;
5559
5560 /* reallocate space for arguments if necessary */
5561 __kmp_alloc_argv_entries(argc, team, TRUE);
5562 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5563
5564 KA_TRACE(
5565 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5566 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5567 { // Initialize barrier data.
5568 int b;
5569 for (b = 0; b < bs_last_barrier; ++b) {
5570 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5571#if USE_DEBUGGER
5572 team->t.t_bar[b].b_master_arrived = 0;
5573 team->t.t_bar[b].b_team_arrived = 0;
5574#endif
5575 }
5576 }
5577
5578 team->t.t_proc_bind = new_proc_bind;
5579
5580 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5581 team->t.t_id));
5582
5583#if OMPT_SUPPORT
5584 __ompt_team_assign_id(team, ompt_parallel_data);
5585#endif
5586
5587 KMP_MB();
5588
5589 return team;
5590 }
5591
5592 /* reap the team if it is too small, then loop back and check the next one */
5593 // Not sure if this is wise, but it will be redone during the hot-teams
5594 // rewrite.
5595 /* TODO: Use technique to find the right size hot-team, don't reap them */
5596 team = __kmp_reap_team(team);
5597 __kmp_team_pool = team;
5598 }
5599
5600 /* nothing available in the pool, no matter, make a new team! */
5601 KMP_MB();
5602 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5603
5604 /* and set it up */
5605 team->t.t_max_nproc = max_nproc;
5606 if (max_nproc > 1 &&
5607 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5608 // Allocate barrier structure
5609 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5610 }
5611
5612 /* NOTE well: for some reason allocating one big buffer and dividing it up
5613 seems to really hurt performance a lot on the P4, so let's not use this */
5614 __kmp_allocate_team_arrays(team, max_nproc);
5615
5616 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5617 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5618
5619 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5620 "%p to NULL\n",
5621 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5622 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5623 // memory, no need to duplicate
5624 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5625 // memory, no need to duplicate
5626
5627 if (__kmp_storage_map) {
5628 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5629 }
5630
5631 /* allocate space for arguments */
5632 __kmp_alloc_argv_entries(argc, team, FALSE);
5633 team->t.t_argc = argc;
5634
5635 KA_TRACE(20,
5636 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5637 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5638 { // Initialize barrier data.
5639 int b;
5640 for (b = 0; b < bs_last_barrier; ++b) {
5641 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5642#if USE_DEBUGGER
5643 team->t.t_bar[b].b_master_arrived = 0;
5644 team->t.t_bar[b].b_team_arrived = 0;
5645#endif
5646 }
5647 }
5648
5649 team->t.t_proc_bind = new_proc_bind;
5650
5651#if OMPT_SUPPORT
5652 __ompt_team_assign_id(team, ompt_parallel_data);
5653 team->t.ompt_serialized_team_info = NULL;
5654#endif
5655
5656 KMP_MB();
5657
5658 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5659 team->t.t_id));
5660
5661 return team;
5662}
5663
5664/* TODO implement hot-teams at all levels */
5665/* TODO implement lazy thread release on demand (disband request) */
5666
5667/* free the team. return it to the team pool. release all the threads
5668 * associated with it */
5669void __kmp_free_team(kmp_root_t *root,
5670 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5671 int f;
5672 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5673 team->t.t_id));
5674
5675 /* verify state */
5676 KMP_DEBUG_ASSERT(root);
5677 KMP_DEBUG_ASSERT(team);
5678 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5679 KMP_DEBUG_ASSERT(team->t.t_threads);
5680
5681 int use_hot_team = team == root->r.r_hot_team;
5682#if KMP_NESTED_HOT_TEAMS
5683 int level;
5684 if (master) {
5685 level = team->t.t_active_level - 1;
5686 if (master->th.th_teams_microtask) { // in teams construct?
5687 if (master->th.th_teams_size.nteams > 1) {
5688 ++level; // level was not increased in teams construct for
5689 // team_of_masters
5690 }
5691 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5692 master->th.th_teams_level == team->t.t_level) {
5693 ++level; // level was not increased in teams construct for
5694 // team_of_workers before the parallel
5695 } // team->t.t_level will be increased inside parallel
5696 }
5697#if KMP_DEBUG
5698 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5699#endif
5700 if (level < __kmp_hot_teams_max_level) {
5701 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5702 use_hot_team = 1;
5703 }
5704 }
5705#endif // KMP_NESTED_HOT_TEAMS
5706
5707 /* team is done working */
5708 TCW_SYNC_PTR(team->t.t_pkfn,
5709 NULL); // Important for Debugging Support Library.
5710#if KMP_OS_WINDOWS
5711 team->t.t_copyin_counter = 0; // init counter for possible reuse
5712#endif
5713 // Do not reset pointer to parent team to NULL for hot teams.
5714
5715 /* if we are non-hot team, release our threads */
5716 if (!use_hot_team) {
5717 if (__kmp_tasking_mode != tskm_immediate_exec) {
5718 // Wait for threads to reach reapable state
5719 for (f = 1; f < team->t.t_nproc; ++f) {
5720 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5721 kmp_info_t *th = team->t.t_threads[f];
5722 volatile kmp_uint32 *state = &th->th.th_reap_state;
5723 while (*state != KMP_SAFE_TO_REAP) {
5724#if KMP_OS_WINDOWS
5725 // On Windows a thread can be killed at any time, check this
5726 DWORD ecode;
5727 if (!__kmp_is_thread_alive(th, &ecode)) {
5728 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5729 break;
5730 }
5731#endif
5732 // first check if thread is sleeping
5733 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5734 if (fl.is_sleeping())
5735 fl.resume(__kmp_gtid_from_thread(th));
5736 KMP_CPU_PAUSE();
5737 }
5738 }
5739
5740 // Delete task teams
5741 int tt_idx;
5742 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5743 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5744 if (task_team != NULL) {
5745 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5746 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5747 team->t.t_threads[f]->th.th_task_team = NULL;
5748 }
5749 KA_TRACE(
5750 20,
5751 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5752 __kmp_get_gtid(), task_team, team->t.t_id));
5753#if KMP_NESTED_HOT_TEAMS
5754 __kmp_free_task_team(master, task_team);
5755#endif
5756 team->t.t_task_team[tt_idx] = NULL;
5757 }
5758 }
5759 }
5760
5761 // Reset pointer to parent team only for non-hot teams.
5762 team->t.t_parent = NULL;
5763 team->t.t_level = 0;
5764 team->t.t_active_level = 0;
5765
5766 /* free the worker threads */
5767 for (f = 1; f < team->t.t_nproc; ++f) {
5768 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5769 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5770 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5771 1, 2);
5772 }
5773 __kmp_free_thread(team->t.t_threads[f]);
5774 }
5775
5776 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5777 if (team->t.b) {
5778 // wake up thread at old location
5779 team->t.b->go_release();
5780 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5781 for (f = 1; f < team->t.t_nproc; ++f) {
5782 if (team->t.b->sleep[f].sleep) {
5783 __kmp_atomic_resume_64(
5784 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5785 (kmp_atomic_flag_64<> *)NULL);
5786 }
5787 }
5788 }
5789 // Wait for threads to be removed from team
5790 for (int f = 1; f < team->t.t_nproc; ++f) {
5791 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5792 KMP_CPU_PAUSE();
5793 }
5794 }
5795 }
5796
5797 for (f = 1; f < team->t.t_nproc; ++f) {
5798 team->t.t_threads[f] = NULL;
5799 }
5800
5801 if (team->t.t_max_nproc > 1 &&
5802 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5803 distributedBarrier::deallocate(team->t.b);
5804 team->t.b = NULL;
5805 }
5806 /* put the team back in the team pool */
5807 /* TODO limit size of team pool, call reap_team if pool too large */
5808 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5809 __kmp_team_pool = (volatile kmp_team_t *)team;
5810 } else { // Check if team was created for primary threads in teams construct
5811 // See if first worker is a CG root
5812 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5813 team->t.t_threads[1]->th.th_cg_roots);
5814 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5815 // Clean up the CG root nodes on workers so that this team can be re-used
5816 for (f = 1; f < team->t.t_nproc; ++f) {
5817 kmp_info_t *thr = team->t.t_threads[f];
5818 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5819 thr->th.th_cg_roots->cg_root == thr);
5820 // Pop current CG root off list
5821 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5822 thr->th.th_cg_roots = tmp->up;
5823 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5824 " up to node %p. cg_nthreads was %d\n",
5825 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5826 int i = tmp->cg_nthreads--;
5827 if (i == 1) {
5828 __kmp_free(tmp); // free CG if we are the last thread in it
5829 }
5830 // Restore current task's thread_limit from CG root
5831 if (thr->th.th_cg_roots)
5832 thr->th.th_current_task->td_icvs.thread_limit =
5833 thr->th.th_cg_roots->cg_thread_limit;
5834 }
5835 }
5836 }
5837
5838 KMP_MB();
5839}
5840
5841/* reap the team. destroy it, reclaim all its resources and free its memory */
5842kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5843 kmp_team_t *next_pool = team->t.t_next_pool;
5844
5845 KMP_DEBUG_ASSERT(team);
5846 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5847 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5848 KMP_DEBUG_ASSERT(team->t.t_threads);
5849 KMP_DEBUG_ASSERT(team->t.t_argv);
5850
5851 /* TODO clean the threads that are a part of this? */
5852
5853 /* free stuff */
5854 __kmp_free_team_arrays(team);
5855 if (team->t.t_argv != &team->t.t_inline_argv[0])
5856 __kmp_free((void *)team->t.t_argv);
5857 __kmp_free(team);
5858
5859 KMP_MB();
5860 return next_pool;
5861}
5862
5863// Free the thread. Don't reap it, just place it on the pool of available
5864// threads.
5865//
5866// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5867// binding for the affinity mechanism to be useful.
5868//
5869// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5870// However, we want to avoid a potential performance problem by always
5871// scanning through the list to find the correct point at which to insert
5872// the thread (potential N**2 behavior). To do this we keep track of the
5873// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5874// With single-level parallelism, threads will always be added to the tail
5875// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5876// parallelism, all bets are off and we may need to scan through the entire
5877// free list.
5878//
5879// This change also has a potentially large performance benefit for some
5880// applications. Previously, as threads were freed from the hot team, they
5881// would be placed back on the free list in inverse order. If the hot team
5882// grew back to its original size, then the freed threads would be placed
5883// back on the hot team in reverse order. This could cause bad cache
5884// locality problems on programs where the size of the hot team regularly
5885// grew and shrank.
5886//
5887// Now, for single-level parallelism, the OMP tid is always == gtid.
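// Illustrative example (not part of the original source): if the pool holds
// gtids {2, 3, 4} and __kmp_thread_pool_insert_pt points at gtid 4, freeing
// gtid 5 appends it after 4 without any scan; freeing gtid 1 (possible with
// nested parallelism) resets the insert point and rescans from the head so the
// list stays sorted by gtid.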
5888void __kmp_free_thread(kmp_info_t *this_th) {
5889 int gtid;
5890 kmp_info_t **scan;
5891
5892 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5893 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5894
5895 KMP_DEBUG_ASSERT(this_th);
5896
5897 // When moving a thread to the pool, switch it to wait on its own b_go flag
5898 // and to an uninitialized (NULL) team.
5899 int b;
5900 kmp_balign_t *balign = this_th->th.th_bar;
5901 for (b = 0; b < bs_last_barrier; ++b) {
5902 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5903 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5904 balign[b].bb.team = NULL;
5905 balign[b].bb.leaf_kids = 0;
5906 }
5907 this_th->th.th_task_state = 0;
5908 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5909
5910 /* put thread back on the free pool */
5911 TCW_PTR(this_th->th.th_team, NULL);
5912 TCW_PTR(this_th->th.th_root, NULL);
5913 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5914
5915 while (this_th->th.th_cg_roots) {
5916 this_th->th.th_cg_roots->cg_nthreads--;
5917 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5918 " %p of thread %p to %d\n",
5919 this_th, this_th->th.th_cg_roots,
5920 this_th->th.th_cg_roots->cg_root,
5921 this_th->th.th_cg_roots->cg_nthreads));
5922 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5923 if (tmp->cg_root == this_th) { // Thread is a cg_root
5924 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5925 KA_TRACE(
5926 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5927 this_th->th.th_cg_roots = tmp->up;
5928 __kmp_free(tmp);
5929 } else { // Worker thread
5930 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5931 __kmp_free(tmp);
5932 }
5933 this_th->th.th_cg_roots = NULL;
5934 break;
5935 }
5936 }
5937
5938 /* If the implicit task assigned to this thread can be used by other threads,
5939 * multiple threads can share the data and try to free the task at
5940 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5941 * with higher probability when the hot team is disabled, but can occur even
5942 * when the hot team is enabled */
5943 __kmp_free_implicit_task(this_th);
5944 this_th->th.th_current_task = NULL;
5945
5946 // If the __kmp_thread_pool_insert_pt is already past the new insert
5947 // point, then we need to re-scan the entire list.
5948 gtid = this_th->th.th_info.ds.ds_gtid;
5949 if (__kmp_thread_pool_insert_pt != NULL) {
5950 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5951 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5952 __kmp_thread_pool_insert_pt = NULL;
5953 }
5954 }
5955
5956 // Scan down the list to find the place to insert the thread.
5957 // scan is the address of a link in the list, possibly the address of
5958 // __kmp_thread_pool itself.
5959 //
5960 // In the absence of nested parallelism, the for loop will have 0 iterations.
5961 if (__kmp_thread_pool_insert_pt != NULL) {
5962 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5963 } else {
5964 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5965 }
5966 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5967 scan = &((*scan)->th.th_next_pool))
5968 ;
5969
5970 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5971 // to its address.
5972 TCW_PTR(this_th->th.th_next_pool, *scan);
5973 __kmp_thread_pool_insert_pt = *scan = this_th;
5974 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5975 (this_th->th.th_info.ds.ds_gtid <
5976 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5977 TCW_4(this_th->th.th_in_pool, TRUE);
5978 __kmp_suspend_initialize_thread(this_th);
5979 __kmp_lock_suspend_mx(this_th);
5980 if (this_th->th.th_active == TRUE) {
5981 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5982 this_th->th.th_active_in_pool = TRUE;
5983 }
5984#if KMP_DEBUG
5985 else {
5986 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5987 }
5988#endif
5989 __kmp_unlock_suspend_mx(this_th);
5990
5991 TCW_4(__kmp_nth, __kmp_nth - 1);
5992
5993#ifdef KMP_ADJUST_BLOCKTIME
5994 /* Adjust blocktime back to user setting or default if necessary */
5995 /* Middle initialization might never have occurred */
5996 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5997 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5998 if (__kmp_nth <= __kmp_avail_proc) {
5999 __kmp_zero_bt = FALSE;
6000 }
6001 }
6002#endif /* KMP_ADJUST_BLOCKTIME */
6003
6004 KMP_MB();
6005}
6006
6007/* ------------------------------------------------------------------------ */
6008
6009void *__kmp_launch_thread(kmp_info_t *this_thr) {
6010#if OMP_PROFILING_SUPPORT
6011 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6012 // TODO: add a configuration option for time granularity
6013 if (ProfileTraceFile)
6014 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6015#endif
6016
6017 int gtid = this_thr->th.th_info.ds.ds_gtid;
6018 /* void *stack_data;*/
6019 kmp_team_t **volatile pteam;
6020
6021 KMP_MB();
6022 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6023
6024 if (__kmp_env_consistency_check) {
6025 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6026 }
6027
6028#if OMPD_SUPPORT
6029 if (ompd_state & OMPD_ENABLE_BP)
6030 ompd_bp_thread_begin();
6031#endif
6032
6033#if OMPT_SUPPORT
6034 ompt_data_t *thread_data = nullptr;
6035 if (ompt_enabled.enabled) {
6036 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6037 *thread_data = ompt_data_none;
6038
6039 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6040 this_thr->th.ompt_thread_info.wait_id = 0;
6041 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6042 this_thr->th.ompt_thread_info.parallel_flags = 0;
6043 if (ompt_enabled.ompt_callback_thread_begin) {
6044 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6045 ompt_thread_worker, thread_data);
6046 }
6047 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6048 }
6049#endif
6050
6051 /* This is the place where threads wait for work */
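 /* Worker life cycle (summary, not part of the original source): each
 iteration of the loop below waits at the fork barrier, invokes the team's
 microtask through t_invoke() if a team has been assigned, and then waits
 at the join barrier, until __kmp_global.g.g_done signals shutdown. */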
6052 while (!TCR_4(__kmp_global.g.g_done)) {
6053 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6054 KMP_MB();
6055
6056 /* wait for work to do */
6057 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6058
6059 /* No tid yet since not part of a team */
6060 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6061
6062#if OMPT_SUPPORT
6063 if (ompt_enabled.enabled) {
6064 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6065 }
6066#endif
6067
6068 pteam = &this_thr->th.th_team;
6069
6070 /* have we been allocated? */
6071 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6072 /* we were just woken up, so run our new task */
6073 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6074 int rc;
6075 KA_TRACE(20,
6076 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6077 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6078 (*pteam)->t.t_pkfn));
6079
6080 updateHWFPControl(*pteam);
6081
6082#if OMPT_SUPPORT
6083 if (ompt_enabled.enabled) {
6084 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6085 }
6086#endif
6087
6088 rc = (*pteam)->t.t_invoke(gtid);
6089 KMP_ASSERT(rc);
6090
6091 KMP_MB();
6092 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6093 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6094 (*pteam)->t.t_pkfn));
6095 }
6096#if OMPT_SUPPORT
6097 if (ompt_enabled.enabled) {
6098 /* no frame set while outside task */
6099 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6100
6101 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6102 }
6103#endif
6104 /* join barrier after parallel region */
6105 __kmp_join_barrier(gtid);
6106 }
6107 }
6108
6109#if OMPD_SUPPORT
6110 if (ompd_state & OMPD_ENABLE_BP)
6111 ompd_bp_thread_end();
6112#endif
6113
6114#if OMPT_SUPPORT
6115 if (ompt_enabled.ompt_callback_thread_end) {
6116 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6117 }
6118#endif
6119
6120 this_thr->th.th_task_team = NULL;
6121 /* run the destructors for the threadprivate data for this thread */
6122 __kmp_common_destroy_gtid(gtid);
6123
6124 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6125 KMP_MB();
6126
6127#if OMP_PROFILING_SUPPORT
6128 llvm::timeTraceProfilerFinishThread();
6129#endif
6130 return this_thr;
6131}
6132
6133/* ------------------------------------------------------------------------ */
6134
6135void __kmp_internal_end_dest(void *specific_gtid) {
6136 // Make sure no significant bits are lost
6137 int gtid;
6138 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6139
6140 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6141 /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
6142 * because 0 is reserved for the nothing-stored case */
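 /* Illustrative note (not part of the original source): a stored value of 1
 decodes to gtid 0, while a stored value of 0 means nothing was stored. */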
6143
6144 __kmp_internal_end_thread(gtid);
6145}
6146
6147#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6148
6149__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6150 __kmp_internal_end_atexit();
6151}
6152
6153#endif
6154
6155/* [Windows] josh: when the atexit handler is called, there may still be more
6156 than one thread alive */
6157void __kmp_internal_end_atexit(void) {
6158 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6159 /* [Windows]
6160 josh: ideally, we want to completely shutdown the library in this atexit
6161 handler, but stat code that depends on thread specific data for gtid fails
6162 because that data becomes unavailable at some point during the shutdown, so
6163 we call __kmp_internal_end_thread instead. We should eventually remove the
6164 dependency on __kmp_get_specific_gtid in the stat code and use
6165 __kmp_internal_end_library to cleanly shutdown the library.
6166
6167 // TODO: Can some of this comment about GVS be removed?
6168 I suspect that the offending stat code is executed when the calling thread
6169 tries to clean up a dead root thread's data structures, resulting in GVS
6170 code trying to close the GVS structures for that thread, but since the stat
6171 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6172 the calling thread is cleaning up itself instead of another thread, it gets
6173 confused. This happens because allowing a thread to unregister and clean up
6174 another thread is a recent modification for addressing an issue.
6175 Based on the current design (20050722), a thread may end up
6176 trying to unregister another thread only if thread death does not trigger
6177 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6178 thread specific data destructor function to detect thread death. For
6179 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6180 is nothing. Thus, the workaround is applicable only to the Windows static
6181 stat library. */
6182 __kmp_internal_end_library(-1);
6183#if KMP_OS_WINDOWS
6184 __kmp_close_console();
6185#endif
6186}
6187
6188static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6189 // It is assumed __kmp_forkjoin_lock is acquired.
6190
6191 int gtid;
6192
6193 KMP_DEBUG_ASSERT(thread != NULL);
6194
6195 gtid = thread->th.th_info.ds.ds_gtid;
6196
6197 if (!is_root) {
6198 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6199 /* Assume the threads are at the fork barrier here */
6200 KA_TRACE(
6201 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6202 gtid));
6203 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6204 while (
6205 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6206 KMP_CPU_PAUSE();
6207 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6208 } else {
6209 /* Need release fence here to prevent seg faults for tree forkjoin
6210 barrier (GEH) */
6211 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6212 thread);
6213 __kmp_release_64(&flag);
6214 }
6215 }
6216
6217 // Terminate OS thread.
6218 __kmp_reap_worker(thread);
6219
6220 // The thread was killed asynchronously. If it was actively
6221 // spinning in the thread pool, decrement the global count.
6222 //
6223 // There is a small timing hole here - if the worker thread was just waking
6224 // up after sleeping in the pool, had reset its th_active_in_pool flag but
6225 // had not yet decremented the global counter __kmp_thread_pool_active_nth,
6226 // then the global counter might not get updated.
6227 //
6228 // Currently, this can only happen as the library is unloaded,
6229 // so there are no harmful side effects.
6230 if (thread->th.th_active_in_pool) {
6231 thread->th.th_active_in_pool = FALSE;
6232 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6233 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6234 }
6235 }
6236
6237 __kmp_free_implicit_task(thread);
6238
6239// Free the fast memory for tasking
6240#if USE_FAST_MEMORY
6241 __kmp_free_fast_memory(thread);
6242#endif /* USE_FAST_MEMORY */
6243
6244 __kmp_suspend_uninitialize_thread(thread);
6245
6246 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6247 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6248
6249 --__kmp_all_nth;
6250 // __kmp_nth was decremented when the thread was added to the pool.
6251
6252#ifdef KMP_ADJUST_BLOCKTIME
6253 /* Adjust blocktime back to user setting or default if necessary */
6254 /* Middle initialization might never have occurred */
6255 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6256 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6257 if (__kmp_nth <= __kmp_avail_proc) {
6258 __kmp_zero_bt = FALSE;
6259 }
6260 }
6261#endif /* KMP_ADJUST_BLOCKTIME */
6262
6263 /* free the memory being used */
6264 if (__kmp_env_consistency_check) {
6265 if (thread->th.th_cons) {
6266 __kmp_free_cons_stack(thread->th.th_cons);
6267 thread->th.th_cons = NULL;
6268 }
6269 }
6270
6271 if (thread->th.th_pri_common != NULL) {
6272 __kmp_free(thread->th.th_pri_common);
6273 thread->th.th_pri_common = NULL;
6274 }
6275
6276 if (thread->th.th_task_state_memo_stack != NULL) {
6277 __kmp_free(thread->th.th_task_state_memo_stack);
6278 thread->th.th_task_state_memo_stack = NULL;
6279 }
6280
6281#if KMP_USE_BGET
6282 if (thread->th.th_local.bget_data != NULL) {
6283 __kmp_finalize_bget(thread);
6284 }
6285#endif
6286
6287#if KMP_AFFINITY_SUPPORTED
6288 if (thread->th.th_affin_mask != NULL) {
6289 KMP_CPU_FREE(thread->th.th_affin_mask);
6290 thread->th.th_affin_mask = NULL;
6291 }
6292#endif /* KMP_AFFINITY_SUPPORTED */
6293
6294#if KMP_USE_HIER_SCHED
6295 if (thread->th.th_hier_bar_data != NULL) {
6296 __kmp_free(thread->th.th_hier_bar_data);
6297 thread->th.th_hier_bar_data = NULL;
6298 }
6299#endif
6300
6301 __kmp_reap_team(thread->th.th_serial_team);
6302 thread->th.th_serial_team = NULL;
6303 __kmp_free(thread);
6304
6305 KMP_MB();
6306
6307} // __kmp_reap_thread
6308
6309static void __kmp_itthash_clean(kmp_info_t *th) {
6310#if USE_ITT_NOTIFY
6311 if (__kmp_itt_region_domains.count > 0) {
6312 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6313 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6314 while (bucket) {
6315 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6316 __kmp_thread_free(th, bucket);
6317 bucket = next;
6318 }
6319 }
6320 }
6321 if (__kmp_itt_barrier_domains.count > 0) {
6322 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6323 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6324 while (bucket) {
6325 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6326 __kmp_thread_free(th, bucket);
6327 bucket = next;
6328 }
6329 }
6330 }
6331#endif
6332}
6333
6334static void __kmp_internal_end(void) {
6335 int i;
6336
6337 /* First, unregister the library */
6338 __kmp_unregister_library();
6339
6340#if KMP_OS_WINDOWS
6341 /* In Win static library, we can't tell when a root actually dies, so we
6342 reclaim the data structures for any root threads that have died but not
6343 unregistered themselves, in order to shut down cleanly.
6344 In Win dynamic library we also can't tell when a thread dies. */
6345 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6346// dead roots
6347#endif
6348
6349 for (i = 0; i < __kmp_threads_capacity; i++)
6350 if (__kmp_root[i])
6351 if (__kmp_root[i]->r.r_active)
6352 break;
6353 KMP_MB(); /* Flush all pending memory write invalidates. */
6354 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6355
6356 if (i < __kmp_threads_capacity) {
6357#if KMP_USE_MONITOR
6358 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6359 KMP_MB(); /* Flush all pending memory write invalidates. */
6360
6361 // Need to check that monitor was initialized before reaping it. If we are
6362 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6363 // __kmp_monitor will appear to contain valid data, but it is only valid in
6364 // the parent process, not the child.
6365 // New behavior (201008): instead of keying off of the flag
6366 // __kmp_init_parallel, the monitor thread creation is keyed off
6367 // of the new flag __kmp_init_monitor.
6368 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6369 if (TCR_4(__kmp_init_monitor)) {
6370 __kmp_reap_monitor(&__kmp_monitor);
6371 TCW_4(__kmp_init_monitor, 0);
6372 }
6373 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6374 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6375#endif // KMP_USE_MONITOR
6376 } else {
6377/* TODO move this to cleanup code */
6378#ifdef KMP_DEBUG
6379 /* make sure that everything has properly ended */
6380 for (i = 0; i < __kmp_threads_capacity; i++) {
6381 if (__kmp_root[i]) {
6382 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6383 // there can be uber threads alive here
6384 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6385 }
6386 }
6387#endif
6388
6389 KMP_MB();
6390
6391 // Reap the worker threads.
6392 // This is valid for now, but be careful if threads are reaped sooner.
6393 while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6394 // Get the next thread from the pool.
6395 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6396 __kmp_thread_pool = thread->th.th_next_pool;
6397 // Reap it.
6398 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6399 thread->th.th_next_pool = NULL;
6400 thread->th.th_in_pool = FALSE;
6401 __kmp_reap_thread(thread, 0);
6402 }
6403 __kmp_thread_pool_insert_pt = NULL;
6404
6405 // Reap teams.
6406 while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6407 // Get the next team from the pool.
6408 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6409 __kmp_team_pool = team->t.t_next_pool;
6410 // Reap it.
6411 team->t.t_next_pool = NULL;
6412 __kmp_reap_team(team);
6413 }
6414
6415 __kmp_reap_task_teams();
6416
6417#if KMP_OS_UNIX
6418 // Threads that are not reaped should not access any resources since they
6419 // are going to be deallocated soon, so the shutdown sequence should wait
6420 // until all threads either exit the final spin-waiting loop or begin
6421 // sleeping after the given blocktime.
6422 for (i = 0; i < __kmp_threads_capacity; i++) {
6423 kmp_info_t *thr = __kmp_threads[i];
6424 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6425 KMP_CPU_PAUSE();
6426 }
6427#endif
6428
6429 for (i = 0; i < __kmp_threads_capacity; ++i) {
6430 // TBD: Add some checking...
6431 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6432 }
6433
6434 /* Make sure all threadprivate destructors get run by joining with all
6435 worker threads before resetting this flag */
6436 TCW_SYNC_4(__kmp_init_common, FALSE);
6437
6438 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6439 KMP_MB();
6440
6441#if KMP_USE_MONITOR
6442 // See note above: One of the possible fixes for CQ138434 / CQ140126
6443 //
6444 // FIXME: push both code fragments down and CSE them?
6445 // push them into __kmp_cleanup() ?
6446 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6447 if (TCR_4(__kmp_init_monitor)) {
6448 __kmp_reap_monitor(&__kmp_monitor);
6449 TCW_4(__kmp_init_monitor, 0);
6450 }
6451 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6452 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6453#endif
6454 } /* else !__kmp_global.t_active */
6455 TCW_4(__kmp_init_gtid, FALSE);
6456 KMP_MB(); /* Flush all pending memory write invalidates. */
6457
6458 __kmp_cleanup();
6459#if OMPT_SUPPORT
6460 ompt_fini();
6461#endif
6462}
6463
6464void __kmp_internal_end_library(int gtid_req) {
6465 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6466 /* this shouldn't be a race condition because __kmp_internal_end() is the
6467 only place to clear __kmp_serial_init */
6468 /* we'll check this later too, after we get the lock */
6469 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6470 // redundant, because the next check will work in any case.
6471 if (__kmp_global.g.g_abort) {
6472 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6473 /* TODO abort? */
6474 return;
6475 }
6476 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6477 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6478 return;
6479 }
6480
6481 // If hidden helper team has been initialized, we need to deinit it
6482 if (TCR_4(__kmp_init_hidden_helper) &&
6483 !TCR_4(__kmp_hidden_helper_team_done)) {
6484 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6485 // First release the main thread to let it continue its work
6486 __kmp_hidden_helper_main_thread_release();
6487 // Wait until the hidden helper team has been destroyed
6488 __kmp_hidden_helper_threads_deinitz_wait();
6489 }
6490
6491 KMP_MB(); /* Flush all pending memory write invalidates. */
6492 /* find out who we are and what we should do */
6493 {
6494 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6495 KA_TRACE(
6496 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6497 if (gtid == KMP_GTID_SHUTDOWN) {
6498 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6499 "already shutdown\n"));
6500 return;
6501 } else if (gtid == KMP_GTID_MONITOR) {
6502 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6503 "registered, or system shutdown\n"));
6504 return;
6505 } else if (gtid == KMP_GTID_DNE) {
6506 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6507 "shutdown\n"));
6508 /* we don't know who we are, but we may still shutdown the library */
6509 } else if (KMP_UBER_GTID(gtid)) {
6510 /* unregister ourselves as an uber thread. gtid is no longer valid */
6511 if (__kmp_root[gtid]->r.r_active) {
6512 __kmp_global.g.g_abort = -1;
6513 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6514 __kmp_unregister_library();
6515 KA_TRACE(10,
6516 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6517 gtid));
6518 return;
6519 } else {
6520 __kmp_itthash_clean(__kmp_threads[gtid]);
6521 KA_TRACE(
6522 10,
6523 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6524 __kmp_unregister_root_current_thread(gtid);
6525 }
6526 } else {
6527/* worker threads may call this function through the atexit handler, if they
6528 * call exit() */
6529/* For now, skip the usual subsequent processing and just dump the debug buffer.
6530 TODO: do a thorough shutdown instead */
6531#ifdef DUMP_DEBUG_ON_EXIT
6532 if (__kmp_debug_buf)
6533 __kmp_dump_debug_buffer();
6534#endif
6535 // An unregister-library call was added here for the shm-based Linux
6536 // registration; without it, lots of files would be left in /dev/shm.
6537 // Clean up the shared memory file before exiting.
6538 __kmp_unregister_library();
6539 return;
6540 }
6541 }
6542 /* synchronize the termination process */
6543 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6544
6545 /* have we already finished */
6546 if (__kmp_global.g.g_abort) {
6547 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6548 /* TODO abort? */
6549 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6550 return;
6551 }
6552 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6553 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6554 return;
6555 }
6556
6557 /* We need this lock to enforce mutex between this reading of
6558 __kmp_threads_capacity and the writing by __kmp_register_root.
6559 Alternatively, we can use a counter of roots that is atomically updated by
6560 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6561 __kmp_internal_end_*. */
6562 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6563
6564 /* now we can safely conduct the actual termination */
6565 __kmp_internal_end();
6566
6567 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6568 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6569
6570 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6571
6572#ifdef DUMP_DEBUG_ON_EXIT
6573 if (__kmp_debug_buf)
6574 __kmp_dump_debug_buffer();
6575#endif
6576
6577#if KMP_OS_WINDOWS
6578 __kmp_close_console();
6579#endif
6580
6581 __kmp_fini_allocator();
6582
6583} // __kmp_internal_end_library
6584
6585void __kmp_internal_end_thread(int gtid_req) {
6586 int i;
6587
6588 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6589 /* this shouldn't be a race condition because __kmp_internal_end() is the
6590 * only place to clear __kmp_serial_init */
6591 /* we'll check this later too, after we get the lock */
6592 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6593 // redundant, because the next check will work in any case.
6594 if (__kmp_global.g.g_abort) {
6595 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6596 /* TODO abort? */
6597 return;
6598 }
6599 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6600 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6601 return;
6602 }
6603
6604 // If hidden helper team has been initialized, we need to deinit it
6605 if (TCR_4(__kmp_init_hidden_helper) &&
6606 !TCR_4(__kmp_hidden_helper_team_done)) {
6607 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6608 // First release the main thread to let it continue its work
6609 __kmp_hidden_helper_main_thread_release();
6610 // Wait until the hidden helper team has been destroyed
6611 __kmp_hidden_helper_threads_deinitz_wait();
6612 }
6613
6614 KMP_MB(); /* Flush all pending memory write invalidates. */
6615
6616 /* find out who we are and what we should do */
6617 {
6618 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6619 KA_TRACE(10,
6620 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6621 if (gtid == KMP_GTID_SHUTDOWN) {
6622 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6623 "already shutdown\n"));
6624 return;
6625 } else if (gtid == KMP_GTID_MONITOR) {
6626 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6627 "registered, or system shutdown\n"));
6628 return;
6629 } else if (gtid == KMP_GTID_DNE) {
6630 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6631 "shutdown\n"));
6632 return;
6633 /* we don't know who we are */
6634 } else if (KMP_UBER_GTID(gtid)) {
6635 /* unregister ourselves as an uber thread. gtid is no longer valid */
6636 if (__kmp_root[gtid]->r.r_active) {
6637 __kmp_global.g.g_abort = -1;
6638 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6639 KA_TRACE(10,
6640 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6641 gtid));
6642 return;
6643 } else {
6644 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6645 gtid));
6646 __kmp_unregister_root_current_thread(gtid);
6647 }
6648 } else {
6649 /* just a worker thread, let's leave */
6650 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6651
6652 if (gtid >= 0) {
6653 __kmp_threads[gtid]->th.th_task_team = NULL;
6654 }
6655
6656 KA_TRACE(10,
6657 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6658 gtid));
6659 return;
6660 }
6661 }
6662#if KMP_DYNAMIC_LIB
6663 if (__kmp_pause_status != kmp_hard_paused)
6664 // AC: let's not shut down the dynamic library at the exit of the uber
6665 // thread; it is better to shut down later, in the library destructor.
6666 {
6667 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6668 return;
6669 }
6670#endif
6671 /* synchronize the termination process */
6672 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6673
6674 /* have we already finished */
6675 if (__kmp_global.g.g_abort) {
6676 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6677 /* TODO abort? */
6678 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6679 return;
6680 }
6681 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6682 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6683 return;
6684 }
6685
6686 /* We need this lock to enforce mutex between this reading of
6687 __kmp_threads_capacity and the writing by __kmp_register_root.
6688 Alternatively, we can use a counter of roots that is atomically updated by
6689 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6690 __kmp_internal_end_*. */
6691
6692 /* should we finish the run-time? are all siblings done? */
6693 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6694
6695 for (i = 0; i < __kmp_threads_capacity; ++i) {
6696 if (KMP_UBER_GTID(i)) {
6697 KA_TRACE(
6698 10,
6699 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6700 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6701 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6702 return;
6703 }
6704 }
6705
6706 /* now we can safely conduct the actual termination */
6707
6708 __kmp_internal_end();
6709
6710 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6711 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6712
6713 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6714
6715#ifdef DUMP_DEBUG_ON_EXIT
6716 if (__kmp_debug_buf)
6717 __kmp_dump_debug_buffer();
6718#endif
6719} // __kmp_internal_end_thread
6720
6721// -----------------------------------------------------------------------------
6722// Library registration stuff.
6723
6724static long __kmp_registration_flag = 0;
6725// Random value used to indicate library initialization.
6726static char *__kmp_registration_str = NULL;
6727// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6728
6729static inline char *__kmp_reg_status_name() {
6730 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6731 each thread. If registration and unregistration happen in different threads
6732 (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6733 env var cannot be found, because the name will contain a different pid. */
6734// macOS* complains about name being too long with additional getuid()
6735#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6736 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6737 (int)getuid());
6738#else
6739 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6740#endif
6741 } // __kmp_reg_status_name
6742
6743#if defined(KMP_USE_SHM)
6744// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6745char *temp_reg_status_file_name = nullptr;
6746#endif
6747
6748void __kmp_register_library_startup(void) {
6749
6750 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6751 int done = 0;
6752 union {
6753 double dtime;
6754 long ltime;
6755 } time;
6756#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6757 __kmp_initialize_system_tick();
6758#endif
6759 __kmp_read_system_time(&time.dtime);
6760 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6761 __kmp_registration_str =
6762 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6763 __kmp_registration_flag, KMP_LIBRARY_FILE);
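  // Illustrative only: the string built above ends up looking like
  // "0x7f12a4c0-cafe1234-libomp.so" -- the address of __kmp_registration_flag,
  // the flag value in hex, and KMP_LIBRARY_FILE -- and is published below
  // either in shared memory or in an environment variable.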
6764
6765 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6766 __kmp_registration_str));
6767
6768 while (!done) {
6769
6770 char *value = NULL; // Actual value of the environment variable.
6771
6772#if defined(KMP_USE_SHM)
6773 char *shm_name = __kmp_str_format("/%s", name);
6774 int shm_preexist = 0;
6775 char *data1;
6776 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6777 if ((fd1 == -1) && (errno == EEXIST)) {
6778 // file didn't open because it already exists.
6779 // try opening existing file
6780 fd1 = shm_open(shm_name, O_RDWR, 0666);
6781 if (fd1 == -1) { // file didn't open
6782 // error out here
6783 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6784 __kmp_msg_null);
6785 } else {
6786 // able to open existing file
6787 shm_preexist = 1;
6788 }
6789 } else if (fd1 == -1) {
6790 // SHM didn't open due to an error other than "already exists". Try to
6791 // create a temp file under /tmp.
6792 // TODO: /tmp might not always be the temporary directory. For now we will
6793 // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6794 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6795 fd1 = mkstemp(temp_file_name);
6796 if (fd1 == -1) {
6797 // error out here.
6798 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6799 __kmp_msg_null);
6800 }
6801 temp_reg_status_file_name = temp_file_name;
6802 }
6803 if (shm_preexist == 0) {
6804 // we created the SHM; now set its size
6805 if (ftruncate(fd1, SHM_SIZE) == -1) {
6806 // an error occurred while setting the size
6807 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6808 KMP_ERR(errno), __kmp_msg_null);
6809 }
6810 }
6811 data1 =
6812 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6813 if (data1 == MAP_FAILED) {
6814 // failed to map shared memory
6815 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6816 __kmp_msg_null);
6817 }
6818 if (shm_preexist == 0) { // set data to SHM, set value
6819 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6820 }
6821 // Read value from either what we just wrote or existing file.
6822 value = __kmp_str_format("%s", data1); // read value from SHM
6823 munmap(data1, SHM_SIZE);
6824 close(fd1);
6825#else // Windows and unix with static library
6826 // Set the environment variable, but do not overwrite it if it already exists.
6827 __kmp_env_set(name, __kmp_registration_str, 0);
6828 // read value to see if it got set
6829 value = __kmp_env_get(name);
6830#endif
6831
6832 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6833 done = 1; // Ok, environment variable set successfully, exit the loop.
6834 } else {
6835 // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6836 // Check whether it is alive or dead.
6837 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6838 char *tail = value;
6839 char *flag_addr_str = NULL;
6840 char *flag_val_str = NULL;
6841 char const *file_name = NULL;
6842 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6843 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6844 file_name = tail;
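      // Illustrative only: a value of "0x7f12a4c0-cafe1234-libomp.so" splits
      // into flag_addr_str "0x7f12a4c0", flag_val_str "cafe1234", and
      // file_name "libomp.so".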
6845 if (tail != NULL) {
6846 unsigned long *flag_addr = 0;
6847 unsigned long flag_val = 0;
6848 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6849 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6850 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6851 // First, check whether environment-encoded address is mapped into
6852 // addr space.
6853 // If so, dereference it to see if it still has the right value.
6854 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6855 neighbor = 1;
6856 } else {
6857 // If not, then we know the other copy of the library is no longer
6858 // running.
6859 neighbor = 2;
6860 }
6861 }
6862 }
6863 switch (neighbor) {
6864 case 0: // Cannot parse environment variable -- neighbor status unknown.
6865 // Assume it is the incompatible format of a future version of the
6866 // library. Assume the other library is alive.
6867 // WARN( ... ); // TODO: Issue a warning.
6868 file_name = "unknown library";
6869 KMP_FALLTHROUGH();
6870 // Attention! Falling through to the next case. That's intentional.
6871 case 1: { // Neighbor is alive.
6872 // Check it is allowed.
6873 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6874 if (!__kmp_str_match_true(duplicate_ok)) {
6875 // That's not allowed. Issue fatal error.
6876 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6877 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6878 }
6879 KMP_INTERNAL_FREE(duplicate_ok);
6880 __kmp_duplicate_library_ok = 1;
6881 done = 1; // Exit the loop.
6882 } break;
6883 case 2: { // Neighbor is dead.
6884
6885#if defined(KMP_USE_SHM)
6886 // close shared memory.
6887 shm_unlink(shm_name); // this removes file in /dev/shm
6888#else
6889 // Clear the variable and try to register library again.
6890 __kmp_env_unset(name);
6891#endif
6892 } break;
6893 default: {
6894 KMP_DEBUG_ASSERT(0);
6895 } break;
6896 }
6897 }
6898 KMP_INTERNAL_FREE((void *)value);
6899#if defined(KMP_USE_SHM)
6900 KMP_INTERNAL_FREE((void *)shm_name);
6901#endif
6902 } // while
6903 KMP_INTERNAL_FREE((void *)name);
6904
6905} // func __kmp_register_library_startup
6906
6907void __kmp_unregister_library(void) {
6908
6909 char *name = __kmp_reg_status_name();
6910 char *value = NULL;
6911
6912#if defined(KMP_USE_SHM)
6913 bool use_shm = true;
6914 char *shm_name = __kmp_str_format("/%s", name);
6915 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6916 if (fd1 == -1) {
6917 // File did not open. Try the temporary file.
6918 use_shm = false;
6919 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6920 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6921 if (fd1 == -1) {
6922 // give up now.
6923 return;
6924 }
6925 }
6926 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6927 if (data1 != MAP_FAILED) {
6928 value = __kmp_str_format("%s", data1); // read value from SHM
6929 munmap(data1, SHM_SIZE);
6930 }
6931 close(fd1);
6932#else
6933 value = __kmp_env_get(name);
6934#endif
6935
6936 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6937 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6938 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6939// Ok, this is our variable. Delete it.
6940#if defined(KMP_USE_SHM)
6941 if (use_shm) {
6942 shm_unlink(shm_name); // this removes file in /dev/shm
6943 } else {
6944 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6945 unlink(temp_reg_status_file_name); // this removes the temp file
6946 }
6947#else
6948 __kmp_env_unset(name);
6949#endif
6950 }
6951
6952#if defined(KMP_USE_SHM)
6953 KMP_INTERNAL_FREE(shm_name);
6954 if (!use_shm) {
6955 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6956 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6957 }
6958#endif
6959
6960 KMP_INTERNAL_FREE(__kmp_registration_str);
6961 KMP_INTERNAL_FREE(value);
6962 KMP_INTERNAL_FREE(name);
6963
6964 __kmp_registration_flag = 0;
6965 __kmp_registration_str = NULL;
6966
6967} // __kmp_unregister_library
6968
6969// End of Library registration stuff.
6970// -----------------------------------------------------------------------------
6971
6972#if KMP_MIC_SUPPORTED
6973
6974static void __kmp_check_mic_type() {
6975 kmp_cpuid_t cpuid_state = {0};
6976 kmp_cpuid_t *cs_p = &cpuid_state;
6977 __kmp_x86_cpuid(1, 0, cs_p);
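  // CPUID leaf 1 reports the processor's family/model/stepping signature in
  // EAX; the masks below pick out the family/model fields that distinguish
  // KNC (mic2) from KNL (mic3).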
6978 // We don't support mic1 at the moment
6979 if ((cs_p->eax & 0xff0) == 0xB10) {
6980 __kmp_mic_type = mic2;
6981 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6982 __kmp_mic_type = mic3;
6983 } else {
6984 __kmp_mic_type = non_mic;
6985 }
6986}
6987
6988#endif /* KMP_MIC_SUPPORTED */
6989
6990#if KMP_HAVE_UMWAIT
6991static void __kmp_user_level_mwait_init() {
6992 struct kmp_cpuid buf;
6993 __kmp_x86_cpuid(7, 0, &buf);
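  // CPUID leaf 7 (sub-leaf 0): ECX bit 5 reports WAITPKG support, i.e. the
  // umwait/tpause instructions checked for below.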
6994 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6995 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6996 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6997 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6998 __kmp_umwait_enabled));
6999}
7000#elif KMP_HAVE_MWAIT
7001#ifndef AT_INTELPHIUSERMWAIT
7002// Spurious, non-existent value that should always fail to return anything.
7003 // Will be replaced with the correct value once we know it.
7004#define AT_INTELPHIUSERMWAIT 10000
7005#endif
7006// getauxval() function is available in RHEL7 and SLES12. If a system with an
7007// earlier OS is used to build the RTL, we'll use the following internal
7008// function when the entry is not found.
7009unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7010unsigned long getauxval(unsigned long) { return 0; }
7011
7012static void __kmp_user_level_mwait_init() {
7013 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7014 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7015 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7016 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7017 if (__kmp_mic_type == mic3) {
7018 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7019 if ((res & 0x1) || __kmp_user_level_mwait) {
7020 __kmp_mwait_enabled = TRUE;
7021 if (__kmp_user_level_mwait) {
7022 KMP_INFORM(EnvMwaitWarn);
7023 }
7024 } else {
7025 __kmp_mwait_enabled = FALSE;
7026 }
7027 }
7028 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7029 "__kmp_mwait_enabled = %d\n",
7030 __kmp_mic_type, __kmp_mwait_enabled));
7031}
7032#endif /* KMP_HAVE_UMWAIT */
7033
7034static void __kmp_do_serial_initialize(void) {
7035 int i, gtid;
7036 size_t size;
7037
7038 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7039
7040 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7041 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7042 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7043 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7044 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7045
7046#if OMPT_SUPPORT
7047 ompt_pre_init();
7048#endif
7049#if OMPD_SUPPORT
7050 __kmp_env_dump();
7051 ompd_init();
7052#endif
7053
7054 __kmp_validate_locks();
7055
7056#if ENABLE_LIBOMPTARGET
7057 /* Initialize functions from libomptarget */
7058 __kmp_init_omptarget();
7059#endif
7060
7061 /* Initialize internal memory allocator */
7062 __kmp_init_allocator();
7063
7064 /* Register the library startup via an environment variable or via mapped
7065 shared memory file and check to see whether another copy of the library is
7066 already registered. Since a forked child process is often terminated, we
7067 postpone the registration until middle initialization in the child. */
7068 if (__kmp_need_register_serial)
7069 __kmp_register_library_startup();
7070
7071 /* TODO reinitialization of library */
7072 if (TCR_4(__kmp_global.g.g_done)) {
7073 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7074 }
7075
7076 __kmp_global.g.g_abort = 0;
7077 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7078
7079/* initialize the locks */
7080#if KMP_USE_ADAPTIVE_LOCKS
7081#if KMP_DEBUG_ADAPTIVE_LOCKS
7082 __kmp_init_speculative_stats();
7083#endif
7084#endif
7085#if KMP_STATS_ENABLED
7086 __kmp_stats_init();
7087#endif
7088 __kmp_init_lock(&__kmp_global_lock);
7089 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7090 __kmp_init_lock(&__kmp_debug_lock);
7091 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7092 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7093 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7094 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7095 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7096 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7097 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7098 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7099 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7100 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7101 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7102 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7103 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7104 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7105 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7106#if KMP_USE_MONITOR
7107 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7108#endif
7109 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7110
7111 /* conduct initialization and initial setup of configuration */
7112
7113 __kmp_runtime_initialize();
7114
7115#if KMP_MIC_SUPPORTED
7116 __kmp_check_mic_type();
7117#endif
7118
7119// Some global variable initialization moved here from kmp_env_initialize()
7120#ifdef KMP_DEBUG
7121 kmp_diag = 0;
7122#endif
7123 __kmp_abort_delay = 0;
7124
7125 // From __kmp_init_dflt_team_nth()
7126 /* assume the entire machine will be used */
7127 __kmp_dflt_team_nth_ub = __kmp_xproc;
7128 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7129 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7130 }
7131 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7132 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7133 }
7134 __kmp_max_nth = __kmp_sys_max_nth;
7135 __kmp_cg_max_nth = __kmp_sys_max_nth;
7136 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7137 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7138 __kmp_teams_max_nth = __kmp_sys_max_nth;
7139 }
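  // In effect (sketch): __kmp_dflt_team_nth_ub = clamp(__kmp_xproc,
  // KMP_MIN_NTH, __kmp_sys_max_nth), and __kmp_teams_max_nth =
  // min(__kmp_xproc, __kmp_sys_max_nth).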
7140
7141 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7142 // part
7143 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7144#if KMP_USE_MONITOR
7145 __kmp_monitor_wakeups =
7146 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7147 __kmp_bt_intervals =
7148 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7149#endif
7150 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7151 __kmp_library = library_throughput;
7152 // From KMP_SCHEDULE initialization
7153 __kmp_static = kmp_sch_static_balanced;
7154 // AC: do not use analytical here, because it is non-monotonic
7155//__kmp_guided = kmp_sch_guided_iterative_chunked;
7156//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7157// need to repeat assignment
7158// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7159// bit control and barrier method control parts
7160#if KMP_FAST_REDUCTION_BARRIER
7161#define kmp_reduction_barrier_gather_bb ((int)1)
7162#define kmp_reduction_barrier_release_bb ((int)1)
7163#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7164#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7165#endif // KMP_FAST_REDUCTION_BARRIER
7166 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7167 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7168 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7169 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7170 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7171#if KMP_FAST_REDUCTION_BARRIER
7172 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7173 // lin_64 ): hyper,1
7174 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7175 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7176 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7177 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7178 }
7179#endif // KMP_FAST_REDUCTION_BARRIER
7180 }
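  // (The branch-bit values above and below are roughly the log2 fan-out used
  // by the tree/hyper barrier algorithms; e.g. a gather branch-bit value of 3
  // means up to 2^3 = 8 children per node.)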
7181#if KMP_FAST_REDUCTION_BARRIER
7182#undef kmp_reduction_barrier_release_pat
7183#undef kmp_reduction_barrier_gather_pat
7184#undef kmp_reduction_barrier_release_bb
7185#undef kmp_reduction_barrier_gather_bb
7186#endif // KMP_FAST_REDUCTION_BARRIER
7187#if KMP_MIC_SUPPORTED
7188 if (__kmp_mic_type == mic2) { // KNC
7189 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7190 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7191 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7192 1; // forkjoin release
7193 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7194 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7195 }
7196#if KMP_FAST_REDUCTION_BARRIER
7197 if (__kmp_mic_type == mic2) { // KNC
7198 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7199 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7200 }
7201#endif // KMP_FAST_REDUCTION_BARRIER
7202#endif // KMP_MIC_SUPPORTED
7203
7204// From KMP_CHECKS initialization
7205#ifdef KMP_DEBUG
7206 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7207#else
7208 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7209#endif
7210
7211 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7212 __kmp_foreign_tp = TRUE;
7213
7214 __kmp_global.g.g_dynamic = FALSE;
7215 __kmp_global.g.g_dynamic_mode = dynamic_default;
7216
7217 __kmp_init_nesting_mode();
7218
7219 __kmp_env_initialize(NULL);
7220
7221#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7222 __kmp_user_level_mwait_init();
7223#endif
7224// Print all messages in message catalog for testing purposes.
7225#ifdef KMP_DEBUG
7226 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7227 if (__kmp_str_match_true(val)) {
7228 kmp_str_buf_t buffer;
7229 __kmp_str_buf_init(&buffer);
7230 __kmp_i18n_dump_catalog(&buffer);
7231 __kmp_printf("%s", buffer.str);
7232 __kmp_str_buf_free(&buffer);
7233 }
7234 __kmp_env_free(&val);
7235#endif
7236
7237 __kmp_threads_capacity =
7238 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7239 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7240 __kmp_tp_capacity = __kmp_default_tp_capacity(
7241 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7242
7243 // If the library is shut down properly, both pools must be NULL. Just in
7244 // case, set them to NULL -- some memory may leak, but subsequent code will
7245 // work even if pools are not freed.
7246 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7247 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7248 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7249 __kmp_thread_pool = NULL;
7250 __kmp_thread_pool_insert_pt = NULL;
7251 __kmp_team_pool = NULL;
7252
7253 /* Allocate all of the variable sized records */
7254 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7255 * expandable */
7256 /* Since allocation is cache-aligned, just add extra padding at the end */
7257 size =
7258 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7259 CACHE_LINE;
7260 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7261 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7262 sizeof(kmp_info_t *) * __kmp_threads_capacity);
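  // Layout of the single allocation (sketch):
  //   [ kmp_info_t *threads[capacity] | kmp_root_t *roots[capacity] | padding ]
  // __kmp_threads points at the start; __kmp_root points just past the
  // threads-pointer portion.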
7263
7264 /* init thread counts */
7265 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7266 0); // Asserts fail if the library is reinitializing and
7267 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7268 __kmp_all_nth = 0;
7269 __kmp_nth = 0;
7270
7271 /* setup the uber master thread and hierarchy */
7272 gtid = __kmp_register_root(TRUE);
7273 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7274 KMP_ASSERT(KMP_UBER_GTID(gtid));
7275 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7276
7277 KMP_MB(); /* Flush all pending memory write invalidates. */
7278
7279 __kmp_common_initialize();
7280
7281#if KMP_OS_UNIX
7282 /* invoke the child fork handler */
7283 __kmp_register_atfork();
7284#endif
7285
7286#if !KMP_DYNAMIC_LIB || \
7287 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7288 {
7289 /* Invoke the exit handler when the program finishes, only for static
7290 library and macOS* dynamic. For other dynamic libraries, we already
7291 have _fini and DllMain. */
7292 int rc = atexit(__kmp_internal_end_atexit);
7293 if (rc != 0) {
7294 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7295 __kmp_msg_null);
7296 }
7297 }
7298#endif
7299
7300#if KMP_HANDLE_SIGNALS
7301#if KMP_OS_UNIX
7302 /* NOTE: make sure that this is called before the user installs their own
7303 signal handlers so that the user handlers are called first. This way they
7304 can return false, not call our handler, avoid terminating the library, and
7305 continue execution where they left off. */
7306 __kmp_install_signals(FALSE);
7307#endif /* KMP_OS_UNIX */
7308#if KMP_OS_WINDOWS
7309 __kmp_install_signals(TRUE);
7310#endif /* KMP_OS_WINDOWS */
7311#endif
7312
7313 /* we have finished the serial initialization */
7314 __kmp_init_counter++;
7315
7316 __kmp_init_serial = TRUE;
7317
7318 if (__kmp_settings) {
7319 __kmp_env_print();
7320 }
7321
7322 if (__kmp_display_env || __kmp_display_env_verbose) {
7323 __kmp_env_print_2();
7324 }
7325
7326#if OMPT_SUPPORT
7327 ompt_post_init();
7328#endif
7329
7330 KMP_MB();
7331
7332 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7333}
7334
7335void __kmp_serial_initialize(void) {
7336 if (__kmp_init_serial) {
7337 return;
7338 }
7339 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7340 if (__kmp_init_serial) {
7341 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7342 return;
7343 }
7344 __kmp_do_serial_initialize();
7345 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7346}
7347
7348static void __kmp_do_middle_initialize(void) {
7349 int i, j;
7350 int prev_dflt_team_nth;
7351
7352 if (!__kmp_init_serial) {
7353 __kmp_do_serial_initialize();
7354 }
7355
7356 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7357
7358 if (UNLIKELY(!__kmp_need_register_serial)) {
7359 // We are in a forked child process. The registration was skipped during
7360 // serial initialization in __kmp_atfork_child handler. Do it here.
7361 __kmp_register_library_startup();
7362 }
7363
7364 // Save the previous value for the __kmp_dflt_team_nth so that
7365 // we can avoid some reinitialization if it hasn't changed.
7366 prev_dflt_team_nth = __kmp_dflt_team_nth;
7367
7368#if KMP_AFFINITY_SUPPORTED
7369 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7370 // number of cores on the machine.
7371 __kmp_affinity_initialize(__kmp_affinity);
7372
7373#endif /* KMP_AFFINITY_SUPPORTED */
7374
7375 KMP_ASSERT(__kmp_xproc > 0);
7376 if (__kmp_avail_proc == 0) {
7377 __kmp_avail_proc = __kmp_xproc;
7378 }
7379
7380 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7381 // correct them now
7382 j = 0;
7383 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7384 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7385 __kmp_avail_proc;
7386 j++;
7387 }
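  // Illustrative only: with OMP_NUM_THREADS=",,2,3" the two leading empty
  // levels are filled in here with __kmp_avail_proc, leaving 2 and 3 for the
  // deeper nesting levels.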
7388
7389 if (__kmp_dflt_team_nth == 0) {
7390#ifdef KMP_DFLT_NTH_CORES
7391 // Default #threads = #cores
7392 __kmp_dflt_team_nth = __kmp_ncores;
7393 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7394 "__kmp_ncores (%d)\n",
7395 __kmp_dflt_team_nth));
7396#else
7397 // Default #threads = #available OS procs
7398 __kmp_dflt_team_nth = __kmp_avail_proc;
7399 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7400 "__kmp_avail_proc(%d)\n",
7401 __kmp_dflt_team_nth));
7402#endif /* KMP_DFLT_NTH_CORES */
7403 }
7404
7405 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7406 __kmp_dflt_team_nth = KMP_MIN_NTH;
7407 }
7408 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7409 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7410 }
7411
7412 if (__kmp_nesting_mode > 0)
7413 __kmp_set_nesting_mode_threads();
7414
7415 // There's no harm in continuing if the following check fails,
7416 // but it indicates an error in the previous logic.
7417 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7418
7419 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7420 // Run through the __kmp_threads array and set the num threads icv for each
7421 // root thread that is currently registered with the RTL (which has not
7422 // already explicitly set its nthreads-var with a call to
7423 // omp_set_num_threads()).
7424 for (i = 0; i < __kmp_threads_capacity; i++) {
7425 kmp_info_t *thread = __kmp_threads[i];
7426 if (thread == NULL)
7427 continue;
7428 if (thread->th.th_current_task->td_icvs.nproc != 0)
7429 continue;
7430
7431 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7432 }
7433 }
7434 KA_TRACE(
7435 20,
7436 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7437 __kmp_dflt_team_nth));
7438
7439#ifdef KMP_ADJUST_BLOCKTIME
7440 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7441 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7442 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7443 if (__kmp_nth > __kmp_avail_proc) {
7444 __kmp_zero_bt = TRUE;
7445 }
7446 }
7447#endif /* KMP_ADJUST_BLOCKTIME */
7448
7449 /* we have finished middle initialization */
7450 TCW_SYNC_4(__kmp_init_middle, TRUE);
7451
7452 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7453}
7454
7455void __kmp_middle_initialize(void) {
7456 if (__kmp_init_middle) {
7457 return;
7458 }
7459 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7460 if (__kmp_init_middle) {
7461 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7462 return;
7463 }
7464 __kmp_do_middle_initialize();
7465 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7466}
7467
7468void __kmp_parallel_initialize(void) {
7469 int gtid = __kmp_entry_gtid(); // this might be a new root
7470
7471 /* synchronize parallel initialization (for sibling) */
7472 if (TCR_4(__kmp_init_parallel))
7473 return;
7474 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7475 if (TCR_4(__kmp_init_parallel)) {
7476 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7477 return;
7478 }
7479
7480 /* TODO reinitialization after we have already shut down */
7481 if (TCR_4(__kmp_global.g.g_done)) {
7482 KA_TRACE(
7483 10,
7484 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7485 __kmp_infinite_loop();
7486 }
7487
7488 /* jc: The lock __kmp_initz_lock is already held, so calling
7489 __kmp_serial_initialize would cause a deadlock. So we call
7490 __kmp_do_serial_initialize directly. */
7491 if (!__kmp_init_middle) {
7492 __kmp_do_middle_initialize();
7493 }
7494 __kmp_assign_root_init_mask();
7495 __kmp_resume_if_hard_paused();
7496
7497 /* begin initialization */
7498 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7499 KMP_ASSERT(KMP_UBER_GTID(gtid));
7500
7501#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7502 // Save the FP control regs.
7503 // Worker threads will set theirs to these values at thread startup.
7504 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7505 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7506 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7507#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7508
7509#if KMP_OS_UNIX
7510#if KMP_HANDLE_SIGNALS
7511 /* must be after __kmp_serial_initialize */
7512 __kmp_install_signals(TRUE);
7513#endif
7514#endif
7515
7516 __kmp_suspend_initialize();
7517
7518#if defined(USE_LOAD_BALANCE)
7519 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7520 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7521 }
7522#else
7523 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7524 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7525 }
7526#endif
7527
7528 if (__kmp_version) {
7529 __kmp_print_version_2();
7530 }
7531
7532 /* we have finished parallel initialization */
7533 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7534
7535 KMP_MB();
7536 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7537
7538 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7539}
7540
7541void __kmp_hidden_helper_initialize() {
7542 if (TCR_4(__kmp_init_hidden_helper))
7543 return;
7544
7545 // __kmp_parallel_initialize is required before we initialize hidden helper
7546 if (!TCR_4(__kmp_init_parallel))
7547 __kmp_parallel_initialize();
7548
7549 // Double check. Note that this double check should not be placed before
7550 // __kmp_parallel_initialize as it will cause a deadlock.
7551 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7552 if (TCR_4(__kmp_init_hidden_helper)) {
7553 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7554 return;
7555 }
7556
7557#if KMP_AFFINITY_SUPPORTED
7558 // Initialize hidden helper affinity settings.
7559 // The above __kmp_parallel_initialize() will initialize
7560 // regular affinity (and topology) if not already done.
7561 if (!__kmp_hh_affinity.flags.initialized)
7562 __kmp_affinity_initialize(__kmp_hh_affinity);
7563#endif
7564
7565 // Set the count of hidden helper tasks to be executed to zero
7566 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7567
7568 // Set the global variable indicating that we're initializing hidden helper
7569 // team/threads
7570 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7571
7572 // Platform independent initialization
7573 __kmp_do_initialize_hidden_helper_threads();
7574
7575 // Wait here for the finish of initialization of hidden helper teams
7576 __kmp_hidden_helper_threads_initz_wait();
7577
7578 // We have finished hidden helper initialization
7579 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7580
7581 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7582}
7583
7584/* ------------------------------------------------------------------------ */
7585
7586void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7587 kmp_team_t *team) {
7588 kmp_disp_t *dispatch;
7589
7590 KMP_MB();
7591
7592 /* none of the threads have encountered any constructs, yet. */
7593 this_thr->th.th_local.this_construct = 0;
7594#if KMP_CACHE_MANAGE
7595 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7596#endif /* KMP_CACHE_MANAGE */
7597 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7598 KMP_DEBUG_ASSERT(dispatch);
7599 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7600 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7601 // this_thr->th.th_info.ds.ds_tid ] );
7602
7603 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7604 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7605 if (__kmp_env_consistency_check)
7606 __kmp_push_parallel(gtid, team->t.t_ident);
7607
7608 KMP_MB(); /* Flush all pending memory write invalidates. */
7609}
7610
7611void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7612 kmp_team_t *team) {
7613 if (__kmp_env_consistency_check)
7614 __kmp_pop_parallel(gtid, team->t.t_ident);
7615
7616 __kmp_finish_implicit_task(this_thr);
7617}
7618
7619int __kmp_invoke_task_func(int gtid) {
7620 int rc;
7621 int tid = __kmp_tid_from_gtid(gtid);
7622 kmp_info_t *this_thr = __kmp_threads[gtid];
7623 kmp_team_t *team = this_thr->th.th_team;
7624
7625 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7626#if USE_ITT_BUILD
7627 if (__itt_stack_caller_create_ptr) {
7628 // inform ittnotify about entering user's code
7629 if (team->t.t_stack_id != NULL) {
7630 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7631 } else {
7632 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7633 __kmp_itt_stack_callee_enter(
7634 (__itt_caller)team->t.t_parent->t.t_stack_id);
7635 }
7636 }
7637#endif /* USE_ITT_BUILD */
7638#if INCLUDE_SSC_MARKS
7639 SSC_MARK_INVOKING();
7640#endif
7641
7642#if OMPT_SUPPORT
7643 void *dummy;
7644 void **exit_frame_p;
7645 ompt_data_t *my_task_data;
7646 ompt_data_t *my_parallel_data;
7647 int ompt_team_size;
7648
7649 if (ompt_enabled.enabled) {
7650 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7651 .ompt_task_info.frame.exit_frame.ptr);
7652 } else {
7653 exit_frame_p = &dummy;
7654 }
7655
7656 my_task_data =
7657 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7658 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7659 if (ompt_enabled.ompt_callback_implicit_task) {
7660 ompt_team_size = team->t.t_nproc;
7661 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7662 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7663 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7664 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7665 }
7666#endif
7667
7668#if KMP_STATS_ENABLED
7669 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7670 if (previous_state == stats_state_e::TEAMS_REGION) {
7671 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7672 } else {
7673 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7674 }
7675 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7676#endif
7677
7678 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7679 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7680#if OMPT_SUPPORT
7681 ,
7682 exit_frame_p
7683#endif
7684 );
7685#if OMPT_SUPPORT
7686 *exit_frame_p = NULL;
7687 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7688#endif
7689
7690#if KMP_STATS_ENABLED
7691 if (previous_state == stats_state_e::TEAMS_REGION) {
7692 KMP_SET_THREAD_STATE(previous_state);
7693 }
7694 KMP_POP_PARTITIONED_TIMER();
7695#endif
7696
7697#if USE_ITT_BUILD
7698 if (__itt_stack_caller_create_ptr) {
7699 // inform ittnotify about leaving user's code
7700 if (team->t.t_stack_id != NULL) {
7701 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7702 } else {
7703 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7704 __kmp_itt_stack_callee_leave(
7705 (__itt_caller)team->t.t_parent->t.t_stack_id);
7706 }
7707 }
7708#endif /* USE_ITT_BUILD */
7709 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7710
7711 return rc;
7712}
7713
7714void __kmp_teams_master(int gtid) {
7715 // This routine is called by all primary threads in teams construct
7716 kmp_info_t *thr = __kmp_threads[gtid];
7717 kmp_team_t *team = thr->th.th_team;
7718 ident_t *loc = team->t.t_ident;
7719 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7720 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7721 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7722 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7723 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7724
7725 // This thread is a new CG root. Set up the proper variables.
7726 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7727 tmp->cg_root = thr; // Make thr the CG root
7728 // Init to thread limit stored when league primary threads were forked
7729 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7730 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7731 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7732 " cg_nthreads to 1\n",
7733 thr, tmp));
7734 tmp->up = thr->th.th_cg_roots;
7735 thr->th.th_cg_roots = tmp;
7736
7737 // Launch the league of teams now, but do not let the workers execute
7738// (they hang on fork barrier until next parallel)
7739#if INCLUDE_SSC_MARKS
7740 SSC_MARK_FORKING();
7741#endif
7742 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7743 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7744 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7745#if INCLUDE_SSC_MARKS
7746 SSC_MARK_JOINING();
7747#endif
7748 // If the team size was reduced from the limit, set it to the new size
7749 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7750 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7751 // AC: last parameter "1" eliminates join barrier which won't work because
7752 // worker threads are in a fork barrier waiting for more parallel regions
7753 __kmp_join_call(loc, gtid
7754#if OMPT_SUPPORT
7755 ,
7756 fork_context_intel
7757#endif
7758 ,
7759 1);
7760}
7761
7762int __kmp_invoke_teams_master(int gtid) {
7763 kmp_info_t *this_thr = __kmp_threads[gtid];
7764 kmp_team_t *team = this_thr->th.th_team;
7765#if KMP_DEBUG
7766 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7767 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7768 (void *)__kmp_teams_master);
7769#endif
7770 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7771#if OMPT_SUPPORT
7772 int tid = __kmp_tid_from_gtid(gtid);
7773 ompt_data_t *task_data =
7774 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7775 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7776 if (ompt_enabled.ompt_callback_implicit_task) {
7777 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7778 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7779 ompt_task_initial);
7780 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7781 }
7782#endif
7783 __kmp_teams_master(gtid);
7784#if OMPT_SUPPORT
7785 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7786#endif
7787 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7788 return 1;
7789}
7790
7791/* this sets the requested number of threads for the next parallel region
7792 encountered by this team. Since this should be enclosed in the forkjoin
7793 critical section it should avoid race conditions with asymmetrical nested
7794 parallelism */
7795
7796void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7797 kmp_info_t *thr = __kmp_threads[gtid];
7798
7799 if (num_threads > 0)
7800 thr->th.th_set_nproc = num_threads;
7801}
7802
7803static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7804 int num_threads) {
7805 KMP_DEBUG_ASSERT(thr);
7806 // Remember the number of threads for inner parallel regions
7807 if (!TCR_4(__kmp_init_middle))
7808 __kmp_middle_initialize(); // get internal globals calculated
7809 __kmp_assign_root_init_mask();
7810 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7811 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7812
7813 if (num_threads == 0) {
7814 if (__kmp_teams_thread_limit > 0) {
7815 num_threads = __kmp_teams_thread_limit;
7816 } else {
7817 num_threads = __kmp_avail_proc / num_teams;
7818 }
7819     // adjust num_threads w/o warning as it is not a user setting
7820 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7821 // no thread_limit clause specified - do not change thread-limit-var ICV
7822 if (num_threads > __kmp_dflt_team_nth) {
7823 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7824 }
7825 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7826 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7827     } // prevent team size from exceeding thread-limit-var
7828 if (num_teams * num_threads > __kmp_teams_max_nth) {
7829 num_threads = __kmp_teams_max_nth / num_teams;
7830 }
7831 if (num_threads == 0) {
7832 num_threads = 1;
7833 }
7834 } else {
7835 if (num_threads < 0) {
7836 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7837 __kmp_msg_null);
7838 num_threads = 1;
7839 }
7840     // This thread will be the primary thread of the league's primary threads
7841 // Store new thread limit; old limit is saved in th_cg_roots list
7842 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7843 // num_threads = min(num_threads, nthreads-var)
7844 if (num_threads > __kmp_dflt_team_nth) {
7845 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7846 }
7847 if (num_teams * num_threads > __kmp_teams_max_nth) {
7848 int new_threads = __kmp_teams_max_nth / num_teams;
7849 if (new_threads == 0) {
7850 new_threads = 1;
7851 }
7852 if (new_threads != num_threads) {
7853 if (!__kmp_reserve_warn) { // user asked for too many threads
7854 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7855 __kmp_msg(kmp_ms_warning,
7856 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7857 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7858 }
7859 }
7860 num_threads = new_threads;
7861 }
7862 }
7863 thr->th.th_teams_size.nth = num_threads;
7864}
7865
7866/* this sets the requested number of teams for the teams region and/or
7867 the number of threads for the next parallel region encountered */
7868void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7869 int num_threads) {
7870 kmp_info_t *thr = __kmp_threads[gtid];
7871 if (num_teams < 0) {
7872 // OpenMP specification requires requested values to be positive,
7873 // but people can send us any value, so we'd better check
7874 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7875 __kmp_msg_null);
7876 num_teams = 1;
7877 }
7878 if (num_teams == 0) {
7879 if (__kmp_nteams > 0) {
7880 num_teams = __kmp_nteams;
7881 } else {
7882 num_teams = 1; // default number of teams is 1.
7883 }
7884 }
7885 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7886 if (!__kmp_reserve_warn) {
7887 __kmp_reserve_warn = 1;
7888 __kmp_msg(kmp_ms_warning,
7889 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7890 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7891 }
7892 num_teams = __kmp_teams_max_nth;
7893 }
7894 // Set number of teams (number of threads in the outer "parallel" of the
7895 // teams)
7896 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7897
7898 __kmp_push_thread_limit(thr, num_teams, num_threads);
7899}
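// Illustrative sketch (not part of the runtime, never compiled): a teams
// construct whose clauses are typically lowered into a __kmpc_push_num_teams()
// call that forwards to __kmp_push_num_teams() above; num_teams sizes the outer
// "parallel" of league primary threads, and thread_limit is handled by
// __kmp_push_thread_limit().
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp teams num_teams(4) thread_limit(8)
  printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
  return 0;
}
#endif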
7900
7901/* This sets the requested number of teams for the teams region and/or
7902 the number of threads for the next parallel region encountered */
7903void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7904 int num_teams_ub, int num_threads) {
7905 kmp_info_t *thr = __kmp_threads[gtid];
7906 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7907 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7908 KMP_DEBUG_ASSERT(num_threads >= 0);
7909
7910 if (num_teams_lb > num_teams_ub) {
7911 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7912 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7913 }
7914
7915   int num_teams = 1; // default number of teams is 1.
7916
7917 if (num_teams_lb == 0 && num_teams_ub > 0)
7918 num_teams_lb = num_teams_ub;
7919
7920 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7921 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7922 if (num_teams > __kmp_teams_max_nth) {
7923 if (!__kmp_reserve_warn) {
7924 __kmp_reserve_warn = 1;
7925 __kmp_msg(kmp_ms_warning,
7926 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7927 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7928 }
7929 num_teams = __kmp_teams_max_nth;
7930 }
7931 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7932 num_teams = num_teams_ub;
7933 } else { // num_teams_lb <= num_teams <= num_teams_ub
7934 if (num_threads <= 0) {
7935 if (num_teams_ub > __kmp_teams_max_nth) {
7936 num_teams = num_teams_lb;
7937 } else {
7938 num_teams = num_teams_ub;
7939 }
7940 } else {
7941 num_teams = (num_threads > __kmp_teams_max_nth)
7942 ? num_teams
7943 : __kmp_teams_max_nth / num_threads;
7944 if (num_teams < num_teams_lb) {
7945 num_teams = num_teams_lb;
7946 } else if (num_teams > num_teams_ub) {
7947 num_teams = num_teams_ub;
7948 }
7949 }
7950 }
7951 // Set number of teams (number of threads in the outer "parallel" of the
7952 // teams)
7953 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7954
7955 __kmp_push_thread_limit(thr, num_teams, num_threads);
7956}
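// Illustrative sketch (not part of the runtime, never compiled): the bounded
// selection done above for the case num_teams_lb < num_teams_ub, restated as a
// standalone helper with hypothetical names. Worked example: lb=2, ub=100,
// nthreads=8, max_nth=256 gives 256/8 = 32 teams, which already lies in [2,100].
#if 0
static int example_pick_num_teams(int lb, int ub, int nthreads, int max_nth) {
  int num_teams = 1; // same default as above
  if (nthreads <= 0) {
    num_teams = (ub > max_nth) ? lb : ub;
  } else {
    num_teams = (nthreads > max_nth) ? num_teams : max_nth / nthreads;
    if (num_teams < lb)
      num_teams = lb;
    else if (num_teams > ub)
      num_teams = ub;
  }
  return num_teams;
}
#endif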
7957
7958// Set the proc_bind var to use in the following parallel region.
7959void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7960 kmp_info_t *thr = __kmp_threads[gtid];
7961 thr->th.th_set_proc_bind = proc_bind;
7962}
7963
7964/* Launch the worker threads into the microtask. */
7965
7966void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7967 kmp_info_t *this_thr = __kmp_threads[gtid];
7968
7969#ifdef KMP_DEBUG
7970 int f;
7971#endif /* KMP_DEBUG */
7972
7973 KMP_DEBUG_ASSERT(team);
7974 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7975 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7976 KMP_MB(); /* Flush all pending memory write invalidates. */
7977
7978 team->t.t_construct = 0; /* no single directives seen yet */
7979 team->t.t_ordered.dt.t_value =
7980 0; /* thread 0 enters the ordered section first */
7981
7982 /* Reset the identifiers on the dispatch buffer */
7983 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7984 if (team->t.t_max_nproc > 1) {
7985 int i;
7986 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7987 team->t.t_disp_buffer[i].buffer_index = i;
7988 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7989 }
7990 } else {
7991 team->t.t_disp_buffer[0].buffer_index = 0;
7992 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7993 }
7994
7995 KMP_MB(); /* Flush all pending memory write invalidates. */
7996 KMP_ASSERT(this_thr->th.th_team == team);
7997
7998#ifdef KMP_DEBUG
7999 for (f = 0; f < team->t.t_nproc; f++) {
8000 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8001 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8002 }
8003#endif /* KMP_DEBUG */
8004
8005 /* release the worker threads so they may begin working */
8006 __kmp_fork_barrier(gtid, 0);
8007}
8008
8009void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8010 kmp_info_t *this_thr = __kmp_threads[gtid];
8011
8012 KMP_DEBUG_ASSERT(team);
8013 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8014 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8015 KMP_MB(); /* Flush all pending memory write invalidates. */
8016
8017 /* Join barrier after fork */
8018
8019#ifdef KMP_DEBUG
8020 if (__kmp_threads[gtid] &&
8021 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8022 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8023 __kmp_threads[gtid]);
8024 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8025 "team->t.t_nproc=%d\n",
8026 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8027 team->t.t_nproc);
8028 __kmp_print_structure();
8029 }
8030 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8031 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8032#endif /* KMP_DEBUG */
8033
8034 __kmp_join_barrier(gtid); /* wait for everyone */
8035#if OMPT_SUPPORT
8036 if (ompt_enabled.enabled &&
8037 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8038 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8039 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8040 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8041#if OMPT_OPTIONAL
8042 void *codeptr = NULL;
8043 if (KMP_MASTER_TID(ds_tid) &&
8044 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8045 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8046 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8047
8048 if (ompt_enabled.ompt_callback_sync_region_wait) {
8049 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8050 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8051 codeptr);
8052 }
8053 if (ompt_enabled.ompt_callback_sync_region) {
8054 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8055 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8056 codeptr);
8057 }
8058#endif
8059 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8060 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8061 ompt_scope_end, NULL, task_data, 0, ds_tid,
8062 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8063 }
8064 }
8065#endif
8066
8067 KMP_MB(); /* Flush all pending memory write invalidates. */
8068 KMP_ASSERT(this_thr->th.th_team == team);
8069}
8070
8071/* ------------------------------------------------------------------------ */
8072
8073#ifdef USE_LOAD_BALANCE
8074
8075// Return the number of worker threads actively spinning in the hot team,
8076// if we are at the outermost level of parallelism. Otherwise, return 0.
8077static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8078 int i;
8079 int retval;
8080 kmp_team_t *hot_team;
8081
8082 if (root->r.r_active) {
8083 return 0;
8084 }
8085 hot_team = root->r.r_hot_team;
8086 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8087 return hot_team->t.t_nproc - 1; // Don't count primary thread
8088 }
8089
8090 // Skip the primary thread - it is accounted for elsewhere.
8091 retval = 0;
8092 for (i = 1; i < hot_team->t.t_nproc; i++) {
8093 if (hot_team->t.t_threads[i]->th.th_active) {
8094 retval++;
8095 }
8096 }
8097 return retval;
8098}
8099
8100// Perform an automatic adjustment to the number of
8101// threads used by the next parallel region.
8102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8103 int retval;
8104 int pool_active;
8105 int hot_team_active;
8106 int team_curr_active;
8107 int system_active;
8108
8109 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8110 set_nproc));
8111 KMP_DEBUG_ASSERT(root);
8112 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8113 ->th.th_current_task->td_icvs.dynamic == TRUE);
8114 KMP_DEBUG_ASSERT(set_nproc > 1);
8115
8116 if (set_nproc == 1) {
8117 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8118 return 1;
8119 }
8120
8121 // Threads that are active in the thread pool, active in the hot team for this
8122 // particular root (if we are at the outer par level), and the currently
8123 // executing thread (to become the primary thread) are available to add to the
8124 // new team, but are currently contributing to the system load, and must be
8125 // accounted for.
8126 pool_active = __kmp_thread_pool_active_nth;
8127 hot_team_active = __kmp_active_hot_team_nproc(root);
8128 team_curr_active = pool_active + hot_team_active + 1;
8129
8130 // Check the system load.
8131 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8132 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8133 "hot team active = %d\n",
8134 system_active, pool_active, hot_team_active));
8135
8136 if (system_active < 0) {
8137 // There was an error reading the necessary info from /proc, so use the
8138 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8139 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8140 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8141 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8142
8143 // Make this call behave like the thread limit algorithm.
8144 retval = __kmp_avail_proc - __kmp_nth +
8145 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8146 if (retval > set_nproc) {
8147 retval = set_nproc;
8148 }
8149 if (retval < KMP_MIN_NTH) {
8150 retval = KMP_MIN_NTH;
8151 }
8152
8153 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8154 retval));
8155 return retval;
8156 }
8157
8158   // There is a slight delay in the load balance algorithm in detecting new
8159   // running procs. The real system load at this instant should be at least as
8160   // large as the number of active OpenMP threads available to add to the team.
8161 if (system_active < team_curr_active) {
8162 system_active = team_curr_active;
8163 }
8164 retval = __kmp_avail_proc - system_active + team_curr_active;
8165 if (retval > set_nproc) {
8166 retval = set_nproc;
8167 }
8168 if (retval < KMP_MIN_NTH) {
8169 retval = KMP_MIN_NTH;
8170 }
8171
8172 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8173 return retval;
8174} // __kmp_load_balance_nproc()
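// Worked example of the formula above (numbers are illustrative only): with
// __kmp_avail_proc = 16, pool_active = 2 and hot_team_active = 3, the value
// team_curr_active is 2 + 3 + 1 = 6. If __kmp_get_load_balance() reports
// system_active = 10, the suggested size is 16 - 10 + 6 = 12, which is then
// clamped to the range [KMP_MIN_NTH, set_nproc].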
8175
8176#endif /* USE_LOAD_BALANCE */
8177
8178/* ------------------------------------------------------------------------ */
8179
8180/* NOTE: this is called with the __kmp_init_lock held */
8181void __kmp_cleanup(void) {
8182 int f;
8183
8184 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8185
8186 if (TCR_4(__kmp_init_parallel)) {
8187#if KMP_HANDLE_SIGNALS
8188 __kmp_remove_signals();
8189#endif
8190 TCW_4(__kmp_init_parallel, FALSE);
8191 }
8192
8193 if (TCR_4(__kmp_init_middle)) {
8194#if KMP_AFFINITY_SUPPORTED
8195 __kmp_affinity_uninitialize();
8196#endif /* KMP_AFFINITY_SUPPORTED */
8197 __kmp_cleanup_hierarchy();
8198 TCW_4(__kmp_init_middle, FALSE);
8199 }
8200
8201 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8202
8203 if (__kmp_init_serial) {
8204 __kmp_runtime_destroy();
8205 __kmp_init_serial = FALSE;
8206 }
8207
8208 __kmp_cleanup_threadprivate_caches();
8209
8210 for (f = 0; f < __kmp_threads_capacity; f++) {
8211 if (__kmp_root[f] != NULL) {
8212 __kmp_free(__kmp_root[f]);
8213 __kmp_root[f] = NULL;
8214 }
8215 }
8216 __kmp_free(__kmp_threads);
8217   // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8218   // there is no need to free __kmp_root separately.
8219 __kmp_threads = NULL;
8220 __kmp_root = NULL;
8221 __kmp_threads_capacity = 0;
8222
8223 // Free old __kmp_threads arrays if they exist.
8224 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8225 while (ptr) {
8226 kmp_old_threads_list_t *next = ptr->next;
8227 __kmp_free(ptr->threads);
8228 __kmp_free(ptr);
8229 ptr = next;
8230 }
8231
8232#if KMP_USE_DYNAMIC_LOCK
8233 __kmp_cleanup_indirect_user_locks();
8234#else
8235 __kmp_cleanup_user_locks();
8236#endif
8237#if OMPD_SUPPORT
8238 if (ompd_state) {
8239 __kmp_free(ompd_env_block);
8240 ompd_env_block = NULL;
8241 ompd_env_block_size = 0;
8242 }
8243#endif
8244
8245#if KMP_AFFINITY_SUPPORTED
8246 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8247 __kmp_cpuinfo_file = NULL;
8248#endif /* KMP_AFFINITY_SUPPORTED */
8249
8250#if KMP_USE_ADAPTIVE_LOCKS
8251#if KMP_DEBUG_ADAPTIVE_LOCKS
8252 __kmp_print_speculative_stats();
8253#endif
8254#endif
8255 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8256 __kmp_nested_nth.nth = NULL;
8257 __kmp_nested_nth.size = 0;
8258 __kmp_nested_nth.used = 0;
8259 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8260 __kmp_nested_proc_bind.bind_types = NULL;
8261 __kmp_nested_proc_bind.size = 0;
8262 __kmp_nested_proc_bind.used = 0;
8263 if (__kmp_affinity_format) {
8264 KMP_INTERNAL_FREE(__kmp_affinity_format);
8265 __kmp_affinity_format = NULL;
8266 }
8267
8268 __kmp_i18n_catclose();
8269
8270#if KMP_USE_HIER_SCHED
8271 __kmp_hier_scheds.deallocate();
8272#endif
8273
8274#if KMP_STATS_ENABLED
8275 __kmp_stats_fini();
8276#endif
8277
8278 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8279}
8280
8281/* ------------------------------------------------------------------------ */
8282
8283int __kmp_ignore_mppbeg(void) {
8284 char *env;
8285
8286 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8287 if (__kmp_str_match_false(env))
8288 return FALSE;
8289 }
8290   // By default, __kmpc_begin() is a no-op.
8291 return TRUE;
8292}
8293
8294int __kmp_ignore_mppend(void) {
8295 char *env;
8296
8297 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8298 if (__kmp_str_match_false(env))
8299 return FALSE;
8300 }
8301   // By default, __kmpc_end() is a no-op.
8302 return TRUE;
8303}
8304
8305void __kmp_internal_begin(void) {
8306 int gtid;
8307 kmp_root_t *root;
8308
8309 /* this is a very important step as it will register new sibling threads
8310 and assign these new uber threads a new gtid */
8311 gtid = __kmp_entry_gtid();
8312 root = __kmp_threads[gtid]->th.th_root;
8313 KMP_ASSERT(KMP_UBER_GTID(gtid));
8314
8315 if (root->r.r_begin)
8316 return;
8317 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8318 if (root->r.r_begin) {
8319 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8320 return;
8321 }
8322
8323 root->r.r_begin = TRUE;
8324
8325 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8326}
8327
8328/* ------------------------------------------------------------------------ */
8329
8330void __kmp_user_set_library(enum library_type arg) {
8331 int gtid;
8332 kmp_root_t *root;
8333 kmp_info_t *thread;
8334
8335 /* first, make sure we are initialized so we can get our gtid */
8336
8337 gtid = __kmp_entry_gtid();
8338 thread = __kmp_threads[gtid];
8339
8340 root = thread->th.th_root;
8341
8342 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8343 library_serial));
8344 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8345 thread */
8346 KMP_WARNING(SetLibraryIncorrectCall);
8347 return;
8348 }
8349
8350 switch (arg) {
8351 case library_serial:
8352 thread->th.th_set_nproc = 0;
8353 set__nproc(thread, 1);
8354 break;
8355 case library_turnaround:
8356 thread->th.th_set_nproc = 0;
8357 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8358 : __kmp_dflt_team_nth_ub);
8359 break;
8360 case library_throughput:
8361 thread->th.th_set_nproc = 0;
8362 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8363 : __kmp_dflt_team_nth_ub);
8364 break;
8365 default:
8366 KMP_FATAL(UnknownLibraryType, arg);
8367 }
8368
8369 __kmp_aux_set_library(arg);
8370}
8371
8372void __kmp_aux_set_stacksize(size_t arg) {
8373 if (!__kmp_init_serial)
8374 __kmp_serial_initialize();
8375
8376#if KMP_OS_DARWIN
8377 if (arg & (0x1000 - 1)) {
8378 arg &= ~(0x1000 - 1);
8379 if (arg + 0x1000) /* check for overflow if we round up */
8380 arg += 0x1000;
8381 }
8382#endif
8383 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8384
8385 /* only change the default stacksize before the first parallel region */
8386 if (!TCR_4(__kmp_init_parallel)) {
8387 size_t value = arg; /* argument is in bytes */
8388
8389 if (value < __kmp_sys_min_stksize)
8390 value = __kmp_sys_min_stksize;
8391 else if (value > KMP_MAX_STKSIZE)
8392 value = KMP_MAX_STKSIZE;
8393
8394 __kmp_stksize = value;
8395
8396 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8397 }
8398
8399 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8400}
8401
8402/* set the behaviour of the runtime library */
8403/* TODO this can cause some odd behaviour with sibling parallelism... */
8404void __kmp_aux_set_library(enum library_type arg) {
8405 __kmp_library = arg;
8406
8407 switch (__kmp_library) {
8408 case library_serial: {
8409 KMP_INFORM(LibraryIsSerial);
8410 } break;
8411 case library_turnaround:
8412 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8413 __kmp_use_yield = 2; // only yield when oversubscribed
8414 break;
8415 case library_throughput:
8416 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8417 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8418 break;
8419 default:
8420 KMP_FATAL(UnknownLibraryType, arg);
8421 }
8422}
8423
8424/* Getting team information common for all team API */
8425// Returns NULL if not in teams construct
8426static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8427 kmp_info_t *thr = __kmp_entry_thread();
8428 teams_serialized = 0;
8429 if (thr->th.th_teams_microtask) {
8430 kmp_team_t *team = thr->th.th_team;
8431 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8432 int ii = team->t.t_level;
8433 teams_serialized = team->t.t_serialized;
8434 int level = tlevel + 1;
8435 KMP_DEBUG_ASSERT(ii >= tlevel);
8436 while (ii > level) {
8437 for (teams_serialized = team->t.t_serialized;
8438 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8439 }
8440 if (team->t.t_serialized && (!teams_serialized)) {
8441 team = team->t.t_parent;
8442 continue;
8443 }
8444 if (ii > level) {
8445 team = team->t.t_parent;
8446 ii--;
8447 }
8448 }
8449 return team;
8450 }
8451 return NULL;
8452}
8453
8454int __kmp_aux_get_team_num() {
8455 int serialized;
8456 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8457 if (team) {
8458 if (serialized > 1) {
8459 return 0; // teams region is serialized ( 1 team of 1 thread ).
8460 } else {
8461 return team->t.t_master_tid;
8462 }
8463 }
8464 return 0;
8465}
8466
8467int __kmp_aux_get_num_teams() {
8468 int serialized;
8469 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8470 if (team) {
8471 if (serialized > 1) {
8472 return 1;
8473 } else {
8474 return team->t.t_parent->t.t_nproc;
8475 }
8476 }
8477 return 1;
8478}
8479
8480/* ------------------------------------------------------------------------ */
8481
8482/*
8483 * Affinity Format Parser
8484 *
8485 * Field is in form of: %[[[0].]size]type
8486 * % and type are required (%% means print a literal '%')
8487 * type is either single char or long name surrounded by {},
8488 * e.g., N or {num_threads}
8489 * 0 => leading zeros
8490 * . => right justified when size is specified
8491 * by default output is left justified
8492 * size is the *minimum* field length
8493 * All other characters are printed as is
8494 *
8495 * Available field types:
8496 * L {nesting_level} - omp_get_level()
8497 * n {thread_num} - omp_get_thread_num()
8498 * H {host} - name of host machine
8499 * P {process_id} - process id (integer)
8500 * i {native_thread_id} - native thread identifier (integer)
8501 * N {num_threads} - omp_get_num_threads()
8502 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8503 * A {thread_affinity} - comma separated list of integers or integer ranges
8504 * (values of affinity mask)
8505 *
8506 * Implementation-specific field types can be added
8507 * If a type is unknown, print "undefined"
8508 */
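// Illustrative sketch (not part of the runtime, never compiled): user-level use
// of the format grammar described above through the OpenMP 5.0 affinity display
// API; the exact output depends on the host name and the affinity mask. For
// example, "host=%H tid=%0.3n affinity=%A" might print
// "host=node17 tid=002 affinity=0-3".
#if 0
#include <omp.h>

int main(void) {
#pragma omp parallel
  omp_display_affinity("host=%H tid=%0.3n affinity=%{thread_affinity}");
  return 0;
}
#endif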
8509
8510// Structure holding the short name, long name, and corresponding data type
8511// for snprintf. A table of these will represent the entire valid keyword
8512// field types.
8513typedef struct kmp_affinity_format_field_t {
8514 char short_name; // from spec e.g., L -> thread level
8515 const char *long_name; // from spec thread_level -> thread level
8516 char field_format; // data type for snprintf (typically 'd' or 's'
8517 // for integer or string)
8518} kmp_affinity_format_field_t;
8519
8520static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8521#if KMP_AFFINITY_SUPPORTED
8522 {'A', "thread_affinity", 's'},
8523#endif
8524 {'t', "team_num", 'd'},
8525 {'T', "num_teams", 'd'},
8526 {'L', "nesting_level", 'd'},
8527 {'n', "thread_num", 'd'},
8528 {'N', "num_threads", 'd'},
8529 {'a', "ancestor_tnum", 'd'},
8530 {'H', "host", 's'},
8531 {'P', "process_id", 'd'},
8532 {'i', "native_thread_id", 'd'}};
8533
8534// Return the number of characters it takes to hold the field
8535static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8536 const char **ptr,
8537 kmp_str_buf_t *field_buffer) {
8538 int rc, format_index, field_value;
8539 const char *width_left, *width_right;
8540 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8541 static const int FORMAT_SIZE = 20;
8542 char format[FORMAT_SIZE] = {0};
8543 char absolute_short_name = 0;
8544
8545 KMP_DEBUG_ASSERT(gtid >= 0);
8546 KMP_DEBUG_ASSERT(th);
8547 KMP_DEBUG_ASSERT(**ptr == '%');
8548 KMP_DEBUG_ASSERT(field_buffer);
8549
8550 __kmp_str_buf_clear(field_buffer);
8551
8552 // Skip the initial %
8553 (*ptr)++;
8554
8555 // Check for %% first
8556 if (**ptr == '%') {
8557 __kmp_str_buf_cat(field_buffer, "%", 1);
8558 (*ptr)++; // skip over the second %
8559 return 1;
8560 }
8561
8562 // Parse field modifiers if they are present
8563 pad_zeros = false;
8564 if (**ptr == '0') {
8565 pad_zeros = true;
8566 (*ptr)++; // skip over 0
8567 }
8568 right_justify = false;
8569 if (**ptr == '.') {
8570 right_justify = true;
8571 (*ptr)++; // skip over .
8572 }
8573 // Parse width of field: [width_left, width_right)
8574 width_left = width_right = NULL;
8575 if (**ptr >= '0' && **ptr <= '9') {
8576 width_left = *ptr;
8577 SKIP_DIGITS(*ptr);
8578 width_right = *ptr;
8579 }
8580
8581 // Create the format for KMP_SNPRINTF based on flags parsed above
8582 format_index = 0;
8583 format[format_index++] = '%';
8584 if (!right_justify)
8585 format[format_index++] = '-';
8586 if (pad_zeros)
8587 format[format_index++] = '0';
8588 if (width_left && width_right) {
8589 int i = 0;
8590 // Only allow 8 digit number widths.
8591 // This also prevents overflowing format variable
8592 while (i < 8 && width_left < width_right) {
8593 format[format_index++] = *width_left;
8594 width_left++;
8595 i++;
8596 }
8597 }
8598
8599 // Parse a name (long or short)
8600 // Canonicalize the name into absolute_short_name
8601 found_valid_name = false;
8602 parse_long_name = (**ptr == '{');
8603 if (parse_long_name)
8604 (*ptr)++; // skip initial left brace
8605 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8606 sizeof(__kmp_affinity_format_table[0]);
8607 ++i) {
8608 char short_name = __kmp_affinity_format_table[i].short_name;
8609 const char *long_name = __kmp_affinity_format_table[i].long_name;
8610 char field_format = __kmp_affinity_format_table[i].field_format;
8611 if (parse_long_name) {
8612 size_t length = KMP_STRLEN(long_name);
8613 if (strncmp(*ptr, long_name, length) == 0) {
8614 found_valid_name = true;
8615 (*ptr) += length; // skip the long name
8616 }
8617 } else if (**ptr == short_name) {
8618 found_valid_name = true;
8619 (*ptr)++; // skip the short name
8620 }
8621 if (found_valid_name) {
8622 format[format_index++] = field_format;
8623 format[format_index++] = '\0';
8624 absolute_short_name = short_name;
8625 break;
8626 }
8627 }
8628 if (parse_long_name) {
8629 if (**ptr != '}') {
8630 absolute_short_name = 0;
8631 } else {
8632 (*ptr)++; // skip over the right brace
8633 }
8634 }
8635
8636 // Attempt to fill the buffer with the requested
8637 // value using snprintf within __kmp_str_buf_print()
8638 switch (absolute_short_name) {
8639 case 't':
8640 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8641 break;
8642 case 'T':
8643 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8644 break;
8645 case 'L':
8646 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8647 break;
8648 case 'n':
8649 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8650 break;
8651 case 'H': {
8652 static const int BUFFER_SIZE = 256;
8653 char buf[BUFFER_SIZE];
8654 __kmp_expand_host_name(buf, BUFFER_SIZE);
8655 rc = __kmp_str_buf_print(field_buffer, format, buf);
8656 } break;
8657 case 'P':
8658 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8659 break;
8660 case 'i':
8661 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8662 break;
8663 case 'N':
8664 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8665 break;
8666 case 'a':
8667 field_value =
8668 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8669 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8670 break;
8671#if KMP_AFFINITY_SUPPORTED
8672 case 'A': {
8673 kmp_str_buf_t buf;
8674 __kmp_str_buf_init(&buf);
8675 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8676 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8677 __kmp_str_buf_free(&buf);
8678 } break;
8679#endif
8680 default:
8681     // According to the spec, if an implementation does not have info for a
8682     // field type, then "undefined" is printed
8683 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8684 // Skip the field
8685 if (parse_long_name) {
8686 SKIP_TOKEN(*ptr);
8687 if (**ptr == '}')
8688 (*ptr)++;
8689 } else {
8690 (*ptr)++;
8691 }
8692 }
8693
8694 KMP_ASSERT(format_index <= FORMAT_SIZE);
8695 return rc;
8696}
8697
8698/*
8699 * Return the number of characters needed to hold the affinity string
8700 * (not including the terminating null byte).
8701 * The resultant string is printed to buffer, which the caller can then
8702 * handle afterwards.
8703 */
8704size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8705 kmp_str_buf_t *buffer) {
8706 const char *parse_ptr;
8707 size_t retval;
8708 const kmp_info_t *th;
8709 kmp_str_buf_t field;
8710
8711 KMP_DEBUG_ASSERT(buffer);
8712 KMP_DEBUG_ASSERT(gtid >= 0);
8713
8714 __kmp_str_buf_init(&field);
8715 __kmp_str_buf_clear(buffer);
8716
8717 th = __kmp_threads[gtid];
8718 retval = 0;
8719
8720   // If format is NULL or a zero-length string, then we use the
8721   // affinity-format-var ICV
8722 parse_ptr = format;
8723 if (parse_ptr == NULL || *parse_ptr == '\0') {
8724 parse_ptr = __kmp_affinity_format;
8725 }
8726 KMP_DEBUG_ASSERT(parse_ptr);
8727
8728 while (*parse_ptr != '\0') {
8729 // Parse a field
8730 if (*parse_ptr == '%') {
8731 // Put field in the buffer
8732 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8733 __kmp_str_buf_catbuf(buffer, &field);
8734 retval += rc;
8735 } else {
8736 // Put literal character in buffer
8737 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8738 retval++;
8739 parse_ptr++;
8740 }
8741 }
8742 __kmp_str_buf_free(&field);
8743 return retval;
8744}
8745
8746// Displays the affinity string to stdout
8747void __kmp_aux_display_affinity(int gtid, const char *format) {
8748 kmp_str_buf_t buf;
8749 __kmp_str_buf_init(&buf);
8750 __kmp_aux_capture_affinity(gtid, format, &buf);
8751 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8752 __kmp_str_buf_free(&buf);
8753}
8754
8755/* ------------------------------------------------------------------------ */
8756
8757void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8758 int blocktime = arg; /* argument is in milliseconds */
8759#if KMP_USE_MONITOR
8760 int bt_intervals;
8761#endif
8762 kmp_int8 bt_set;
8763
8764 __kmp_save_internal_controls(thread);
8765
8766 /* Normalize and set blocktime for the teams */
8767 if (blocktime < KMP_MIN_BLOCKTIME)
8768 blocktime = KMP_MIN_BLOCKTIME;
8769 else if (blocktime > KMP_MAX_BLOCKTIME)
8770 blocktime = KMP_MAX_BLOCKTIME;
8771
8772 set__blocktime_team(thread->th.th_team, tid, blocktime);
8773 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8774
8775#if KMP_USE_MONITOR
8776 /* Calculate and set blocktime intervals for the teams */
8777 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8778
8779 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8780 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8781#endif
8782
8783 /* Set whether blocktime has been set to "TRUE" */
8784 bt_set = TRUE;
8785
8786 set__bt_set_team(thread->th.th_team, tid, bt_set);
8787 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8788#if KMP_USE_MONITOR
8789 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8790 "bt_intervals=%d, monitor_updates=%d\n",
8791 __kmp_gtid_from_tid(tid, thread->th.th_team),
8792 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8793 __kmp_monitor_wakeups));
8794#else
8795 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8796 __kmp_gtid_from_tid(tid, thread->th.th_team),
8797 thread->th.th_team->t.t_id, tid, blocktime));
8798#endif
8799}
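// Illustrative sketch (not part of the runtime, never compiled): adjusting the
// block time from user code through the kmp_set_blocktime() extension that the
// runtime's omp.h declares; the KMP_BLOCKTIME environment variable is the usual
// way to set the same value globally.
#if 0
#include <omp.h>

int main(void) {
  kmp_set_blocktime(0); // idle threads sleep right away instead of spin-waiting
#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif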
8800
8801void __kmp_aux_set_defaults(char const *str, size_t len) {
8802 if (!__kmp_init_serial) {
8803 __kmp_serial_initialize();
8804 }
8805 __kmp_env_initialize(str);
8806
8807 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8808 __kmp_env_print();
8809 }
8810} // __kmp_aux_set_defaults
8811
8812/* ------------------------------------------------------------------------ */
8813/* internal fast reduction routines */
8814
8815PACKED_REDUCTION_METHOD_T
8816__kmp_determine_reduction_method(
8817 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8818 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8819 kmp_critical_name *lck) {
8820
8821 // Default reduction method: critical construct ( lck != NULL, like in current
8822 // PAROPT )
8823 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8824 // can be selected by RTL
8825 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8826 // can be selected by RTL
8827 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8828 // among generated by PAROPT.
8829
8830 PACKED_REDUCTION_METHOD_T retval;
8831
8832 int team_size;
8833
8834 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8835
8836#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8837 (loc && \
8838 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8839#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8840
8841 retval = critical_reduce_block;
8842
8843   // another choice of getting a team size (with 1 dynamic dereference) is slower
8844 team_size = __kmp_get_team_num_threads(global_tid);
8845 if (team_size == 1) {
8846
8847 retval = empty_reduce_block;
8848
8849 } else {
8850
8851 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8852
8853#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8854 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8855
8856#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8857 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8858
8859 int teamsize_cutoff = 4;
8860
8861#if KMP_MIC_SUPPORTED
8862 if (__kmp_mic_type != non_mic) {
8863 teamsize_cutoff = 8;
8864 }
8865#endif
8866 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8867 if (tree_available) {
8868 if (team_size <= teamsize_cutoff) {
8869 if (atomic_available) {
8870 retval = atomic_reduce_block;
8871 }
8872 } else {
8873 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8874 }
8875 } else if (atomic_available) {
8876 retval = atomic_reduce_block;
8877 }
8878#else
8879#error "Unknown or unsupported OS"
8880#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8881 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8882
8883#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8884
8885#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8886
8887 // basic tuning
8888
8889 if (atomic_available) {
8890 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8891 retval = atomic_reduce_block;
8892 }
8893 } // otherwise: use critical section
8894
8895#elif KMP_OS_DARWIN
8896
8897 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8898 if (atomic_available && (num_vars <= 3)) {
8899 retval = atomic_reduce_block;
8900 } else if (tree_available) {
8901 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8902 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8903 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8904 }
8905 } // otherwise: use critical section
8906
8907#else
8908#error "Unknown or unsupported OS"
8909#endif
8910
8911#else
8912#error "Unknown or unsupported architecture"
8913#endif
8914 }
8915
8916 // KMP_FORCE_REDUCTION
8917
8918 // If the team is serialized (team_size == 1), ignore the forced reduction
8919 // method and stay with the unsynchronized method (empty_reduce_block)
8920 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8921 team_size != 1) {
8922
8923 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8924
8925 int atomic_available, tree_available;
8926
8927 switch ((forced_retval = __kmp_force_reduction_method)) {
8928 case critical_reduce_block:
8929 KMP_ASSERT(lck); // lck should be != 0
8930 break;
8931
8932 case atomic_reduce_block:
8933 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8934 if (!atomic_available) {
8935 KMP_WARNING(RedMethodNotSupported, "atomic");
8936 forced_retval = critical_reduce_block;
8937 }
8938 break;
8939
8940 case tree_reduce_block:
8941 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8942 if (!tree_available) {
8943 KMP_WARNING(RedMethodNotSupported, "tree");
8944 forced_retval = critical_reduce_block;
8945 } else {
8946#if KMP_FAST_REDUCTION_BARRIER
8947 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8948#endif
8949 }
8950 break;
8951
8952 default:
8953 KMP_ASSERT(0); // "unsupported method specified"
8954 }
8955
8956 retval = forced_retval;
8957 }
8958
8959 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8960
8961#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8962#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8963
8964 return (retval);
8965}
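// Illustrative sketch (not part of the runtime, never compiled): a reduction
// whose combining code the compiler typically lowers into
// __kmpc_reduce_nowait()/__kmpc_end_reduce_nowait(), which consult
// __kmp_determine_reduction_method() above to pick the critical, atomic, or
// tree scheme.
#if 0
#include <omp.h>

int main(void) {
  int sum = 0;
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000; ++i)
    sum += i;
  return sum == 499500 ? 0 : 1;
}
#endif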
8966// this function is for testing set/get/determine reduce method
8967kmp_int32 __kmp_get_reduce_method(void) {
8968 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8969}
8970
8971// Soft pause sets up threads to ignore blocktime and just go to sleep.
8972// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8973void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8974
8975// Hard pause shuts down the runtime completely. Resume happens naturally when
8976// OpenMP is used subsequently.
8977void __kmp_hard_pause() {
8978 __kmp_pause_status = kmp_hard_paused;
8979 __kmp_internal_end_thread(-1);
8980}
8981
8982// Soft resume sets __kmp_pause_status, and wakes up all threads.
8983void __kmp_resume_if_soft_paused() {
8984 if (__kmp_pause_status == kmp_soft_paused) {
8985 __kmp_pause_status = kmp_not_paused;
8986
8987 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8988 kmp_info_t *thread = __kmp_threads[gtid];
8989 if (thread) { // Wake it if sleeping
8990 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8991 thread);
8992 if (fl.is_sleeping())
8993 fl.resume(gtid);
8994 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8995 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8996 } else { // thread holds the lock and may sleep soon
8997 do { // until either the thread sleeps, or we can get the lock
8998 if (fl.is_sleeping()) {
8999 fl.resume(gtid);
9000 break;
9001 } else if (__kmp_try_suspend_mx(thread)) {
9002 __kmp_unlock_suspend_mx(thread);
9003 break;
9004 }
9005 } while (1);
9006 }
9007 }
9008 }
9009 }
9010}
9011
9012// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9013// TODO: add warning messages
9014int __kmp_pause_resource(kmp_pause_status_t level) {
9015 if (level == kmp_not_paused) { // requesting resume
9016 if (__kmp_pause_status == kmp_not_paused) {
9017 // error message about runtime not being paused, so can't resume
9018 return 1;
9019 } else {
9020 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9021 __kmp_pause_status == kmp_hard_paused);
9022 __kmp_pause_status = kmp_not_paused;
9023 return 0;
9024 }
9025 } else if (level == kmp_soft_paused) { // requesting soft pause
9026 if (__kmp_pause_status != kmp_not_paused) {
9027 // error message about already being paused
9028 return 1;
9029 } else {
9030 __kmp_soft_pause();
9031 return 0;
9032 }
9033 } else if (level == kmp_hard_paused) { // requesting hard pause
9034 if (__kmp_pause_status != kmp_not_paused) {
9035 // error message about already being paused
9036 return 1;
9037 } else {
9038 __kmp_hard_pause();
9039 return 0;
9040 }
9041 } else {
9042 // error message about invalid level
9043 return 1;
9044 }
9045}
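// Illustrative sketch (not part of the runtime, never compiled): requesting a
// soft pause through the OpenMP 5.0 API, which reaches __kmp_pause_resource()
// above via __kmpc_pause_resource(); a later parallel region resumes the
// soft-paused runtime on demand.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel
  { /* warm up the thread pool */ }

  if (omp_pause_resource_all(omp_pause_soft) != 0)
    printf("soft pause was rejected\n");

#pragma omp parallel
  { /* threads are woken up again as needed */ }
  return 0;
}
#endif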
9046
9047void __kmp_omp_display_env(int verbose) {
9048 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9049 if (__kmp_init_serial == 0)
9050 __kmp_do_serial_initialize();
9051 __kmp_display_env_impl(!verbose, verbose);
9052 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9053}
9054
9055// The team size is changing, so distributed barrier must be modified
9056void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9057 int new_nthreads) {
9058 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9059 bp_dist_bar);
9060 kmp_info_t **other_threads = team->t.t_threads;
9061
9062 // We want all the workers to stop waiting on the barrier while we adjust the
9063 // size of the team.
9064 for (int f = 1; f < old_nthreads; ++f) {
9065 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9066 // Ignore threads that are already inactive or not present in the team
9067 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9068 // teams construct causes thread_limit to get passed in, and some of
9069 // those could be inactive; just ignore them
9070 continue;
9071 }
9072 // If thread is transitioning still to in_use state, wait for it
9073 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9074 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9075 KMP_CPU_PAUSE();
9076 }
9077 // The thread should be in_use now
9078 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9079 // Transition to unused state
9080 team->t.t_threads[f]->th.th_used_in_team.store(2);
9081 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9082 }
9083 // Release all the workers
9084 team->t.b->go_release();
9085
9086 KMP_MFENCE();
9087
9088 // Workers should see transition status 2 and move to 0; but may need to be
9089 // woken up first
9090 int count = old_nthreads - 1;
9091 while (count > 0) {
9092 count = old_nthreads - 1;
9093 for (int f = 1; f < old_nthreads; ++f) {
9094 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9095 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9096 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9097 void *, other_threads[f]->th.th_sleep_loc);
9098 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9099 }
9100 } else {
9101 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9102 count--;
9103 }
9104 }
9105 }
9106 // Now update the barrier size
9107 team->t.b->update_num_threads(new_nthreads);
9108 team->t.b->go_reset();
9109}
9110
9111void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9112 // Add the threads back to the team
9113 KMP_DEBUG_ASSERT(team);
9114 // Threads were paused and pointed at th_used_in_team temporarily during a
9115 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9116 // the thread that it should transition itself back into the team. Then, if
9117 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9118 // to wake it up.
9119 for (int f = 1; f < new_nthreads; ++f) {
9120 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9121 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9122 3);
9123 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9124 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9125 (kmp_flag_32<false, false> *)NULL);
9126 }
9127 }
9128   // The threads should be transitioning to the team; when they are done, they
9129   // should have set th_used_in_team to 1. This loop forces the primary thread to
9130   // wait until all threads have moved into the team and are waiting in the barrier.
9131 int count = new_nthreads - 1;
9132 while (count > 0) {
9133 count = new_nthreads - 1;
9134 for (int f = 1; f < new_nthreads; ++f) {
9135 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9136 count--;
9137 }
9138 }
9139 }
9140}
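// Summary of the th_used_in_team states used by the two routines above, as
// implied by the loads and stores they perform: 0 = not part of the team,
// 1 = actively in the team, 2 = leaving the team during a resize (workers
// observe 2 and move themselves to 0), 3 = re-joining the team (workers
// observe 3, move back in, and set themselves to 1).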
9141
9142// Globals and functions for hidden helper task
9143kmp_info_t **__kmp_hidden_helper_threads;
9144kmp_info_t *__kmp_hidden_helper_main_thread;
9145std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9146#if KMP_OS_LINUX
9147kmp_int32 __kmp_hidden_helper_threads_num = 8;
9148kmp_int32 __kmp_enable_hidden_helper = TRUE;
9149#else
9150kmp_int32 __kmp_hidden_helper_threads_num = 0;
9151kmp_int32 __kmp_enable_hidden_helper = FALSE;
9152#endif
9153
9154namespace {
9155std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9156
9157void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9158   // This is an explicit synchronization across all hidden helper threads, in
9159   // case a regular thread pushes a hidden helper task to a hidden helper
9160   // thread that has not yet been awakened since the main thread released the
9161   // helpers after creating the team.
9162 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9163 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9164 __kmp_hidden_helper_threads_num)
9165 ;
9166
9167 // If main thread, then wait for signal
9168 if (__kmpc_master(nullptr, *gtid)) {
9169 // First, unset the initial state and release the initial thread
9170 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9171 __kmp_hidden_helper_initz_release();
9172 __kmp_hidden_helper_main_thread_wait();
9173 // Now wake up all worker threads
9174 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9175 __kmp_hidden_helper_worker_thread_signal();
9176 }
9177 }
9178}
9179} // namespace
9180
9181void __kmp_hidden_helper_threads_initz_routine() {
9182 // Create a new root for hidden helper team/threads
9183 const int gtid = __kmp_register_root(TRUE);
9184 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9185 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9186 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9187 __kmp_hidden_helper_threads_num;
9188
9189 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9190
9191 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9192
9193 // Set the initialization flag to FALSE
9194 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9195
9196 __kmp_hidden_helper_threads_deinitz_release();
9197}
9198
9199/* Nesting Mode:
9200 Set via KMP_NESTING_MODE, which takes an integer.
9201 Note: we skip duplicate topology levels, and skip levels with only
9202 one entity.
9203 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9204 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9205 in the topology, and initializes the number of threads at each of those
9206 levels to the number of entities at each level, respectively, below the
9207 entity at the parent level.
9208 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9209 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9210 the user to turn nesting on explicitly. This is an even more experimental
9211 option to this experimental feature, and may change or go away in the
9212 future.
9213*/
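// Illustrative sketch (not part of the runtime, never compiled): a nested
// parallel region that KMP_NESTING_MODE=1 would size from the machine topology
// as described above (run as, e.g., KMP_NESTING_MODE=1 ./a.out).
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel
  {
#pragma omp parallel
    printf("outer %d / inner %d\n", omp_get_ancestor_thread_num(1),
           omp_get_thread_num());
  }
  return 0;
}
#endif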
9214
9215// Allocate space to store nesting levels
9216void __kmp_init_nesting_mode() {
9217 int levels = KMP_HW_LAST;
9218 __kmp_nesting_mode_nlevels = levels;
9219 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9220 for (int i = 0; i < levels; ++i)
9221 __kmp_nesting_nth_level[i] = 0;
9222 if (__kmp_nested_nth.size < levels) {
9223 __kmp_nested_nth.nth =
9224 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9225 __kmp_nested_nth.size = levels;
9226 }
9227}
9228
9229// Set # threads for top levels of nesting; must be called after topology set
9230void __kmp_set_nesting_mode_threads() {
9231 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9232
9233 if (__kmp_nesting_mode == 1)
9234 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9235 else if (__kmp_nesting_mode > 1)
9236 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9237
9238 if (__kmp_topology) { // use topology info
9239 int loc, hw_level;
9240 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9241 loc < __kmp_nesting_mode_nlevels;
9242 loc++, hw_level++) {
9243 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9244 if (__kmp_nesting_nth_level[loc] == 1)
9245 loc--;
9246 }
9247 // Make sure all cores are used
9248 if (__kmp_nesting_mode > 1 && loc > 1) {
9249 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9250 int num_cores = __kmp_topology->get_count(core_level);
9251 int upper_levels = 1;
9252 for (int level = 0; level < loc - 1; ++level)
9253 upper_levels *= __kmp_nesting_nth_level[level];
9254 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9255 __kmp_nesting_nth_level[loc - 1] =
9256 num_cores / __kmp_nesting_nth_level[loc - 2];
9257 }
9258 __kmp_nesting_mode_nlevels = loc;
9259 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9260 } else { // no topology info available; provide a reasonable guesstimation
9261 if (__kmp_avail_proc >= 4) {
9262 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9263 __kmp_nesting_nth_level[1] = 2;
9264 __kmp_nesting_mode_nlevels = 2;
9265 } else {
9266 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9267 __kmp_nesting_mode_nlevels = 1;
9268 }
9269 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9270 }
9271 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9272 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9273 }
9274 set__nproc(thread, __kmp_nesting_nth_level[0]);
9275 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9276 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9277 if (get__max_active_levels(thread) > 1) {
9278 // if max levels was set, set nesting mode levels to same
9279 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9280 }
9281 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9282 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9283}
9284
9285// Empty symbols to export (see exports_so.txt) when feature is disabled
9286extern "C" {
9287#if !KMP_STATS_ENABLED
9288void __kmp_reset_stats() {}
9289#endif
9290#if !USE_DEBUGGER
9291int __kmp_omp_debug_struct_info = FALSE;
9292int __kmp_debugging = FALSE;
9293#endif
9294#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9295void __kmp_itt_fini_ittlib() {}
9296void __kmp_itt_init_ittlib() {}
9297#endif
9298}
9299
9300// end of file