LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51// windows does not need include files as it doesn't use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113
114/* Calculate the identifier of the current thread */
115/* fast (and somewhat portable) way to get unique identifier of executing
116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
124
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
129
130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133 __kmp_init_gtid for this to work. */
134
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
137
138#ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
142 }
143#endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
147 }
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
152
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
163
164 for (i = 0; i < __kmp_threads_capacity; i++) {
165
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
169
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172
173 /* stack grows down -- search through all of the active threads */
174
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
177
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182 return i;
183 }
184 }
185 }
186
187 /* get specific to try and determine our gtid */
188 KA_TRACE(1000,
189 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190 "thread, using TLS\n"));
191 i = __kmp_gtid_get_specific();
192
193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194
195 /* if we havn't been assigned a gtid, then return code */
196 if (i < 0)
197 return i;
198
199 /* dynamically updated stack window for uber threads to avoid get_specific
200 call */
201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202 KMP_FATAL(StackOverflow, i);
203 }
204
205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206 if (stack_addr > stack_base) {
207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210 stack_base);
211 } else {
212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213 stack_base - stack_addr);
214 }
215
216 /* Reprint stack bounds for ubermaster since they have been refined */
217 if (__kmp_storage_map) {
218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221 other_threads[i]->th.th_info.ds.ds_stacksize,
222 "th_%d stack (refinement)", i);
223 }
224 return i;
225}
226
227int __kmp_get_global_thread_id_reg() {
228 int gtid;
229
230 if (!__kmp_init_serial) {
231 gtid = KMP_GTID_DNE;
232 } else
233#ifdef KMP_TDATA_GTID
234 if (TCR_4(__kmp_gtid_mode) >= 3) {
235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236 gtid = __kmp_gtid;
237 } else
238#endif
239 if (TCR_4(__kmp_gtid_mode) >= 2) {
240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241 gtid = __kmp_gtid_get_specific();
242 } else {
243 KA_TRACE(1000,
244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245 gtid = __kmp_get_global_thread_id();
246 }
247
248 /* we must be a new uber master sibling thread */
249 if (gtid == KMP_GTID_DNE) {
250 KA_TRACE(10,
251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252 "Registering a new gtid.\n"));
253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254 if (!__kmp_init_serial) {
255 __kmp_do_serial_initialize();
256 gtid = __kmp_gtid_get_specific();
257 } else {
258 gtid = __kmp_register_root(FALSE);
259 }
260 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262 }
263
264 KMP_DEBUG_ASSERT(gtid >= 0);
265
266 return gtid;
267}
268
269/* caller must hold forkjoin_lock */
270void __kmp_check_stack_overlap(kmp_info_t *th) {
271 int f;
272 char *stack_beg = NULL;
273 char *stack_end = NULL;
274 int gtid;
275
276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277 if (__kmp_storage_map) {
278 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280
281 gtid = __kmp_gtid_from_thread(th);
282
283 if (gtid == KMP_GTID_MONITOR) {
284 __kmp_print_storage_map_gtid(
285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286 "th_%s stack (%s)", "mon",
287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288 } else {
289 __kmp_print_storage_map_gtid(
290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291 "th_%d stack (%s)", gtid,
292 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293 }
294 }
295
296 /* No point in checking ubermaster threads since they use refinement and
297 * cannot overlap */
298 gtid = __kmp_gtid_from_thread(th);
299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300 KA_TRACE(10,
301 ("__kmp_check_stack_overlap: performing extensive checking\n"));
302 if (stack_beg == NULL) {
303 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305 }
306
307 for (f = 0; f < __kmp_threads_capacity; f++) {
308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309
310 if (f_th && f_th != th) {
311 char *other_stack_end =
312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313 char *other_stack_beg =
314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317
318 /* Print the other stack values before the abort */
319 if (__kmp_storage_map)
320 __kmp_print_storage_map_gtid(
321 -1, other_stack_beg, other_stack_end,
322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324
325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326 __kmp_msg_null);
327 }
328 }
329 }
330 }
331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332}
333
334/* ------------------------------------------------------------------------ */
335
336void __kmp_infinite_loop(void) {
337 static int done = FALSE;
338
339 while (!done) {
340 KMP_YIELD(TRUE);
341 }
342}
343
344#define MAX_MESSAGE 512
345
346void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347 char const *format, ...) {
348 char buffer[MAX_MESSAGE];
349 va_list ap;
350
351 va_start(ap, format);
352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353 p2, (unsigned long)size, format);
354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355 __kmp_vprintf(kmp_err, buffer, ap);
356#if KMP_PRINT_DATA_PLACEMENT
357 int node;
358 if (gtid >= 0) {
359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360 if (__kmp_storage_map_verbose) {
361 node = __kmp_get_host_node(p1);
362 if (node < 0) /* doesn't work, so don't try this next time */
363 __kmp_storage_map_verbose = FALSE;
364 else {
365 char *last;
366 int lastNode;
367 int localProc = __kmp_get_cpu_from_gtid(gtid);
368
369 const int page_size = KMP_GET_PAGE_SIZE();
370
371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373 if (localProc >= 0)
374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375 localProc >> 1);
376 else
377 __kmp_printf_no_lock(" GTID %d\n", gtid);
378#if KMP_USE_PRCTL
379 /* The more elaborate format is disabled for now because of the prctl
380 * hanging bug. */
381 do {
382 last = p1;
383 lastNode = node;
384 /* This loop collates adjacent pages with the same host node. */
385 do {
386 (char *)p1 += page_size;
387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389 lastNode);
390 } while (p1 <= p2);
391#else
392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393 (char *)p1 + (page_size - 1),
394 __kmp_get_host_node(p1));
395 if (p1 < p2) {
396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397 (char *)p2 + (page_size - 1),
398 __kmp_get_host_node(p2));
399 }
400#endif
401 }
402 }
403 } else
404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405 }
406#endif /* KMP_PRINT_DATA_PLACEMENT */
407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408}
409
410void __kmp_warn(char const *format, ...) {
411 char buffer[MAX_MESSAGE];
412 va_list ap;
413
414 if (__kmp_generate_warnings == kmp_warnings_off) {
415 return;
416 }
417
418 va_start(ap, format);
419
420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422 __kmp_vprintf(kmp_err, buffer, ap);
423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424
425 va_end(ap);
426}
427
428void __kmp_abort_process() {
429 // Later threads may stall here, but that's ok because abort() will kill them.
430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431
432 if (__kmp_debug_buf) {
433 __kmp_dump_debug_buffer();
434 }
435
436 if (KMP_OS_WINDOWS) {
437 // Let other threads know of abnormal termination and prevent deadlock
438 // if abort happened during library initialization or shutdown
439 __kmp_global.g.g_abort = SIGABRT;
440
441 /* On Windows* OS by default abort() causes pop-up error box, which stalls
442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443 boxes. _set_abort_behavior() works well, but this function is not
444 available in VS7 (this is not problem for DLL, but it is a problem for
445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446 help, at least in some versions of MS C RTL.
447
448 It seems following sequence is the only way to simulate abort() and
449 avoid pop-up error box. */
450 raise(SIGABRT);
451 _exit(3); // Just in case, if signal ignored, exit anyway.
452 } else {
453 __kmp_unregister_library();
454 abort();
455 }
456
457 __kmp_infinite_loop();
458 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459
460} // __kmp_abort_process
461
462void __kmp_abort_thread(void) {
463 // TODO: Eliminate g_abort global variable and this function.
464 // In case of abort just call abort(), it will kill all the threads.
465 __kmp_infinite_loop();
466} // __kmp_abort_thread
467
468/* Print out the storage map for the major kmp_info_t thread data structures
469 that are allocated together. */
470
471static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473 gtid);
474
475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477
478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479 sizeof(kmp_local_t), "th_%d.th_local", gtid);
480
481 __kmp_print_storage_map_gtid(
482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484
485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486 &thr->th.th_bar[bs_plain_barrier + 1],
487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488 gtid);
489
490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491 &thr->th.th_bar[bs_forkjoin_barrier + 1],
492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493 gtid);
494
495#if KMP_FAST_REDUCTION_BARRIER
496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497 &thr->th.th_bar[bs_reduction_barrier + 1],
498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499 gtid);
500#endif // KMP_FAST_REDUCTION_BARRIER
501}
502
503/* Print out the storage map for the major kmp_team_t team data structures
504 that are allocated together. */
505
506static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507 int team_id, int num_thr) {
508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510 header, team_id);
511
512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513 &team->t.t_bar[bs_last_barrier],
514 sizeof(kmp_balign_team_t) * bs_last_barrier,
515 "%s_%d.t_bar", header, team_id);
516
517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518 &team->t.t_bar[bs_plain_barrier + 1],
519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520 header, team_id);
521
522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523 &team->t.t_bar[bs_forkjoin_barrier + 1],
524 sizeof(kmp_balign_team_t),
525 "%s_%d.t_bar[forkjoin]", header, team_id);
526
527#if KMP_FAST_REDUCTION_BARRIER
528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529 &team->t.t_bar[bs_reduction_barrier + 1],
530 sizeof(kmp_balign_team_t),
531 "%s_%d.t_bar[reduction]", header, team_id);
532#endif // KMP_FAST_REDUCTION_BARRIER
533
534 __kmp_print_storage_map_gtid(
535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537
538 __kmp_print_storage_map_gtid(
539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541
542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543 &team->t.t_disp_buffer[num_disp_buff],
544 sizeof(dispatch_shared_info_t) * num_disp_buff,
545 "%s_%d.t_disp_buffer", header, team_id);
546}
547
548static void __kmp_init_allocator() {
549 __kmp_init_memkind();
550 __kmp_init_target_mem();
551}
552static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553
554/* ------------------------------------------------------------------------ */
555
556#if ENABLE_LIBOMPTARGET
557static void __kmp_init_omptarget() {
558 __kmp_init_target_task();
559}
560#endif
561
562/* ------------------------------------------------------------------------ */
563
564#if KMP_DYNAMIC_LIB
565#if KMP_OS_WINDOWS
566
567BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
568 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
569
570 switch (fdwReason) {
571
572 case DLL_PROCESS_ATTACH:
573 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
574
575 return TRUE;
576
577 case DLL_PROCESS_DETACH:
578 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
579
580 // According to Windows* documentation for DllMain entry point:
581 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
582 // lpReserved == NULL when FreeLibrary() is called,
583 // lpReserved != NULL when the process is terminated.
584 // When FreeLibrary() is called, worker threads remain alive. So the
585 // runtime's state is consistent and executing proper shutdown is OK.
586 // When the process is terminated, worker threads have exited or been
587 // forcefully terminated by the OS and only the shutdown thread remains.
588 // This can leave the runtime in an inconsistent state.
589 // Hence, only attempt proper cleanup when FreeLibrary() is called.
590 // Otherwise, rely on OS to reclaim resources.
591 if (lpReserved == NULL)
592 __kmp_internal_end_library(__kmp_gtid_get_specific());
593
594 return TRUE;
595
596 case DLL_THREAD_ATTACH:
597 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
598
599 /* if we want to register new siblings all the time here call
600 * __kmp_get_gtid(); */
601 return TRUE;
602
603 case DLL_THREAD_DETACH:
604 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
605
606 __kmp_internal_end_thread(__kmp_gtid_get_specific());
607 return TRUE;
608 }
609
610 return TRUE;
611}
612
613#endif /* KMP_OS_WINDOWS */
614#endif /* KMP_DYNAMIC_LIB */
615
616/* __kmp_parallel_deo -- Wait until it's our turn. */
617void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
618 int gtid = *gtid_ref;
619#ifdef BUILD_PARALLEL_ORDERED
620 kmp_team_t *team = __kmp_team_from_gtid(gtid);
621#endif /* BUILD_PARALLEL_ORDERED */
622
623 if (__kmp_env_consistency_check) {
624 if (__kmp_threads[gtid]->th.th_root->r.r_active)
625#if KMP_USE_DYNAMIC_LOCK
626 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
627#else
628 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
629#endif
630 }
631#ifdef BUILD_PARALLEL_ORDERED
632 if (!team->t.t_serialized) {
633 KMP_MB();
634 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
635 NULL);
636 KMP_MB();
637 }
638#endif /* BUILD_PARALLEL_ORDERED */
639}
640
641/* __kmp_parallel_dxo -- Signal the next task. */
642void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
643 int gtid = *gtid_ref;
644#ifdef BUILD_PARALLEL_ORDERED
645 int tid = __kmp_tid_from_gtid(gtid);
646 kmp_team_t *team = __kmp_team_from_gtid(gtid);
647#endif /* BUILD_PARALLEL_ORDERED */
648
649 if (__kmp_env_consistency_check) {
650 if (__kmp_threads[gtid]->th.th_root->r.r_active)
651 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
652 }
653#ifdef BUILD_PARALLEL_ORDERED
654 if (!team->t.t_serialized) {
655 KMP_MB(); /* Flush all pending memory write invalidates. */
656
657 /* use the tid of the next thread in this team */
658 /* TODO replace with general release procedure */
659 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
660
661 KMP_MB(); /* Flush all pending memory write invalidates. */
662 }
663#endif /* BUILD_PARALLEL_ORDERED */
664}
665
666/* ------------------------------------------------------------------------ */
667/* The BARRIER for a SINGLE process section is always explicit */
668
669int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
670 int status;
671 kmp_info_t *th;
672 kmp_team_t *team;
673
674 if (!TCR_4(__kmp_init_parallel))
675 __kmp_parallel_initialize();
676 __kmp_resume_if_soft_paused();
677
678 th = __kmp_threads[gtid];
679 team = th->th.th_team;
680 status = 0;
681
682 th->th.th_ident = id_ref;
683
684 if (team->t.t_serialized) {
685 status = 1;
686 } else {
687 kmp_int32 old_this = th->th.th_local.this_construct;
688
689 ++th->th.th_local.this_construct;
690 /* try to set team count to thread count--success means thread got the
691 single block */
692 /* TODO: Should this be acquire or release? */
693 if (team->t.t_construct == old_this) {
694 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
695 th->th.th_local.this_construct);
696 }
697#if USE_ITT_BUILD
698 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
699 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
700 team->t.t_active_level == 1) {
701 // Only report metadata by primary thread of active team at level 1
702 __kmp_itt_metadata_single(id_ref);
703 }
704#endif /* USE_ITT_BUILD */
705 }
706
707 if (__kmp_env_consistency_check) {
708 if (status && push_ws) {
709 __kmp_push_workshare(gtid, ct_psingle, id_ref);
710 } else {
711 __kmp_check_workshare(gtid, ct_psingle, id_ref);
712 }
713 }
714#if USE_ITT_BUILD
715 if (status) {
716 __kmp_itt_single_start(gtid);
717 }
718#endif /* USE_ITT_BUILD */
719 return status;
720}
721
722void __kmp_exit_single(int gtid) {
723#if USE_ITT_BUILD
724 __kmp_itt_single_end(gtid);
725#endif /* USE_ITT_BUILD */
726 if (__kmp_env_consistency_check)
727 __kmp_pop_workshare(gtid, ct_psingle, NULL);
728}
729
730/* determine if we can go parallel or must use a serialized parallel region and
731 * how many threads we can use
732 * set_nproc is the number of threads requested for the team
733 * returns 0 if we should serialize or only use one thread,
734 * otherwise the number of threads to use
735 * The forkjoin lock is held by the caller. */
736static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
737 int master_tid, int set_nthreads,
738 int enter_teams) {
739 int capacity;
740 int new_nthreads;
741 KMP_DEBUG_ASSERT(__kmp_init_serial);
742 KMP_DEBUG_ASSERT(root && parent_team);
743 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
744
745 // If dyn-var is set, dynamically adjust the number of desired threads,
746 // according to the method specified by dynamic_mode.
747 new_nthreads = set_nthreads;
748 if (!get__dynamic_2(parent_team, master_tid)) {
749 ;
750 }
751#ifdef USE_LOAD_BALANCE
752 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
753 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
754 if (new_nthreads == 1) {
755 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
756 "reservation to 1 thread\n",
757 master_tid));
758 return 1;
759 }
760 if (new_nthreads < set_nthreads) {
761 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
762 "reservation to %d threads\n",
763 master_tid, new_nthreads));
764 }
765 }
766#endif /* USE_LOAD_BALANCE */
767 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
768 new_nthreads = __kmp_avail_proc - __kmp_nth +
769 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
770 if (new_nthreads <= 1) {
771 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
772 "reservation to 1 thread\n",
773 master_tid));
774 return 1;
775 }
776 if (new_nthreads < set_nthreads) {
777 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
778 "reservation to %d threads\n",
779 master_tid, new_nthreads));
780 } else {
781 new_nthreads = set_nthreads;
782 }
783 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
784 if (set_nthreads > 2) {
785 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
786 new_nthreads = (new_nthreads % set_nthreads) + 1;
787 if (new_nthreads == 1) {
788 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
789 "reservation to 1 thread\n",
790 master_tid));
791 return 1;
792 }
793 if (new_nthreads < set_nthreads) {
794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
795 "reservation to %d threads\n",
796 master_tid, new_nthreads));
797 }
798 }
799 } else {
800 KMP_ASSERT(0);
801 }
802
803 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
804 if (__kmp_nth + new_nthreads -
805 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
806 __kmp_max_nth) {
807 int tl_nthreads = __kmp_max_nth - __kmp_nth +
808 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809 if (tl_nthreads <= 0) {
810 tl_nthreads = 1;
811 }
812
813 // If dyn-var is false, emit a 1-time warning.
814 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
815 __kmp_reserve_warn = 1;
816 __kmp_msg(kmp_ms_warning,
817 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
818 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
819 }
820 if (tl_nthreads == 1) {
821 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
822 "reduced reservation to 1 thread\n",
823 master_tid));
824 return 1;
825 }
826 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
827 "reservation to %d threads\n",
828 master_tid, tl_nthreads));
829 new_nthreads = tl_nthreads;
830 }
831
832 // Respect OMP_THREAD_LIMIT
833 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
834 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
835 if (cg_nthreads + new_nthreads -
836 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
837 max_cg_threads) {
838 int tl_nthreads = max_cg_threads - cg_nthreads +
839 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
840 if (tl_nthreads <= 0) {
841 tl_nthreads = 1;
842 }
843
844 // If dyn-var is false, emit a 1-time warning.
845 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
846 __kmp_reserve_warn = 1;
847 __kmp_msg(kmp_ms_warning,
848 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
849 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
850 }
851 if (tl_nthreads == 1) {
852 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
853 "reduced reservation to 1 thread\n",
854 master_tid));
855 return 1;
856 }
857 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
858 "reservation to %d threads\n",
859 master_tid, tl_nthreads));
860 new_nthreads = tl_nthreads;
861 }
862
863 // Check if the threads array is large enough, or needs expanding.
864 // See comment in __kmp_register_root() about the adjustment if
865 // __kmp_threads[0] == NULL.
866 capacity = __kmp_threads_capacity;
867 if (TCR_PTR(__kmp_threads[0]) == NULL) {
868 --capacity;
869 }
870 // If it is not for initializing the hidden helper team, we need to take
871 // __kmp_hidden_helper_threads_num out of the capacity because it is included
872 // in __kmp_threads_capacity.
873 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
874 capacity -= __kmp_hidden_helper_threads_num;
875 }
876 if (__kmp_nth + new_nthreads -
877 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
878 capacity) {
879 // Expand the threads array.
880 int slotsRequired = __kmp_nth + new_nthreads -
881 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
882 capacity;
883 int slotsAdded = __kmp_expand_threads(slotsRequired);
884 if (slotsAdded < slotsRequired) {
885 // The threads array was not expanded enough.
886 new_nthreads -= (slotsRequired - slotsAdded);
887 KMP_ASSERT(new_nthreads >= 1);
888
889 // If dyn-var is false, emit a 1-time warning.
890 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
891 __kmp_reserve_warn = 1;
892 if (__kmp_tp_cached) {
893 __kmp_msg(kmp_ms_warning,
894 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
895 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
896 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
897 } else {
898 __kmp_msg(kmp_ms_warning,
899 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
900 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
901 }
902 }
903 }
904 }
905
906#ifdef KMP_DEBUG
907 if (new_nthreads == 1) {
908 KC_TRACE(10,
909 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
910 "dead roots and rechecking; requested %d threads\n",
911 __kmp_get_gtid(), set_nthreads));
912 } else {
913 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
914 " %d threads\n",
915 __kmp_get_gtid(), new_nthreads, set_nthreads));
916 }
917#endif // KMP_DEBUG
918 return new_nthreads;
919}
920
921/* Allocate threads from the thread pool and assign them to the new team. We are
922 assured that there are enough threads available, because we checked on that
923 earlier within critical section forkjoin */
924static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
925 kmp_info_t *master_th, int master_gtid,
926 int fork_teams_workers) {
927 int i;
928 int use_hot_team;
929
930 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
931 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
932 KMP_MB();
933
934 /* first, let's setup the primary thread */
935 master_th->th.th_info.ds.ds_tid = 0;
936 master_th->th.th_team = team;
937 master_th->th.th_team_nproc = team->t.t_nproc;
938 master_th->th.th_team_master = master_th;
939 master_th->th.th_team_serialized = FALSE;
940 master_th->th.th_dispatch = &team->t.t_dispatch[0];
941
942/* make sure we are not the optimized hot team */
943#if KMP_NESTED_HOT_TEAMS
944 use_hot_team = 0;
945 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
946 if (hot_teams) { // hot teams array is not allocated if
947 // KMP_HOT_TEAMS_MAX_LEVEL=0
948 int level = team->t.t_active_level - 1; // index in array of hot teams
949 if (master_th->th.th_teams_microtask) { // are we inside the teams?
950 if (master_th->th.th_teams_size.nteams > 1) {
951 ++level; // level was not increased in teams construct for
952 // team_of_masters
953 }
954 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
955 master_th->th.th_teams_level == team->t.t_level) {
956 ++level; // level was not increased in teams construct for
957 // team_of_workers before the parallel
958 } // team->t.t_level will be increased inside parallel
959 }
960 if (level < __kmp_hot_teams_max_level) {
961 if (hot_teams[level].hot_team) {
962 // hot team has already been allocated for given level
963 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
964 use_hot_team = 1; // the team is ready to use
965 } else {
966 use_hot_team = 0; // AC: threads are not allocated yet
967 hot_teams[level].hot_team = team; // remember new hot team
968 hot_teams[level].hot_team_nth = team->t.t_nproc;
969 }
970 } else {
971 use_hot_team = 0;
972 }
973 }
974#else
975 use_hot_team = team == root->r.r_hot_team;
976#endif
977 if (!use_hot_team) {
978
979 /* install the primary thread */
980 team->t.t_threads[0] = master_th;
981 __kmp_initialize_info(master_th, team, 0, master_gtid);
982
983 /* now, install the worker threads */
984 for (i = 1; i < team->t.t_nproc; i++) {
985
986 /* fork or reallocate a new thread and install it in team */
987 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
988 team->t.t_threads[i] = thr;
989 KMP_DEBUG_ASSERT(thr);
990 KMP_DEBUG_ASSERT(thr->th.th_team == team);
991 /* align team and thread arrived states */
992 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
993 "T#%d(%d:%d) join =%llu, plain=%llu\n",
994 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
995 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
996 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
997 team->t.t_bar[bs_plain_barrier].b_arrived));
998 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
999 thr->th.th_teams_level = master_th->th.th_teams_level;
1000 thr->th.th_teams_size = master_th->th.th_teams_size;
1001 { // Initialize threads' barrier data.
1002 int b;
1003 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1004 for (b = 0; b < bs_last_barrier; ++b) {
1005 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1006 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1007#if USE_DEBUGGER
1008 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1009#endif
1010 }
1011 }
1012 }
1013
1014#if KMP_AFFINITY_SUPPORTED
1015 // Do not partition the places list for teams construct workers who
1016 // haven't actually been forked to do real work yet. This partitioning
1017 // will take place in the parallel region nested within the teams construct.
1018 if (!fork_teams_workers) {
1019 __kmp_partition_places(team);
1020 }
1021#endif
1022
1023 if (team->t.t_nproc > 1 &&
1024 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1025 team->t.b->update_num_threads(team->t.t_nproc);
1026 __kmp_add_threads_to_team(team, team->t.t_nproc);
1027 }
1028 }
1029
1030 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1031 for (i = 0; i < team->t.t_nproc; i++) {
1032 kmp_info_t *thr = team->t.t_threads[i];
1033 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1034 thr->th.th_prev_level != team->t.t_level) {
1035 team->t.t_display_affinity = 1;
1036 break;
1037 }
1038 }
1039 }
1040
1041 KMP_MB();
1042}
1043
1044#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1045// Propagate any changes to the floating point control registers out to the team
1046// We try to avoid unnecessary writes to the relevant cache line in the team
1047// structure, so we don't make changes unless they are needed.
1048inline static void propagateFPControl(kmp_team_t *team) {
1049 if (__kmp_inherit_fp_control) {
1050 kmp_int16 x87_fpu_control_word;
1051 kmp_uint32 mxcsr;
1052
1053 // Get primary thread's values of FPU control flags (both X87 and vector)
1054 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1055 __kmp_store_mxcsr(&mxcsr);
1056 mxcsr &= KMP_X86_MXCSR_MASK;
1057
1058 // There is no point looking at t_fp_control_saved here.
1059 // If it is TRUE, we still have to update the values if they are different
1060 // from those we now have. If it is FALSE we didn't save anything yet, but
1061 // our objective is the same. We have to ensure that the values in the team
1062 // are the same as those we have.
1063 // So, this code achieves what we need whether or not t_fp_control_saved is
1064 // true. By checking whether the value needs updating we avoid unnecessary
1065 // writes that would put the cache-line into a written state, causing all
1066 // threads in the team to have to read it again.
1067 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1068 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1069 // Although we don't use this value, other code in the runtime wants to know
1070 // whether it should restore them. So we must ensure it is correct.
1071 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1072 } else {
1073 // Similarly here. Don't write to this cache-line in the team structure
1074 // unless we have to.
1075 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1076 }
1077}
1078
1079// Do the opposite, setting the hardware registers to the updated values from
1080// the team.
1081inline static void updateHWFPControl(kmp_team_t *team) {
1082 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1083 // Only reset the fp control regs if they have been changed in the team.
1084 // the parallel region that we are exiting.
1085 kmp_int16 x87_fpu_control_word;
1086 kmp_uint32 mxcsr;
1087 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1088 __kmp_store_mxcsr(&mxcsr);
1089 mxcsr &= KMP_X86_MXCSR_MASK;
1090
1091 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1092 __kmp_clear_x87_fpu_status_word();
1093 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1094 }
1095
1096 if (team->t.t_mxcsr != mxcsr) {
1097 __kmp_load_mxcsr(&team->t.t_mxcsr);
1098 }
1099 }
1100}
1101#else
// Builds that are neither KMP_ARCH_X86 nor KMP_ARCH_X86_64 get no-op
// stand-ins for both FP-control hooks.
#define propagateFPControl(team) ((void)0)
#define updateHWFPControl(team) ((void)0)
1104#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1105
1106static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1107 int realloc); // forward declaration
1108
1109/* Run a parallel region that has been serialized, so runs only in a team of the
1110 single primary thread. */
1111void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1112 kmp_info_t *this_thr;
1113 kmp_team_t *serial_team;
1114
1115 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1116
1117 /* Skip all this code for autopar serialized loops since it results in
1118 unacceptable overhead */
1119 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1120 return;
1121
1122 if (!TCR_4(__kmp_init_parallel))
1123 __kmp_parallel_initialize();
1124 __kmp_resume_if_soft_paused();
1125
1126 this_thr = __kmp_threads[global_tid];
1127 serial_team = this_thr->th.th_serial_team;
1128
1129 /* utilize the serialized team held by this thread */
1130 KMP_DEBUG_ASSERT(serial_team);
1131 KMP_MB();
1132
1133 if (__kmp_tasking_mode != tskm_immediate_exec) {
1134 KMP_DEBUG_ASSERT(
1135 this_thr->th.th_task_team ==
1136 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1137 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1138 NULL);
1139 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1140 "team %p, new task_team = NULL\n",
1141 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1142 this_thr->th.th_task_team = NULL;
1143 }
1144
1145 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1146 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1147 proc_bind = proc_bind_false;
1148 } else if (proc_bind == proc_bind_default) {
1149 // No proc_bind clause was specified, so use the current value
1150 // of proc-bind-var for this parallel region.
1151 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1152 }
1153 // Reset for next parallel region
1154 this_thr->th.th_set_proc_bind = proc_bind_default;
1155
1156 // Reset num_threads for next parallel region
1157 this_thr->th.th_set_nproc = 0;
1158
1159#if OMPT_SUPPORT
1160 ompt_data_t ompt_parallel_data = ompt_data_none;
1161 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1162 if (ompt_enabled.enabled &&
1163 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1164
1165 ompt_task_info_t *parent_task_info;
1166 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1167
1168 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1169 if (ompt_enabled.ompt_callback_parallel_begin) {
1170 int team_size = 1;
1171
1172 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1173 &(parent_task_info->task_data), &(parent_task_info->frame),
1174 &ompt_parallel_data, team_size,
1175 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1176 }
1177 }
1178#endif // OMPT_SUPPORT
1179
1180 if (this_thr->th.th_team != serial_team) {
1181 // Nested level will be an index in the nested nthreads array
1182 int level = this_thr->th.th_team->t.t_level;
1183
1184 if (serial_team->t.t_serialized) {
1185 /* this serial team was already used
1186 TODO increase performance by making this locks more specific */
1187 kmp_team_t *new_team;
1188
1189 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1190
1191 new_team =
1192 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1193#if OMPT_SUPPORT
1194 ompt_parallel_data,
1195#endif
1196 proc_bind, &this_thr->th.th_current_task->td_icvs,
1197 0 USE_NESTED_HOT_ARG(NULL));
1198 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1199 KMP_ASSERT(new_team);
1200
1201 /* setup new serialized team and install it */
1202 new_team->t.t_threads[0] = this_thr;
1203 new_team->t.t_parent = this_thr->th.th_team;
1204 serial_team = new_team;
1205 this_thr->th.th_serial_team = serial_team;
1206
1207 KF_TRACE(
1208 10,
1209 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1210 global_tid, serial_team));
1211
1212 /* TODO the above breaks the requirement that if we run out of resources,
1213 then we can still guarantee that serialized teams are ok, since we may
1214 need to allocate a new one */
1215 } else {
1216 KF_TRACE(
1217 10,
1218 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1219 global_tid, serial_team));
1220 }
1221
1222 /* we have to initialize this serial team */
1223 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1224 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1225 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1226 serial_team->t.t_ident = loc;
1227 serial_team->t.t_serialized = 1;
1228 serial_team->t.t_nproc = 1;
1229 serial_team->t.t_parent = this_thr->th.th_team;
1230 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1231 this_thr->th.th_team = serial_team;
1232 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1233
1234 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1235 this_thr->th.th_current_task));
1236 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1237 this_thr->th.th_current_task->td_flags.executing = 0;
1238
1239 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1240
1241 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1242 implicit task for each serialized task represented by
1243 team->t.t_serialized? */
1244 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1245 &this_thr->th.th_current_task->td_parent->td_icvs);
1246
1247 // Thread value exists in the nested nthreads array for the next nested
1248 // level
1249 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1250 this_thr->th.th_current_task->td_icvs.nproc =
1251 __kmp_nested_nth.nth[level + 1];
1252 }
1253
1254 if (__kmp_nested_proc_bind.used &&
1255 (level + 1 < __kmp_nested_proc_bind.used)) {
1256 this_thr->th.th_current_task->td_icvs.proc_bind =
1257 __kmp_nested_proc_bind.bind_types[level + 1];
1258 }
1259
1260#if USE_DEBUGGER
1261 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1262#endif
1263 this_thr->th.th_info.ds.ds_tid = 0;
1264
1265 /* set thread cache values */
1266 this_thr->th.th_team_nproc = 1;
1267 this_thr->th.th_team_master = this_thr;
1268 this_thr->th.th_team_serialized = 1;
1269
1270 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1271 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1272 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1273
1274 propagateFPControl(serial_team);
1275
1276 /* check if we need to allocate dispatch buffers stack */
1277 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1278 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1279 serial_team->t.t_dispatch->th_disp_buffer =
1280 (dispatch_private_info_t *)__kmp_allocate(
1281 sizeof(dispatch_private_info_t));
1282 }
1283 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1284
1285 KMP_MB();
1286
1287 } else {
1288 /* this serialized team is already being used,
1289 * that's fine, just add another nested level */
1290 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1291 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1292 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1293 ++serial_team->t.t_serialized;
1294 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1295
1296 // Nested level will be an index in the nested nthreads array
1297 int level = this_thr->th.th_team->t.t_level;
1298 // Thread value exists in the nested nthreads array for the next nested
1299 // level
1300 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1301 this_thr->th.th_current_task->td_icvs.nproc =
1302 __kmp_nested_nth.nth[level + 1];
1303 }
1304 serial_team->t.t_level++;
1305 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1306 "of serial team %p to %d\n",
1307 global_tid, serial_team, serial_team->t.t_level));
1308
1309 /* allocate/push dispatch buffers stack */
1310 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1311 {
1312 dispatch_private_info_t *disp_buffer =
1313 (dispatch_private_info_t *)__kmp_allocate(
1314 sizeof(dispatch_private_info_t));
1315 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1316 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1317 }
1318 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1319
1320 KMP_MB();
1321 }
1322 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1323
1324 // Perform the display affinity functionality for
1325 // serialized parallel regions
1326 if (__kmp_display_affinity) {
1327 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1328 this_thr->th.th_prev_num_threads != 1) {
1329 // NULL means use the affinity-format-var ICV
1330 __kmp_aux_display_affinity(global_tid, NULL);
1331 this_thr->th.th_prev_level = serial_team->t.t_level;
1332 this_thr->th.th_prev_num_threads = 1;
1333 }
1334 }
1335
1336 if (__kmp_env_consistency_check)
1337 __kmp_push_parallel(global_tid, NULL);
1338#if OMPT_SUPPORT
1339 serial_team->t.ompt_team_info.master_return_address = codeptr;
1340 if (ompt_enabled.enabled &&
1341 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1342 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1343 OMPT_GET_FRAME_ADDRESS(0);
1344
1345 ompt_lw_taskteam_t lw_taskteam;
1346 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1347 &ompt_parallel_data, codeptr);
1348
1349 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1350 // don't use lw_taskteam after linking. content was swaped
1351
1352 /* OMPT implicit task begin */
1353 if (ompt_enabled.ompt_callback_implicit_task) {
1354 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1355 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1356 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1357 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1358 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1359 __kmp_tid_from_gtid(global_tid);
1360 }
1361
1362 /* OMPT state */
1363 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1364 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1365 OMPT_GET_FRAME_ADDRESS(0);
1366 }
1367#endif
1368}
1369
1370// Test if this fork is for a team closely nested in a teams construct
1371static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1372 microtask_t microtask, int level,
1373 int teams_level, kmp_va_list ap) {
1374 return (master_th->th.th_teams_microtask && ap &&
1375 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1376}
1377
1378// Test if this fork is for the teams construct, i.e. to form the outer league
1379// of teams
1380static inline bool __kmp_is_entering_teams(int active_level, int level,
1381 int teams_level, kmp_va_list ap) {
1382 return ((ap == NULL && active_level == 0) ||
1383 (ap && teams_level > 0 && teams_level == level));
1384}
1385
1386// AC: This is start of parallel that is nested inside teams construct.
1387// The team is actual (hot), all workers are ready at the fork barrier.
1388// No lock needed to initialize the team a bit, then free workers.
1389static inline int
1390__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1391 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1392 enum fork_context_e call_context, microtask_t microtask,
1393 launch_t invoker, int master_set_numthreads, int level,
1394#if OMPT_SUPPORT
1395 ompt_data_t ompt_parallel_data, void *return_address,
1396#endif
1397 kmp_va_list ap) {
1398 void **argv;
1399 int i;
1400
1401 parent_team->t.t_ident = loc;
1402 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1403 parent_team->t.t_argc = argc;
1404 argv = (void **)parent_team->t.t_argv;
1405 for (i = argc - 1; i >= 0; --i) {
1406 *argv++ = va_arg(kmp_va_deref(ap), void *);
1407 }
1408 // Increment our nested depth levels, but not increase the serialization
1409 if (parent_team == master_th->th.th_serial_team) {
1410 // AC: we are in serialized parallel
1411 __kmpc_serialized_parallel(loc, gtid);
1412 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1413
1414 if (call_context == fork_context_gnu) {
1415 // AC: need to decrement t_serialized for enquiry functions to work
1416 // correctly, will restore at join time
1417 parent_team->t.t_serialized--;
1418 return TRUE;
1419 }
1420
1421#if OMPD_SUPPORT
1422 parent_team->t.t_pkfn = microtask;
1423#endif
1424
1425#if OMPT_SUPPORT
1426 void *dummy;
1427 void **exit_frame_p;
1428 ompt_data_t *implicit_task_data;
1429 ompt_lw_taskteam_t lw_taskteam;
1430
1431 if (ompt_enabled.enabled) {
1432 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1433 &ompt_parallel_data, return_address);
1434 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1435
1436 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1437 // Don't use lw_taskteam after linking. Content was swapped.
1438
1439 /* OMPT implicit task begin */
1440 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1441 if (ompt_enabled.ompt_callback_implicit_task) {
1442 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1443 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1444 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1445 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1446 }
1447
1448 /* OMPT state */
1449 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1450 } else {
1451 exit_frame_p = &dummy;
1452 }
1453#endif
1454
1455 // AC: need to decrement t_serialized for enquiry functions to work
1456 // correctly, will restore at join time
1457 parent_team->t.t_serialized--;
1458
1459 {
1460 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1461 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1462 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1463#if OMPT_SUPPORT
1464 ,
1465 exit_frame_p
1466#endif
1467 );
1468 }
1469
1470#if OMPT_SUPPORT
1471 if (ompt_enabled.enabled) {
1472 *exit_frame_p = NULL;
1473 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1474 if (ompt_enabled.ompt_callback_implicit_task) {
1475 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1476 ompt_scope_end, NULL, implicit_task_data, 1,
1477 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1478 }
1479 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1480 __ompt_lw_taskteam_unlink(master_th);
1481 if (ompt_enabled.ompt_callback_parallel_end) {
1482 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1483 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1484 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1485 }
1486 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1487 }
1488#endif
1489 return TRUE;
1490 }
1491
1492 parent_team->t.t_pkfn = microtask;
1493 parent_team->t.t_invoke = invoker;
1494 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1495 parent_team->t.t_active_level++;
1496 parent_team->t.t_level++;
1497 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1498
1499 // If the threads allocated to the team are less than the thread limit, update
1500 // the thread limit here. th_teams_size.nth is specific to this team nested
1501 // in a teams construct, the team is fully created, and we're about to do
1502 // the actual fork. Best to do this here so that the subsequent uses below
1503 // and in the join have the correct value.
1504 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1505
1506#if OMPT_SUPPORT
1507 if (ompt_enabled.enabled) {
1508 ompt_lw_taskteam_t lw_taskteam;
1509 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1510 return_address);
1511 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1512 }
1513#endif
1514
1515 /* Change number of threads in the team if requested */
1516 if (master_set_numthreads) { // The parallel has num_threads clause
1517 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1518 // AC: only can reduce number of threads dynamically, can't increase
1519 kmp_info_t **other_threads = parent_team->t.t_threads;
1520 // NOTE: if using distributed barrier, we need to run this code block
1521 // even when the team size appears not to have changed from the max.
1522 int old_proc = master_th->th.th_teams_size.nth;
1523 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1524 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1525 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1526 }
1527 parent_team->t.t_nproc = master_set_numthreads;
1528 for (i = 0; i < master_set_numthreads; ++i) {
1529 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1530 }
1531 }
1532 // Keep extra threads hot in the team for possible next parallels
1533 master_th->th.th_set_nproc = 0;
1534 }
1535
1536#if USE_DEBUGGER
1537 if (__kmp_debugging) { // Let debugger override number of threads.
1538 int nth = __kmp_omp_num_threads(loc);
1539 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1540 master_set_numthreads = nth;
1541 }
1542 }
1543#endif
1544
1545 // Figure out the proc_bind policy for the nested parallel within teams
1546 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1547 // proc_bind_default means don't update
1548 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1549 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1550 proc_bind = proc_bind_false;
1551 } else {
1552 // No proc_bind clause specified; use current proc-bind-var
1553 if (proc_bind == proc_bind_default) {
1554 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1555 }
1556 /* else: The proc_bind policy was specified explicitly on parallel clause.
1557 This overrides proc-bind-var for this parallel region, but does not
1558 change proc-bind-var. */
1559 // Figure the value of proc-bind-var for the child threads.
1560 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1561 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1562 master_th->th.th_current_task->td_icvs.proc_bind)) {
1563 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1564 }
1565 }
1566 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1567 // Need to change the bind-var ICV to correct value for each implicit task
1568 if (proc_bind_icv != proc_bind_default &&
1569 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1570 kmp_info_t **other_threads = parent_team->t.t_threads;
1571 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1572 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1573 }
1574 }
1575 // Reset for next parallel region
1576 master_th->th.th_set_proc_bind = proc_bind_default;
1577
1578#if USE_ITT_BUILD && USE_ITT_NOTIFY
1579 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1580 KMP_ITT_DEBUG) &&
1581 __kmp_forkjoin_frames_mode == 3 &&
1582 parent_team->t.t_active_level == 1 // only report frames at level 1
1583 && master_th->th.th_teams_size.nteams == 1) {
1584 kmp_uint64 tmp_time = __itt_get_timestamp();
1585 master_th->th.th_frame_time = tmp_time;
1586 parent_team->t.t_region_time = tmp_time;
1587 }
1588 if (__itt_stack_caller_create_ptr) {
1589 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1590 // create new stack stitching id before entering fork barrier
1591 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1592 }
1593#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1594#if KMP_AFFINITY_SUPPORTED
1595 __kmp_partition_places(parent_team);
1596#endif
1597
1598 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1599 "master_th=%p, gtid=%d\n",
1600 root, parent_team, master_th, gtid));
1601 __kmp_internal_fork(loc, gtid, parent_team);
1602 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1603 "master_th=%p, gtid=%d\n",
1604 root, parent_team, master_th, gtid));
1605
1606 if (call_context == fork_context_gnu)
1607 return TRUE;
1608
1609 /* Invoke microtask for PRIMARY thread */
1610 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1611 parent_team->t.t_id, parent_team->t.t_pkfn));
1612
1613 if (!parent_team->t.t_invoke(gtid)) {
1614 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1615 }
1616 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1617 parent_team->t.t_id, parent_team->t.t_pkfn));
1618 KMP_MB(); /* Flush all pending memory write invalidates. */
1619
1620 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1621
1622 return TRUE;
1623}
1624
1625// Create a serialized parallel region
1626static inline int
1627__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1628 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1629 kmp_info_t *master_th, kmp_team_t *parent_team,
1630#if OMPT_SUPPORT
1631 ompt_data_t *ompt_parallel_data, void **return_address,
1632 ompt_data_t **parent_task_data,
1633#endif
1634 kmp_va_list ap) {
1635 kmp_team_t *team;
1636 int i;
1637 void **argv;
1638
1639/* josh todo: hypothetical question: what do we do for OS X*? */
1640#if KMP_OS_LINUX && \
1641 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1642 void *args[argc];
1643#else
1644 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1645#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1646 KMP_ARCH_AARCH64) */
1647
1648 KA_TRACE(
1649 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1650
1651 __kmpc_serialized_parallel(loc, gtid);
1652
1653#if OMPD_SUPPORT
1654 master_th->th.th_serial_team->t.t_pkfn = microtask;
1655#endif
1656
1657 if (call_context == fork_context_intel) {
1658 /* TODO this sucks, use the compiler itself to pass args! :) */
1659 master_th->th.th_serial_team->t.t_ident = loc;
1660 if (!ap) {
1661 // revert change made in __kmpc_serialized_parallel()
1662 master_th->th.th_serial_team->t.t_level--;
1663// Get args from parent team for teams construct
1664
1665#if OMPT_SUPPORT
1666 void *dummy;
1667 void **exit_frame_p;
1668 ompt_task_info_t *task_info;
1669 ompt_lw_taskteam_t lw_taskteam;
1670
1671 if (ompt_enabled.enabled) {
1672 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1673 ompt_parallel_data, *return_address);
1674
1675 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
      // Don't use lw_taskteam after linking. Content was swapped.
1677 task_info = OMPT_CUR_TASK_INFO(master_th);
1678 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1679 if (ompt_enabled.ompt_callback_implicit_task) {
1680 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1681 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1682 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1683 &(task_info->task_data), 1,
1684 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1685 }
1686
1687 /* OMPT state */
1688 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1689 } else {
1690 exit_frame_p = &dummy;
1691 }
1692#endif
1693
1694 {
1695 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1696 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1697 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1698#if OMPT_SUPPORT
1699 ,
1700 exit_frame_p
1701#endif
1702 );
1703 }
1704
1705#if OMPT_SUPPORT
1706 if (ompt_enabled.enabled) {
1707 *exit_frame_p = NULL;
1708 if (ompt_enabled.ompt_callback_implicit_task) {
1709 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1710 ompt_scope_end, NULL, &(task_info->task_data), 1,
1711 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1712 }
1713 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1714 __ompt_lw_taskteam_unlink(master_th);
1715 if (ompt_enabled.ompt_callback_parallel_end) {
1716 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1717 ompt_parallel_data, *parent_task_data,
1718 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1719 }
1720 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1721 }
1722#endif
1723 } else if (microtask == (microtask_t)__kmp_teams_master) {
1724 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1725 team = master_th->th.th_team;
1726 // team->t.t_pkfn = microtask;
1727 team->t.t_invoke = invoker;
1728 __kmp_alloc_argv_entries(argc, team, TRUE);
1729 team->t.t_argc = argc;
1730 argv = (void **)team->t.t_argv;
1731 if (ap) {
1732 for (i = argc - 1; i >= 0; --i)
1733 *argv++ = va_arg(kmp_va_deref(ap), void *);
1734 } else {
1735 for (i = 0; i < argc; ++i)
1736 // Get args from parent team for teams construct
1737 argv[i] = parent_team->t.t_argv[i];
1738 }
1739 // AC: revert change made in __kmpc_serialized_parallel()
1740 // because initial code in teams should have level=0
1741 team->t.t_level--;
1742 // AC: call special invoker for outer "parallel" of teams construct
1743 invoker(gtid);
1744#if OMPT_SUPPORT
1745 if (ompt_enabled.enabled) {
1746 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1747 if (ompt_enabled.ompt_callback_implicit_task) {
1748 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1749 ompt_scope_end, NULL, &(task_info->task_data), 0,
1750 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1751 }
1752 if (ompt_enabled.ompt_callback_parallel_end) {
1753 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1754 ompt_parallel_data, *parent_task_data,
1755 OMPT_INVOKER(call_context) | ompt_parallel_league,
1756 *return_address);
1757 }
1758 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1759 }
1760#endif
1761 } else {
1762 argv = args;
1763 for (i = argc - 1; i >= 0; --i)
1764 *argv++ = va_arg(kmp_va_deref(ap), void *);
1765 KMP_MB();
1766
1767#if OMPT_SUPPORT
1768 void *dummy;
1769 void **exit_frame_p;
1770 ompt_task_info_t *task_info;
1771 ompt_lw_taskteam_t lw_taskteam;
1772 ompt_data_t *implicit_task_data;
1773
1774 if (ompt_enabled.enabled) {
1775 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1776 ompt_parallel_data, *return_address);
1777 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1778 // don't use lw_taskteam after linking. content was swapped
1779 task_info = OMPT_CUR_TASK_INFO(master_th);
1780 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1781
1782 /* OMPT implicit task begin */
1783 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1784 if (ompt_enabled.ompt_callback_implicit_task) {
1785 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1786 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1787 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1788 ompt_task_implicit);
1789 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1790 }
1791
1792 /* OMPT state */
1793 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1794 } else {
1795 exit_frame_p = &dummy;
1796 }
1797#endif
1798
1799 {
1800 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1801 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1802 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1803#if OMPT_SUPPORT
1804 ,
1805 exit_frame_p
1806#endif
1807 );
1808 }
1809
1810#if OMPT_SUPPORT
1811 if (ompt_enabled.enabled) {
1812 *exit_frame_p = NULL;
1813 if (ompt_enabled.ompt_callback_implicit_task) {
1814 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1815 ompt_scope_end, NULL, &(task_info->task_data), 1,
1816 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1817 }
1818
1819 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1820 __ompt_lw_taskteam_unlink(master_th);
1821 if (ompt_enabled.ompt_callback_parallel_end) {
1822 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823 ompt_parallel_data, *parent_task_data,
1824 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1825 }
1826 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1827 }
1828#endif
1829 }
1830 } else if (call_context == fork_context_gnu) {
1831#if OMPT_SUPPORT
1832 if (ompt_enabled.enabled) {
1833 ompt_lw_taskteam_t lwt;
1834 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1835 *return_address);
1836
1837 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1838 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1839 }
1840// don't use lw_taskteam after linking. content was swapped
1841#endif
1842
1843 // we were called from GNU native code
1844 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1845 return FALSE;
1846 } else {
1847 KMP_ASSERT2(call_context < fork_context_last,
1848 "__kmp_serial_fork_call: unknown fork_context parameter");
1849 }
1850
1851 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1852 KMP_MB();
1853 return FALSE;
1854}
1855
1856/* most of the work for a fork */
1857/* return true if we really went parallel, false if serialized */
    /* Overview (derived from the body below):
         loc          - source location; stored in master_th->th.th_ident and
                        in the new team's t_ident.
         gtid         - global thread id of the forking thread; used to index
                        __kmp_threads.
         call_context - which entry point requested the fork (Intel, GNU, ...).
                        For fork_context_gnu the function returns TRUE right
                        after the workers are released, without invoking the
                        microtask on the primary thread here.
         argc         - number of microtask arguments.
         microtask    - outlined function; stored in team->t.t_pkfn.
         invoker      - stored in team->t.t_invoke and called for the primary
                        thread at the end of this function.
         ap           - variadic microtask arguments. When it is null the
                        arguments are copied from the parent team's t_argv
                        and __kmp_internal_fork() is skipped (teams construct).
       Exits:
         - the result of __kmp_fork_in_teams() when __kmp_is_fork_in_teams()
           says the parallel is closely nested in a teams construct;
         - the result of __kmp_serial_fork_call() when only one thread is
           going to be used;
         - TRUE after a team has been allocated, set up and forked. */
1858int __kmp_fork_call(ident_t *loc, int gtid,
1859 enum fork_context_e call_context, // Intel, GNU, ...
1860 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1861 kmp_va_list ap) {
1862 void **argv;
1863 int i;
1864 int master_tid;
1865 int master_this_cons;
1866 kmp_team_t *team;
1867 kmp_team_t *parent_team;
1868 kmp_info_t *master_th;
1869 kmp_root_t *root;
1870 int nthreads;
1871 int master_active;
1872 int master_set_numthreads;
1873 int level;
1874 int active_level;
1875 int teams_level;
1876#if KMP_NESTED_HOT_TEAMS
1877 kmp_hot_team_ptr_t **p_hot_teams;
1878#endif
1879 { // KMP_TIME_BLOCK
1880 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1881 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1882
1883 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1884 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1885 /* Some systems prefer the stack for the root thread(s) to start with */
1886 /* some gap from the parent stack to prevent false sharing. */
1887 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1888 /* These 2 lines below are so this does not get optimized out */
1889 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1890 __kmp_stkpadding += (short)((kmp_int64)dummy);
1891 }
1892
1893 /* initialize if needed */
1894 KMP_DEBUG_ASSERT(
1895 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1896 if (!TCR_4(__kmp_init_parallel))
1897 __kmp_parallel_initialize();
1898 __kmp_resume_if_soft_paused();
1899
1900 /* setup current data */
1901 // AC: potentially unsafe, not in sync with library shutdown,
1902 // __kmp_threads can be freed
1903 master_th = __kmp_threads[gtid];
1904
     // Snapshot the forking thread's current context; several of these values
     // are written into the new team further down.
1905 parent_team = master_th->th.th_team;
1906 master_tid = master_th->th.th_info.ds.ds_tid;
1907 master_this_cons = master_th->th.th_local.this_construct;
1908 root = master_th->th.th_root;
1909 master_active = root->r.r_active;
1910 master_set_numthreads = master_th->th.th_set_nproc;
1911
1912#if OMPT_SUPPORT
1913 ompt_data_t ompt_parallel_data = ompt_data_none;
1914 ompt_data_t *parent_task_data;
1915 ompt_frame_t *ompt_frame;
1916 void *return_address = NULL;
1917
     // parent_task_data, ompt_frame and return_address are only filled in when
     // OMPT is enabled; every later use in this function is guarded the same
     // way or passes them on to helpers.
1918 if (ompt_enabled.enabled) {
1919 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1920 NULL, NULL);
1921 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1922 }
1923#endif
1924
1925 // Assign affinity to root thread if it hasn't happened yet
1926 __kmp_assign_root_init_mask();
1927
1928 // Nested level will be an index in the nested nthreads array
1929 level = parent_team->t.t_level;
1930 // used to launch non-serial teams even if nested is not allowed
1931 active_level = parent_team->t.t_active_level;
1932 // needed to check nesting inside the teams
1933 teams_level = master_th->th.th_teams_level;
1934#if KMP_NESTED_HOT_TEAMS
     // Allocate this thread's hot-teams array on first use; slot 0 is seeded
     // with the root's hot team.
1935 p_hot_teams = &master_th->th.th_hot_teams;
1936 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1937 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1938 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1939 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1940 // it is either actual or not needed (when active_level > 0)
1941 (*p_hot_teams)[0].hot_team_nth = 1;
1942 }
1943#endif
1944
1945#if OMPT_SUPPORT
1946 if (ompt_enabled.enabled) {
1947 if (ompt_enabled.ompt_callback_parallel_begin) {
     // The team size reported to the tool is the requested size (num_threads
     // value if set, otherwise the nproc ICV); it is computed before
     // __kmp_reserve_threads() runs below.
1948 int team_size = master_set_numthreads
1949 ? master_set_numthreads
1950 : get__nproc_2(parent_team, master_tid);
1951 int flags = OMPT_INVOKER(call_context) |
1952 ((microtask == (microtask_t)__kmp_teams_master)
1953 ? ompt_parallel_league
1954 : ompt_parallel_team);
1955 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1956 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1957 return_address);
1958 }
1959 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1960 }
1961#endif
1962
1963 master_th->th.th_ident = loc;
1964
1965 // Parallel closely nested in teams construct:
1966 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1967 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1968 call_context, microtask, invoker,
1969 master_set_numthreads, level,
1970#if OMPT_SUPPORT
1971 ompt_parallel_data, return_address,
1972#endif
1973 ap);
1974 } // End parallel closely nested in teams construct
1975
1976#if KMP_DEBUG
1977 if (__kmp_tasking_mode != tskm_immediate_exec) {
1978 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1979 parent_team->t.t_task_team[master_th->th.th_task_state]);
1980 }
1981#endif
1982
1983 // Need this to happen before we determine the number of threads, not while
1984 // we are allocating the team
1985 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1986
1987 // Determine the number of threads
1988 int enter_teams =
1989 __kmp_is_entering_teams(active_level, level, teams_level, ap);
     // Serialize (nthreads = 1) when the library is in serial mode, or when
     // this is not a teams entry and the parent's active level has already
     // reached the current task's max_active_levels ICV.
1990 if ((!enter_teams &&
1991 (parent_team->t.t_active_level >=
1992 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1993 (__kmp_library == library_serial)) {
1994 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1995 nthreads = 1;
1996 } else {
1997 nthreads = master_set_numthreads
1998 ? master_set_numthreads
1999 // TODO: get nproc directly from current task
2000 : get__nproc_2(parent_team, master_tid);
2001 // Check if we need to take forkjoin lock? (no need for serialized
2002 // parallel out of teams construct).
2003 if (nthreads > 1) {
2004 /* determine how many new threads we can use */
2005 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2006 /* AC: If we execute teams from parallel region (on host), then teams
2007 should be created but each can only have 1 thread if nesting is
2008 disabled. If teams called from serial region, then teams and their
2009 threads should be created regardless of the nesting setting. */
2010 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2011 nthreads, enter_teams);
2012 if (nthreads == 1) {
2013 // Free lock for single thread execution here; for multi-thread
2014 // execution it will be freed later after team of threads created
2015 // and initialized
2016 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2017 }
2018 }
2019 }
2020 KMP_DEBUG_ASSERT(nthreads > 0);
2021
2022 // If we temporarily changed the set number of threads then restore it now
2023 master_th->th.th_set_nproc = 0;
2024
     // Single-thread case: hand everything to the serialized path. The
     // fork/join lock is not held here (it was either never taken or was
     // released above).
2025 if (nthreads == 1) {
2026 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2027 invoker, master_th, parent_team,
2028#if OMPT_SUPPORT
2029 &ompt_parallel_data, &return_address,
2030 &parent_task_data,
2031#endif
2032 ap);
2033 } // if (nthreads == 1)
2034
2035 // GEH: only modify the executing flag in the case when not serialized
2036 // serialized case is handled in kmpc_serialized_parallel
2037 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2038 "curtask=%p, curtask_max_aclevel=%d\n",
2039 parent_team->t.t_active_level, master_th,
2040 master_th->th.th_current_task,
2041 master_th->th.th_current_task->td_icvs.max_active_levels));
2042 // TODO: GEH - cannot do this assertion because root thread not set up as
2043 // executing
2044 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2045 master_th->th.th_current_task->td_flags.executing = 0;
2046
2047 if (!master_th->th.th_teams_microtask || level > teams_level) {
2048 /* Increment our nested depth level */
2049 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2050 }
2051
2052 // See if we need to make a copy of the ICVs.
2053 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2054 if ((level + 1 < __kmp_nested_nth.used) &&
2055 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2056 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2057 } else {
2058 nthreads_icv = 0; // don't update
2059 }
2060
2061 // Figure out the proc_bind_policy for the new team.
2062 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2063 // proc_bind_default means don't update
2064 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2065 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2066 proc_bind = proc_bind_false;
2067 } else {
2068 // No proc_bind clause specified; use current proc-bind-var for this
2069 // parallel region
2070 if (proc_bind == proc_bind_default) {
2071 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2072 }
2073 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2074 if (master_th->th.th_teams_microtask &&
2075 microtask == (microtask_t)__kmp_teams_master) {
2076 proc_bind = __kmp_teams_proc_bind;
2077 }
2078 /* else: The proc_bind policy was specified explicitly on parallel clause.
2079 This overrides proc-bind-var for this parallel region, but does not
2080 change proc-bind-var. */
2081 // Figure the value of proc-bind-var for the child threads.
2082 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2083 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2084 master_th->th.th_current_task->td_icvs.proc_bind)) {
2085 // Do not modify the proc bind icv for the two teams construct forks
2086 // They just let the proc bind icv pass through
2087 if (!master_th->th.th_teams_microtask ||
2088 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2089 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2090 }
2091 }
2092
2093 // Reset for next parallel region
2094 master_th->th.th_set_proc_bind = proc_bind_default;
2095
     // Build a private ICV copy only when nproc or proc_bind actually changes
     // for the new team; otherwise the current task's ICVs are passed as-is.
2096 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2097 kmp_internal_control_t new_icvs;
2098 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2099 new_icvs.next = NULL;
2100 if (nthreads_icv > 0) {
2101 new_icvs.nproc = nthreads_icv;
2102 }
2103 if (proc_bind_icv != proc_bind_default) {
2104 new_icvs.proc_bind = proc_bind_icv;
2105 }
2106
2107 /* allocate a new parallel team */
2108 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2109 team = __kmp_allocate_team(root, nthreads, nthreads,
2110#if OMPT_SUPPORT
2111 ompt_parallel_data,
2112#endif
2113 proc_bind, &new_icvs,
2114 argc USE_NESTED_HOT_ARG(master_th));
     // When the fork/join barrier uses the bp_dist_bar release pattern, the
     // ICVs are additionally copied into team->t.b->team_icvs.
2115 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2116 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2117 } else {
2118 /* allocate a new parallel team */
2119 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2120 team = __kmp_allocate_team(root, nthreads, nthreads,
2121#if OMPT_SUPPORT
2122 ompt_parallel_data,
2123#endif
2124 proc_bind,
2125 &master_th->th.th_current_task->td_icvs,
2126 argc USE_NESTED_HOT_ARG(master_th));
2127 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2128 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2129 &master_th->th.th_current_task->td_icvs);
2130 }
2131 KF_TRACE(
2132 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2133
2134 /* setup the new team */
2135 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2136 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2137 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2138 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2139 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2140#if OMPT_SUPPORT
2141 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2142 return_address);
2143#endif
2144 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2145 // TODO: parent_team->t.t_level == INT_MAX ???
2146 if (!master_th->th.th_teams_microtask || level > teams_level) {
2147 int new_level = parent_team->t.t_level + 1;
2148 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2149 new_level = parent_team->t.t_active_level + 1;
2150 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2151 } else {
2152 // AC: Do not increase parallel level at start of the teams construct
2153 int new_level = parent_team->t.t_level;
2154 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2155 new_level = parent_team->t.t_active_level;
2156 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2157 }
2158 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2159 // set primary thread's schedule as new run-time schedule
2160 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2161
2162 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2163 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2164
2165 // Update the floating point rounding in the team if required.
2166 propagateFPControl(team);
2167#if OMPD_SUPPORT
2168 if (ompd_state & OMPD_ENABLE_BP)
2169 ompd_bp_parallel_begin();
2170#endif
2171
2172 if (__kmp_tasking_mode != tskm_immediate_exec) {
2173 // Set primary thread's task team to team's task team. Unless this is hot
2174 // team, it should be NULL.
2175 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2176 parent_team->t.t_task_team[master_th->th.th_task_state]);
2177 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2178 "%p, new task_team %p / team %p\n",
2179 __kmp_gtid_from_thread(master_th),
2180 master_th->th.th_task_team, parent_team,
2181 team->t.t_task_team[master_th->th.th_task_state], team));
2182
2183 if (active_level || master_th->th.th_task_team) {
2184 // Take a memo of primary thread's task_state
2185 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
     // Memo stack is full: allocate one twice as large, copy the old entries,
     // zero the remainder, then swap it in and free the old buffer.
2186 if (master_th->th.th_task_state_top >=
2187 master_th->th.th_task_state_stack_sz) { // increase size
2188 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2189 kmp_uint8 *old_stack, *new_stack;
2190 kmp_uint32 i;
2191 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2192 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2193 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2194 }
2195 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2196 ++i) { // zero-init rest of stack
2197 new_stack[i] = 0;
2198 }
2199 old_stack = master_th->th.th_task_state_memo_stack;
2200 master_th->th.th_task_state_memo_stack = new_stack;
2201 master_th->th.th_task_state_stack_sz = new_size;
2202 __kmp_free(old_stack);
2203 }
2204 // Store primary thread's task_state on stack
2205 master_th->th
2206 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2207 master_th->th.th_task_state;
2208 master_th->th.th_task_state_top++;
2209#if KMP_NESTED_HOT_TEAMS
2210 if (master_th->th.th_hot_teams &&
2211 active_level < __kmp_hot_teams_max_level &&
2212 team == master_th->th.th_hot_teams[active_level].hot_team) {
2213 // Restore primary thread's nested state if nested hot team
     // NOTE(review): th_task_state_top was incremented just above, so this
     // reads the slot after the one that was written. If top now equals
     // th_task_state_stack_sz, the index is one past the range covered by
     // the growth check -- confirm the buffer is large enough for this read.
2214 master_th->th.th_task_state =
2215 master_th->th
2216 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2217 } else {
2218#endif
2219 master_th->th.th_task_state = 0;
2220#if KMP_NESTED_HOT_TEAMS
2221 }
2222#endif
2223 }
2224#if !KMP_NESTED_HOT_TEAMS
2225 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2226 (team == root->r.r_hot_team));
2227#endif
2228 }
2229
2230 KA_TRACE(
2231 20,
2232 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2233 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2234 team->t.t_nproc));
2235 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2236 (team->t.t_master_tid == 0 &&
2237 (team->t.t_parent == root->r.r_root_team ||
2238 team->t.t_parent->t.t_serialized)));
2239 KMP_MB();
2240
2241 /* now, setup the arguments */
2242 argv = (void **)team->t.t_argv;
2243 if (ap) {
2244 for (i = argc - 1; i >= 0; --i) {
2245 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2246 KMP_CHECK_UPDATE(*argv, new_argv);
2247 argv++;
2248 }
2249 } else {
2250 for (i = 0; i < argc; ++i) {
2251 // Get args from parent team for teams construct
2252 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2253 }
2254 }
2255
2256 /* now actually fork the threads */
2257 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2258 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2259 root->r.r_active = TRUE;
2260
2261 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2262 __kmp_setup_icv_copy(team, nthreads,
2263 &master_th->th.th_current_task->td_icvs, loc);
2264
2265#if OMPT_SUPPORT
2266 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2267#endif
2268
     // Matches the __kmp_acquire_bootstrap_lock() taken while reserving
     // threads above: this point is only reached with nthreads != 1, so the
     // lock was not released there and is still held.
2269 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2270
2271#if USE_ITT_BUILD
2272 if (team->t.t_active_level == 1 // only report frames at level 1
2273 && !master_th->th.th_teams_microtask) { // not in teams construct
2274#if USE_ITT_NOTIFY
2275 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2276 (__kmp_forkjoin_frames_mode == 3 ||
2277 __kmp_forkjoin_frames_mode == 1)) {
2278 kmp_uint64 tmp_time = 0;
2279 if (__itt_get_timestamp_ptr)
2280 tmp_time = __itt_get_timestamp();
2281 // Internal fork - report frame begin
2282 master_th->th.th_frame_time = tmp_time;
2283 if (__kmp_forkjoin_frames_mode == 3)
2284 team->t.t_region_time = tmp_time;
2285 } else
2286// only one notification scheme (either "submit" or "forking/joined", not both)
2287#endif /* USE_ITT_NOTIFY */
2288 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2289 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2290 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2291 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2292 }
2293 }
2294#endif /* USE_ITT_BUILD */
2295
2296 /* now go on and do the work */
2297 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2298 KMP_MB();
2299 KF_TRACE(10,
2300 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2301 root, team, master_th, gtid));
2302
2303#if USE_ITT_BUILD
2304 if (__itt_stack_caller_create_ptr) {
2305 // create new stack stitching id before entering fork barrier
2306 if (!enter_teams) {
2307 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2308 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2309 } else if (parent_team->t.t_serialized) {
2310 // keep stack stitching id in the serialized parent_team;
2311 // current team will be used for parallel inside the teams;
2312 // if parent_team is active, then it already keeps stack stitching id
2313 // for the league of teams
2314 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2315 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2316 }
2317 }
2318#endif /* USE_ITT_BUILD */
2319
2320 // AC: skip __kmp_internal_fork at teams construct, let only primary
2321 // threads execute
2322 if (ap) {
2323 __kmp_internal_fork(loc, gtid, team);
2324 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2325 "master_th=%p, gtid=%d\n",
2326 root, team, master_th, gtid));
2327 }
2328
     // GNU context: return before the primary thread's microtask invocation
     // below. NOTE(review): presumably the GNU-side caller runs the microtask
     // itself -- confirm against the GOMP entry points.
2329 if (call_context == fork_context_gnu) {
2330 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2331 return TRUE;
2332 }
2333
2334 /* Invoke microtask for PRIMARY thread */
2335 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2336 team->t.t_id, team->t.t_pkfn));
2337 } // END of timer KMP_fork_call block
2338
2339#if KMP_STATS_ENABLED
2340 // If beginning a teams construct, then change thread state
2341 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2342 if (!ap) {
2343 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2344 }
2345#endif
2346
     // Run the microtask on the primary thread through the team's invoker
     // (set from `invoker` above); a zero result trips KMP_ASSERT2.
2347 if (!team->t.t_invoke(gtid)) {
2348 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2349 }
2350
2351#if KMP_STATS_ENABLED
2352 // If was beginning of a teams construct, then reset thread state
2353 if (!ap) {
2354 KMP_SET_THREAD_STATE(previous_state);
2355 }
2356#endif
2357
2358 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2359 team->t.t_id, team->t.t_pkfn));
2360 KMP_MB(); /* Flush all pending memory write invalidates. */
2361
2362 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2363#if OMPT_SUPPORT
2364 if (ompt_enabled.enabled) {
2365 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2366 }
2367#endif
2368
2369 return TRUE;
2370}
2371
2372#if OMPT_SUPPORT
2373static inline void __kmp_join_restore_state(kmp_info_t *thread,
2374 kmp_team_t *team) {
2375 // restore state outside the region
2376 thread->th.ompt_thread_info.state =
2377 ((team->t.t_serialized) ? ompt_state_work_serial
2378 : ompt_state_work_parallel);
2379}
2380
2381static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2382 kmp_team_t *team, ompt_data_t *parallel_data,
2383 int flags, void *codeptr) {
2384 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2385 if (ompt_enabled.ompt_callback_parallel_end) {
2386 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2387 parallel_data, &(task_info->task_data), flags, codeptr);
2388 }
2389
2390 task_info->frame.enter_frame = ompt_data_none;
2391 __kmp_join_restore_state(thread, team);
2392}
2393#endif
2394
2395void __kmp_join_call(ident_t *loc, int gtid
2396#if OMPT_SUPPORT
2397 ,
2398 enum fork_context_e fork_context
2399#endif
2400 ,
2401 int exit_teams) {
2402 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2403 kmp_team_t *team;
2404 kmp_team_t *parent_team;
2405 kmp_info_t *master_th;
2406 kmp_root_t *root;
2407 int master_active;
2408
2409 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2410
2411 /* setup current data */
2412 master_th = __kmp_threads[gtid];
2413 root = master_th->th.th_root;
2414 team = master_th->th.th_team;
2415 parent_team = team->t.t_parent;
2416
2417 master_th->th.th_ident = loc;
2418
2419#if OMPT_SUPPORT
2420 void *team_microtask = (void *)team->t.t_pkfn;
2421 // For GOMP interface with serialized parallel, need the
2422 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2423 // and end-parallel events.
2424 if (ompt_enabled.enabled &&
2425 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2426 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2427 }
2428#endif
2429
2430#if KMP_DEBUG
2431 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2432 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2433 "th_task_team = %p\n",
2434 __kmp_gtid_from_thread(master_th), team,
2435 team->t.t_task_team[master_th->th.th_task_state],
2436 master_th->th.th_task_team));
2437 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2438 team->t.t_task_team[master_th->th.th_task_state]);
2439 }
2440#endif
2441
2442 if (team->t.t_serialized) {
2443 if (master_th->th.th_teams_microtask) {
2444 // We are in teams construct
2445 int level = team->t.t_level;
2446 int tlevel = master_th->th.th_teams_level;
2447 if (level == tlevel) {
2448 // AC: we haven't incremented it earlier at start of teams construct,
2449 // so do it here - at the end of teams construct
2450 team->t.t_level++;
2451 } else if (level == tlevel + 1) {
2452 // AC: we are exiting parallel inside teams, need to increment
2453 // serialization in order to restore it in the next call to
2454 // __kmpc_end_serialized_parallel
2455 team->t.t_serialized++;
2456 }
2457 }
2459
2460#if OMPT_SUPPORT
2461 if (ompt_enabled.enabled) {
2462 if (fork_context == fork_context_gnu) {
2463 __ompt_lw_taskteam_unlink(master_th);
2464 }
2465 __kmp_join_restore_state(master_th, parent_team);
2466 }
2467#endif
2468
2469 return;
2470 }
2471
2472 master_active = team->t.t_master_active;
2473
2474 if (!exit_teams) {
2475 // AC: No barrier for internal teams at exit from teams construct.
2476 // But there is barrier for external team (league).
2477 __kmp_internal_join(loc, gtid, team);
2478#if USE_ITT_BUILD
2479 if (__itt_stack_caller_create_ptr) {
2480 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2481 // destroy the stack stitching id after join barrier
2482 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2483 team->t.t_stack_id = NULL;
2484 }
2485#endif
2486 } else {
2487 master_th->th.th_task_state =
2488 0; // AC: no tasking in teams (out of any parallel)
2489#if USE_ITT_BUILD
2490 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2491 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2492 // destroy the stack stitching id on exit from the teams construct
2493 // if parent_team is active, then the id will be destroyed later on
2494 // by master of the league of teams
2495 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2496 parent_team->t.t_stack_id = NULL;
2497 }
2498#endif
2499 }
2500
2501 KMP_MB();
2502
2503#if OMPT_SUPPORT
2504 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2505 void *codeptr = team->t.ompt_team_info.master_return_address;
2506#endif
2507
2508#if USE_ITT_BUILD
2509 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2510 if (team->t.t_active_level == 1 &&
2511 (!master_th->th.th_teams_microtask || /* not in teams construct */
2512 master_th->th.th_teams_size.nteams == 1)) {
2513 master_th->th.th_ident = loc;
2514 // only one notification scheme (either "submit" or "forking/joined", not
2515 // both)
2516 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2517 __kmp_forkjoin_frames_mode == 3)
2518 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2519 master_th->th.th_frame_time, 0, loc,
2520 master_th->th.th_team_nproc, 1);
2521 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2522 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2523 __kmp_itt_region_joined(gtid);
2524 } // active_level == 1
2525#endif /* USE_ITT_BUILD */
2526
2527#if KMP_AFFINITY_SUPPORTED
2528 if (!exit_teams) {
2529 // Restore master thread's partition.
2530 master_th->th.th_first_place = team->t.t_first_place;
2531 master_th->th.th_last_place = team->t.t_last_place;
2532 }
2533#endif // KMP_AFFINITY_SUPPORTED
2534
2535 if (master_th->th.th_teams_microtask && !exit_teams &&
2536 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2537 team->t.t_level == master_th->th.th_teams_level + 1) {
2538// AC: We need to leave the team structure intact at the end of parallel
2539// inside the teams construct, so that at the next parallel same (hot) team
2540// works, only adjust nesting levels
2541#if OMPT_SUPPORT
2542 ompt_data_t ompt_parallel_data = ompt_data_none;
2543 if (ompt_enabled.enabled) {
2544 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2545 if (ompt_enabled.ompt_callback_implicit_task) {
2546 int ompt_team_size = team->t.t_nproc;
2547 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2548 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2549 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2550 }
2551 task_info->frame.exit_frame = ompt_data_none;
2552 task_info->task_data = ompt_data_none;
2553 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2554 __ompt_lw_taskteam_unlink(master_th);
2555 }
2556#endif
2557 /* Decrement our nested depth level */
2558 team->t.t_level--;
2559 team->t.t_active_level--;
2560 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2561
2562 // Restore number of threads in the team if needed. This code relies on
2563 // the proper adjustment of th_teams_size.nth after the fork in
2564 // __kmp_teams_master on each teams primary thread in the case that
2565 // __kmp_reserve_threads reduced it.
2566 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2567 int old_num = master_th->th.th_team_nproc;
2568 int new_num = master_th->th.th_teams_size.nth;
2569 kmp_info_t **other_threads = team->t.t_threads;
2570 team->t.t_nproc = new_num;
2571 for (int i = 0; i < old_num; ++i) {
2572 other_threads[i]->th.th_team_nproc = new_num;
2573 }
2574 // Adjust states of non-used threads of the team
2575 for (int i = old_num; i < new_num; ++i) {
2576 // Re-initialize thread's barrier data.
2577 KMP_DEBUG_ASSERT(other_threads[i]);
2578 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2579 for (int b = 0; b < bs_last_barrier; ++b) {
2580 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2581 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2582#if USE_DEBUGGER
2583 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2584#endif
2585 }
2586 if (__kmp_tasking_mode != tskm_immediate_exec) {
2587 // Synchronize thread's task state
2588 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2589 }
2590 }
2591 }
2592
2593#if OMPT_SUPPORT
2594 if (ompt_enabled.enabled) {
2595 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2596 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2597 }
2598#endif
2599
2600 return;
2601 }
2602
2603 /* do cleanup and restore the parent team */
2604 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2605 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2606
2607 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2608
2609 /* jc: The following lock has instructions with REL and ACQ semantics,
2610 separating the parallel user code called in this parallel region
2611 from the serial user code called after this function returns. */
2612 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2613
2614 if (!master_th->th.th_teams_microtask ||
2615 team->t.t_level > master_th->th.th_teams_level) {
2616 /* Decrement our nested depth level */
2617 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2618 }
2619 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2620
2621#if OMPT_SUPPORT
2622 if (ompt_enabled.enabled) {
2623 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2624 if (ompt_enabled.ompt_callback_implicit_task) {
2625 int flags = (team_microtask == (void *)__kmp_teams_master)
2626 ? ompt_task_initial
2627 : ompt_task_implicit;
2628 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2629 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2630 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2631 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2632 }
2633 task_info->frame.exit_frame = ompt_data_none;
2634 task_info->task_data = ompt_data_none;
2635 }
2636#endif
2637
2638 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2639 master_th, team));
2640 __kmp_pop_current_task_from_thread(master_th);
2641
2642 master_th->th.th_def_allocator = team->t.t_def_allocator;
2643
2644#if OMPD_SUPPORT
2645 if (ompd_state & OMPD_ENABLE_BP)
2646 ompd_bp_parallel_end();
2647#endif
2648 updateHWFPControl(team);
2649
2650 if (root->r.r_active != master_active)
2651 root->r.r_active = master_active;
2652
2653 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2654 master_th)); // this will free worker threads
2655
2656 /* this race was fun to find. make sure the following is in the critical
2657 region otherwise assertions may fail occasionally since the old team may be
2658 reallocated and the hierarchy appears inconsistent. it is actually safe to
2659 run and won't cause any bugs, but will cause those assertion failures. it's
2660 only one deref&assign so might as well put this in the critical region */
2661 master_th->th.th_team = parent_team;
2662 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2663 master_th->th.th_team_master = parent_team->t.t_threads[0];
2664 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2665
2666 /* restore serialized team, if need be */
2667 if (parent_team->t.t_serialized &&
2668 parent_team != master_th->th.th_serial_team &&
2669 parent_team != root->r.r_root_team) {
2670 __kmp_free_team(root,
2671 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2672 master_th->th.th_serial_team = parent_team;
2673 }
2674
2675 if (__kmp_tasking_mode != tskm_immediate_exec) {
2676 if (master_th->th.th_task_state_top >
2677 0) { // Restore task state from memo stack
2678 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2679 // Remember primary thread's state if we re-use this nested hot team
2680 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2681 master_th->th.th_task_state;
2682 --master_th->th.th_task_state_top; // pop
2683 // Now restore state at this level
2684 master_th->th.th_task_state =
2685 master_th->th
2686 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2687 } else if (team != root->r.r_hot_team) {
2688 // Reset the task state of primary thread if we are not hot team because
2689 // in this case all the worker threads will be free, and their task state
2690 // will be reset. If not reset the primary's, the task state will be
2691 // inconsistent.
2692 master_th->th.th_task_state = 0;
2693 }
2694 // Copy the task team from the parent team to the primary thread
2695 master_th->th.th_task_team =
2696 parent_team->t.t_task_team[master_th->th.th_task_state];
2697 KA_TRACE(20,
2698 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2699 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2700 parent_team));
2701 }
2702
2703 // TODO: GEH - cannot do this assertion because root thread not set up as
2704 // executing
2705 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2706 master_th->th.th_current_task->td_flags.executing = 1;
2707
2708 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2709
2710#if KMP_AFFINITY_SUPPORTED
2711 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2712 __kmp_reset_root_init_mask(gtid);
2713 }
2714#endif
2715#if OMPT_SUPPORT
2716 int flags =
2717 OMPT_INVOKER(fork_context) |
2718 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2719 : ompt_parallel_team);
2720 if (ompt_enabled.enabled) {
2721 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2722 codeptr);
2723 }
2724#endif
2725
2726 KMP_MB();
2727 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2728}
2729
2730/* Check whether we should push an internal control record onto the
2731 serial team stack. If so, do it. */
2732void __kmp_save_internal_controls(kmp_info_t *thread) {
2733
2734 if (thread->th.th_team != thread->th.th_serial_team) {
2735 return;
2736 }
2737 if (thread->th.th_team->t.t_serialized > 1) {
2738 int push = 0;
2739
2740 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2741 push = 1;
2742 } else {
2743 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2744 thread->th.th_team->t.t_serialized) {
2745 push = 1;
2746 }
2747 }
2748 if (push) { /* push a record on the serial team's stack */
2749 kmp_internal_control_t *control =
2750 (kmp_internal_control_t *)__kmp_allocate(
2751 sizeof(kmp_internal_control_t));
2752
2753 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2754
2755 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2756
2757 control->next = thread->th.th_team->t.t_control_stack_top;
2758 thread->th.th_team->t.t_control_stack_top = control;
2759 }
2760 }
2761}
2762
2763/* Changes set_nproc */
2764void __kmp_set_num_threads(int new_nth, int gtid) {
2765 kmp_info_t *thread;
2766 kmp_root_t *root;
2767
2768 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2769 KMP_DEBUG_ASSERT(__kmp_init_serial);
2770
2771 if (new_nth < 1)
2772 new_nth = 1;
2773 else if (new_nth > __kmp_max_nth)
2774 new_nth = __kmp_max_nth;
2775
2776 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2777 thread = __kmp_threads[gtid];
2778 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2779 return; // nothing to do
2780
2781 __kmp_save_internal_controls(thread);
2782
2783 set__nproc(thread, new_nth);
2784
2785 // If this omp_set_num_threads() call will cause the hot team size to be
2786 // reduced (in the absence of a num_threads clause), then reduce it now,
2787 // rather than waiting for the next parallel region.
2788 root = thread->th.th_root;
2789 if (__kmp_init_parallel && (!root->r.r_active) &&
2790 (root->r.r_hot_team->t.t_nproc > new_nth)
2791#if KMP_NESTED_HOT_TEAMS
2792 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2793#endif
2794 ) {
2795 kmp_team_t *hot_team = root->r.r_hot_team;
2796 int f;
2797
2798 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2799
2800 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2801 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2802 }
2803 // Release the extra threads we don't need any more.
2804 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2805 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2806 if (__kmp_tasking_mode != tskm_immediate_exec) {
2807 // When decreasing team size, threads no longer in the team should unref
2808 // task team.
2809 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2810 }
2811 __kmp_free_thread(hot_team->t.t_threads[f]);
2812 hot_team->t.t_threads[f] = NULL;
2813 }
2814 hot_team->t.t_nproc = new_nth;
2815#if KMP_NESTED_HOT_TEAMS
2816 if (thread->th.th_hot_teams) {
2817 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2818 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2819 }
2820#endif
2821
2822 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2823 hot_team->t.b->update_num_threads(new_nth);
2824 __kmp_add_threads_to_team(hot_team, new_nth);
2825 }
2826
2827 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2828
2829 // Update the t_nproc field in the threads that are still active.
2830 for (f = 0; f < new_nth; f++) {
2831 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2832 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2833 }
2834 // Special flag in case omp_set_num_threads() call
2835 hot_team->t.t_size_changed = -1;
2836 }
2837}
2838
2839/* Changes max_active_levels */
2840void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2841 kmp_info_t *thread;
2842
2843 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2844 "%d = (%d)\n",
2845 gtid, max_active_levels));
2846 KMP_DEBUG_ASSERT(__kmp_init_serial);
2847
2848 // validate max_active_levels
2849 if (max_active_levels < 0) {
2850 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2851 // We ignore this call if the user has specified a negative value.
2852 // The current setting won't be changed. The last valid setting will be
2853 // used. A warning will be issued (if warnings are allowed as controlled by
2854 // the KMP_WARNINGS env var).
2855 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2856 "max_active_levels for thread %d = (%d)\n",
2857 gtid, max_active_levels));
2858 return;
2859 }
2860 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2861 // it's OK, the max_active_levels is within the valid range: [ 0;
2862 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2863 // We allow a zero value. (implementation defined behavior)
2864 } else {
2865 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2866 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2867 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2868 // Current upper limit is MAX_INT. (implementation defined behavior)
2869 // If the input exceeds the upper limit, we correct the input to be the
2870 // upper limit. (implementation defined behavior)
2871 // Actually, the flow should never get here until we use MAX_INT limit.
2872 }
2873 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2874 "max_active_levels for thread %d = (%d)\n",
2875 gtid, max_active_levels));
2876
2877 thread = __kmp_threads[gtid];
2878
2879 __kmp_save_internal_controls(thread);
2880
2881 set__max_active_levels(thread, max_active_levels);
2882}
2883
2884/* Gets max_active_levels */
2885int __kmp_get_max_active_levels(int gtid) {
2886 kmp_info_t *thread;
2887
2888 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2889 KMP_DEBUG_ASSERT(__kmp_init_serial);
2890
2891 thread = __kmp_threads[gtid];
2892 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2893 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2894 "curtask_maxaclevel=%d\n",
2895 gtid, thread->th.th_current_task,
2896 thread->th.th_current_task->td_icvs.max_active_levels));
2897 return thread->th.th_current_task->td_icvs.max_active_levels;
2898}
2899
2900// nteams-var per-device ICV
2901void __kmp_set_num_teams(int num_teams) {
2902 if (num_teams > 0)
2903 __kmp_nteams = num_teams;
2904}
2905int __kmp_get_max_teams(void) { return __kmp_nteams; }
2906// teams-thread-limit-var per-device ICV
2907void __kmp_set_teams_thread_limit(int limit) {
2908 if (limit > 0)
2909 __kmp_teams_thread_limit = limit;
2910}
2911int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2912
2913KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2914KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2915
2916/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2917void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2918 kmp_info_t *thread;
2919 kmp_sched_t orig_kind;
2920 // kmp_team_t *team;
2921
2922 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2923 gtid, (int)kind, chunk));
2924 KMP_DEBUG_ASSERT(__kmp_init_serial);
2925
2926 // Check if the kind parameter is valid, correct if needed.
2927 // Valid parameters should fit in one of two intervals - standard or extended:
2928 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2929 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2930 orig_kind = kind;
2931 kind = __kmp_sched_without_mods(kind);
2932
2933 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2934 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2935 // TODO: Hint needs attention in case we change the default schedule.
2936 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2937 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2938 __kmp_msg_null);
2939 kind = kmp_sched_default;
2940 chunk = 0; // ignore chunk value in case of bad kind
2941 }
2942
2943 thread = __kmp_threads[gtid];
2944
2945 __kmp_save_internal_controls(thread);
2946
2947 if (kind < kmp_sched_upper_std) {
2948 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2949 // differ static chunked vs. unchunked: chunk should be invalid to
2950 // indicate unchunked schedule (which is the default)
2951 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2952 } else {
2953 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2954 __kmp_sch_map[kind - kmp_sched_lower - 1];
2955 }
2956 } else {
2957 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2958 // kmp_sched_lower - 2 ];
2959 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2960 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2961 kmp_sched_lower - 2];
2962 }
2963 __kmp_sched_apply_mods_intkind(
2964 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2965 if (kind == kmp_sched_auto || chunk < 1) {
2966 // ignore parameter chunk for schedule auto
2967 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2968 } else {
2969 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2970 }
2971}
2972
2973/* Gets def_sched_var ICV values */
2974void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2975 kmp_info_t *thread;
2976 enum sched_type th_type;
2977
2978 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2979 KMP_DEBUG_ASSERT(__kmp_init_serial);
2980
2981 thread = __kmp_threads[gtid];
2982
2983 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2984 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2985 case kmp_sch_static:
2986 case kmp_sch_static_greedy:
2987 case kmp_sch_static_balanced:
2988 *kind = kmp_sched_static;
2989 __kmp_sched_apply_mods_stdkind(kind, th_type);
2990 *chunk = 0; // chunk was not set, try to show this fact via zero value
2991 return;
2992 case kmp_sch_static_chunked:
2993 *kind = kmp_sched_static;
2994 break;
2995 case kmp_sch_dynamic_chunked:
2996 *kind = kmp_sched_dynamic;
2997 break;
2999 case kmp_sch_guided_iterative_chunked:
3000 case kmp_sch_guided_analytical_chunked:
3001 *kind = kmp_sched_guided;
3002 break;
3003 case kmp_sch_auto:
3004 *kind = kmp_sched_auto;
3005 break;
3006 case kmp_sch_trapezoidal:
3007 *kind = kmp_sched_trapezoidal;
3008 break;
3009#if KMP_STATIC_STEAL_ENABLED
3010 case kmp_sch_static_steal:
3011 *kind = kmp_sched_static_steal;
3012 break;
3013#endif
3014 default:
3015 KMP_FATAL(UnknownSchedulingType, th_type);
3016 }
3017
3018 __kmp_sched_apply_mods_stdkind(kind, th_type);
3019 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3020}
3021
3022int __kmp_get_ancestor_thread_num(int gtid, int level) {
3023
3024 int ii, dd;
3025 kmp_team_t *team;
3026 kmp_info_t *thr;
3027
3028 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3029 KMP_DEBUG_ASSERT(__kmp_init_serial);
3030
3031 // validate level
3032 if (level == 0)
3033 return 0;
3034 if (level < 0)
3035 return -1;
3036 thr = __kmp_threads[gtid];
3037 team = thr->th.th_team;
3038 ii = team->t.t_level;
3039 if (level > ii)
3040 return -1;
3041
3042 if (thr->th.th_teams_microtask) {
3043 // AC: we are in teams region where multiple nested teams have same level
3044 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3045 if (level <=
3046 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3047 KMP_DEBUG_ASSERT(ii >= tlevel);
3048 // AC: As we need to pass by the teams league, we need to artificially
3049 // increase ii
3050 if (ii == tlevel) {
3051 ii += 2; // three teams have same level
3052 } else {
3053 ii++; // two teams have same level
3054 }
3055 }
3056 }
3057
3058 if (ii == level)
3059 return __kmp_tid_from_gtid(gtid);
3060
3061 dd = team->t.t_serialized;
3062 level++;
3063 while (ii > level) {
3064 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3065 }
3066 if ((team->t.t_serialized) && (!dd)) {
3067 team = team->t.t_parent;
3068 continue;
3069 }
3070 if (ii > level) {
3071 team = team->t.t_parent;
3072 dd = team->t.t_serialized;
3073 ii--;
3074 }
3075 }
3076
3077 return (dd > 1) ? (0) : (team->t.t_master_tid);
3078}
3079
3080int __kmp_get_team_size(int gtid, int level) {
3081
3082 int ii, dd;
3083 kmp_team_t *team;
3084 kmp_info_t *thr;
3085
3086 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3087 KMP_DEBUG_ASSERT(__kmp_init_serial);
3088
3089 // validate level
3090 if (level == 0)
3091 return 1;
3092 if (level < 0)
3093 return -1;
3094 thr = __kmp_threads[gtid];
3095 team = thr->th.th_team;
3096 ii = team->t.t_level;
3097 if (level > ii)
3098 return -1;
3099
3100 if (thr->th.th_teams_microtask) {
3101 // AC: we are in teams region where multiple nested teams have same level
3102 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3103 if (level <=
3104 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3105 KMP_DEBUG_ASSERT(ii >= tlevel);
3106 // AC: As we need to pass by the teams league, we need to artificially
3107 // increase ii
3108 if (ii == tlevel) {
3109 ii += 2; // three teams have same level
3110 } else {
3111 ii++; // two teams have same level
3112 }
3113 }
3114 }
3115
3116 while (ii > level) {
3117 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3118 }
3119 if (team->t.t_serialized && (!dd)) {
3120 team = team->t.t_parent;
3121 continue;
3122 }
3123 if (ii > level) {
3124 team = team->t.t_parent;
3125 ii--;
3126 }
3127 }
3128
3129 return team->t.t_nproc;
3130}
3131
3132kmp_r_sched_t __kmp_get_schedule_global() {
3133 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3134 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3135 // independently. So one can get the updated schedule here.
3136
3137 kmp_r_sched_t r_sched;
3138
3139 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3140 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3141 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3142 // different roots (even in OMP 2.5)
3143 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3144 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3145 if (s == kmp_sch_static) {
3146 // replace STATIC with more detailed schedule (balanced or greedy)
3147 r_sched.r_sched_type = __kmp_static;
3148 } else if (s == kmp_sch_guided_chunked) {
3149 // replace GUIDED with more detailed schedule (iterative or analytical)
3150 r_sched.r_sched_type = __kmp_guided;
3151 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3152 r_sched.r_sched_type = __kmp_sched;
3153 }
3154 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3155
3156 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3157 // __kmp_chunk may be wrong here (if it was not ever set)
3158 r_sched.chunk = KMP_DEFAULT_CHUNK;
3159 } else {
3160 r_sched.chunk = __kmp_chunk;
3161 }
3162
3163 return r_sched;
3164}
3165
3166/* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3167 at least argc number of *t_argv entries for the requested team. */
3168static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3169
3170 KMP_DEBUG_ASSERT(team);
3171 if (!realloc || argc > team->t.t_max_argc) {
3172
3173 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3174 "current entries=%d\n",
3175 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3176 /* if previously allocated heap space for args, free them */
3177 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3178 __kmp_free((void *)team->t.t_argv);
3179
3180 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3181 /* use unused space in the cache line for arguments */
3182 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3183 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3184 "argv entries\n",
3185 team->t.t_id, team->t.t_max_argc));
3186 team->t.t_argv = &team->t.t_inline_argv[0];
3187 if (__kmp_storage_map) {
3188 __kmp_print_storage_map_gtid(
3189 -1, &team->t.t_inline_argv[0],
3190 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3191 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3192 team->t.t_id);
3193 }
3194 } else {
3195 /* allocate space for arguments in the heap */
3196 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3197 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3198 : 2 * argc;
3199 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3200 "argv entries\n",
3201 team->t.t_id, team->t.t_max_argc));
3202 team->t.t_argv =
3203 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3204 if (__kmp_storage_map) {
3205 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3206 &team->t.t_argv[team->t.t_max_argc],
3207 sizeof(void *) * team->t.t_max_argc,
3208 "team_%d.t_argv", team->t.t_id);
3209 }
3210 }
3211 }
3212}
3213
3214static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3215 int i;
3216 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3217 team->t.t_threads =
3218 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3219 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3220 sizeof(dispatch_shared_info_t) * num_disp_buff);
3221 team->t.t_dispatch =
3222 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3223 team->t.t_implicit_task_taskdata =
3224 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3225 team->t.t_max_nproc = max_nth;
3226
3227 /* setup dispatch buffers */
3228 for (i = 0; i < num_disp_buff; ++i) {
3229 team->t.t_disp_buffer[i].buffer_index = i;
3230 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3231 }
3232}
3233
3234static void __kmp_free_team_arrays(kmp_team_t *team) {
3235 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3236 int i;
3237 for (i = 0; i < team->t.t_max_nproc; ++i) {
3238 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3239 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3240 team->t.t_dispatch[i].th_disp_buffer = NULL;
3241 }
3242 }
3243#if KMP_USE_HIER_SCHED
3244 __kmp_dispatch_free_hierarchies(team);
3245#endif
3246 __kmp_free(team->t.t_threads);
3247 __kmp_free(team->t.t_disp_buffer);
3248 __kmp_free(team->t.t_dispatch);
3249 __kmp_free(team->t.t_implicit_task_taskdata);
3250 team->t.t_threads = NULL;
3251 team->t.t_disp_buffer = NULL;
3252 team->t.t_dispatch = NULL;
3253 team->t.t_implicit_task_taskdata = 0;
3254}
3255
3256static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3257 kmp_info_t **oldThreads = team->t.t_threads;
3258
3259 __kmp_free(team->t.t_disp_buffer);
3260 __kmp_free(team->t.t_dispatch);
3261 __kmp_free(team->t.t_implicit_task_taskdata);
3262 __kmp_allocate_team_arrays(team, max_nth);
3263
3264 KMP_MEMCPY(team->t.t_threads, oldThreads,
3265 team->t.t_nproc * sizeof(kmp_info_t *));
3266
3267 __kmp_free(oldThreads);
3268}
3269
3270static kmp_internal_control_t __kmp_get_global_icvs(void) {
3271
3272 kmp_r_sched_t r_sched =
3273 __kmp_get_schedule_global(); // get current state of scheduling globals
3274
3275 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3276
3277 kmp_internal_control_t g_icvs = {
3278 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3279 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3280 // adjustment of threads (per thread)
3281 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3282 // whether blocktime is explicitly set
3283 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3284#if KMP_USE_MONITOR
3285 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3286// intervals
3287#endif
3288 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3289 // next parallel region (per thread)
3290 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3291 __kmp_cg_max_nth, // int thread_limit;
3292 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3293 // for max_active_levels
3294 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3295 // {sched,chunk} pair
3296 __kmp_nested_proc_bind.bind_types[0],
3297 __kmp_default_device,
3298 NULL // struct kmp_internal_control *next;
3299 };
3300
3301 return g_icvs;
3302}
3303
3304static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3305
3306 kmp_internal_control_t gx_icvs;
3307 gx_icvs.serial_nesting_level =
3308 0; // probably =team->t.t_serial like in save_inter_controls
3309 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3310 gx_icvs.next = NULL;
3311
3312 return gx_icvs;
3313}
3314
3315static void __kmp_initialize_root(kmp_root_t *root) {
3316 int f;
3317 kmp_team_t *root_team;
3318 kmp_team_t *hot_team;
3319 int hot_team_max_nth;
3320 kmp_r_sched_t r_sched =
3321 __kmp_get_schedule_global(); // get current state of scheduling globals
3322 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3323 KMP_DEBUG_ASSERT(root);
3324 KMP_ASSERT(!root->r.r_begin);
3325
3326 /* setup the root state structure */
3327 __kmp_init_lock(&root->r.r_begin_lock);
3328 root->r.r_begin = FALSE;
3329 root->r.r_active = FALSE;
3330 root->r.r_in_parallel = 0;
3331 root->r.r_blocktime = __kmp_dflt_blocktime;
3332#if KMP_AFFINITY_SUPPORTED
3333 root->r.r_affinity_assigned = FALSE;
3334#endif
3335
3336 /* setup the root team for this task */
3337 /* allocate the root team structure */
3338 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3339
3340 root_team =
3341 __kmp_allocate_team(root,
3342 1, // new_nproc
3343 1, // max_nproc
3344#if OMPT_SUPPORT
3345 ompt_data_none, // root parallel id
3346#endif
3347 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3348 0 // argc
3349 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3350 );
3351#if USE_DEBUGGER
3352 // Non-NULL value should be assigned to make the debugger display the root
3353 // team.
3354 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3355#endif
3356
3357 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3358
3359 root->r.r_root_team = root_team;
3360 root_team->t.t_control_stack_top = NULL;
3361
3362 /* initialize root team */
3363 root_team->t.t_threads[0] = NULL;
3364 root_team->t.t_nproc = 1;
3365 root_team->t.t_serialized = 1;
3366 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3367 root_team->t.t_sched.sched = r_sched.sched;
3368 KA_TRACE(
3369 20,
3370 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3371 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3372
3373 /* setup the hot team for this task */
3374 /* allocate the hot team structure */
3375 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3376
3377 hot_team =
3378 __kmp_allocate_team(root,
3379 1, // new_nproc
3380 __kmp_dflt_team_nth_ub * 2, // max_nproc
3381#if OMPT_SUPPORT
3382 ompt_data_none, // root parallel id
3383#endif
3384 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3385 0 // argc
3386 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3387 );
3388 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3389
3390 root->r.r_hot_team = hot_team;
3391 root_team->t.t_control_stack_top = NULL;
3392
3393 /* first-time initialization */
3394 hot_team->t.t_parent = root_team;
3395
3396 /* initialize hot team */
3397 hot_team_max_nth = hot_team->t.t_max_nproc;
3398 for (f = 0; f < hot_team_max_nth; ++f) {
3399 hot_team->t.t_threads[f] = NULL;
3400 }
3401 hot_team->t.t_nproc = 1;
3402 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3403 hot_team->t.t_sched.sched = r_sched.sched;
3404 hot_team->t.t_size_changed = 0;
3405}
3406
3407#ifdef KMP_DEBUG
3408
3409typedef struct kmp_team_list_item {
3410 kmp_team_p const *entry;
3411 struct kmp_team_list_item *next;
3412} kmp_team_list_item_t;
3413typedef kmp_team_list_item_t *kmp_team_list_t;
3414
3415static void __kmp_print_structure_team_accum( // Add team to list of teams.
3416 kmp_team_list_t list, // List of teams.
3417 kmp_team_p const *team // Team to add.
3418) {
3419
3420 // List must terminate with item where both entry and next are NULL.
3421 // Team is added to the list only once.
3422 // List is sorted in ascending order by team id.
3423 // Team id is *not* a key.
3424
3425 kmp_team_list_t l;
3426
3427 KMP_DEBUG_ASSERT(list != NULL);
3428 if (team == NULL) {
3429 return;
3430 }
3431
3432 __kmp_print_structure_team_accum(list, team->t.t_parent);
3433 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3434
3435 // Search list for the team.
3436 l = list;
3437 while (l->next != NULL && l->entry != team) {
3438 l = l->next;
3439 }
3440 if (l->next != NULL) {
3441 return; // Team has been added before, exit.
3442 }
3443
3444 // Team is not found. Search list again for insertion point.
3445 l = list;
3446 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3447 l = l->next;
3448 }
3449
3450 // Insert team.
3451 {
3452 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3453 sizeof(kmp_team_list_item_t));
3454 *item = *l;
3455 l->entry = team;
3456 l->next = item;
3457 }
3458}
3459
3460static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3461
3462) {
3463 __kmp_printf("%s", title);
3464 if (team != NULL) {
3465 __kmp_printf("%2x %p\n", team->t.t_id, team);
3466 } else {
3467 __kmp_printf(" - (nil)\n");
3468 }
3469}
3470
3471static void __kmp_print_structure_thread(char const *title,
3472 kmp_info_p const *thread) {
3473 __kmp_printf("%s", title);
3474 if (thread != NULL) {
3475 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3476 } else {
3477 __kmp_printf(" - (nil)\n");
3478 }
3479}
3480
3481void __kmp_print_structure(void) {
3482
3483 kmp_team_list_t list;
3484
3485 // Initialize list of teams.
3486 list =
3487 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3488 list->entry = NULL;
3489 list->next = NULL;
3490
3491 __kmp_printf("\n------------------------------\nGlobal Thread "
3492 "Table\n------------------------------\n");
3493 {
3494 int gtid;
3495 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3496 __kmp_printf("%2d", gtid);
3497 if (__kmp_threads != NULL) {
3498 __kmp_printf(" %p", __kmp_threads[gtid]);
3499 }
3500 if (__kmp_root != NULL) {
3501 __kmp_printf(" %p", __kmp_root[gtid]);
3502 }
3503 __kmp_printf("\n");
3504 }
3505 }
3506
3507 // Print out __kmp_threads array.
3508 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3509 "----------\n");
3510 if (__kmp_threads != NULL) {
3511 int gtid;
3512 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3513 kmp_info_t const *thread = __kmp_threads[gtid];
3514 if (thread != NULL) {
3515 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3516 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3517 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3518 __kmp_print_structure_team(" Serial Team: ",
3519 thread->th.th_serial_team);
3520 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3521 __kmp_print_structure_thread(" Primary: ",
3522 thread->th.th_team_master);
3523 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3524 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3525 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3526 __kmp_print_structure_thread(" Next in pool: ",
3527 thread->th.th_next_pool);
3528 __kmp_printf("\n");
3529 __kmp_print_structure_team_accum(list, thread->th.th_team);
3530 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3531 }
3532 }
3533 } else {
3534 __kmp_printf("Threads array is not allocated.\n");
3535 }
3536
3537 // Print out __kmp_root array.
3538 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3539 "--------\n");
3540 if (__kmp_root != NULL) {
3541 int gtid;
3542 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3543 kmp_root_t const *root = __kmp_root[gtid];
3544 if (root != NULL) {
3545 __kmp_printf("GTID %2d %p:\n", gtid, root);
3546 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3547 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3548 __kmp_print_structure_thread(" Uber Thread: ",
3549 root->r.r_uber_thread);
3550 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3551 __kmp_printf(" In Parallel: %2d\n",
3552 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3553 __kmp_printf("\n");
3554 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3555 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3556 }
3557 }
3558 } else {
3559 __kmp_printf("Ubers array is not allocated.\n");
3560 }
3561
3562 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3563 "--------\n");
3564 while (list->next != NULL) {
3565 kmp_team_p const *team = list->entry;
3566 int i;
3567 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3568 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3569 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3570 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3571 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3572 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3573 for (i = 0; i < team->t.t_nproc; ++i) {
3574 __kmp_printf(" Thread %2d: ", i);
3575 __kmp_print_structure_thread("", team->t.t_threads[i]);
3576 }
3577 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3578 __kmp_printf("\n");
3579 list = list->next;
3580 }
3581
3582 // Print out __kmp_thread_pool and __kmp_team_pool.
3583 __kmp_printf("\n------------------------------\nPools\n----------------------"
3584 "--------\n");
3585 __kmp_print_structure_thread("Thread pool: ",
3586 CCAST(kmp_info_t *, __kmp_thread_pool));
3587 __kmp_print_structure_team("Team pool: ",
3588 CCAST(kmp_team_t *, __kmp_team_pool));
3589 __kmp_printf("\n");
3590
3591 // Free team list.
3592 while (list != NULL) {
3593 kmp_team_list_item_t *item = list;
3594 list = list->next;
3595 KMP_INTERNAL_FREE(item);
3596 }
3597}
3598
3599#endif
3600
3601//---------------------------------------------------------------------------
3602// Stuff for per-thread fast random number generator
3603// Table of primes
// 64 multiplier constants for the per-thread generator. __kmp_init_random()
// picks one entry, indexed by the thread's tid modulo the table size, and
// stores it as the thread's multiplier (th_a).
// NOTE(review): primality of the individual entries is taken from the table's
// name; it is not checked anywhere in this file section.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3616
3617//---------------------------------------------------------------------------
3618// __kmp_get_random: Get a random number using a linear congruential method.
3619unsigned short __kmp_get_random(kmp_info_t *thread) {
3620 unsigned x = thread->th.th_x;
3621 unsigned short r = (unsigned short)(x >> 16);
3622
3623 thread->th.th_x = x * thread->th.th_a + 1;
3624
3625 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3626 thread->th.th_info.ds.ds_tid, r));
3627
3628 return r;
3629}
3630//--------------------------------------------------------
3631// __kmp_init_random: Initialize a random number generator
3632void __kmp_init_random(kmp_info_t *thread) {
3633 unsigned seed = thread->th.th_info.ds.ds_tid;
3634
3635 thread->th.th_a =
3636 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3637 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3638 KA_TRACE(30,
3639 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3640}
3641
3642#if KMP_OS_WINDOWS
3643/* reclaim array entries for root threads that are already dead, returns number
3644 * reclaimed */
3645static int __kmp_reclaim_dead_roots(void) {
3646 int i, r = 0;
3647
3648 for (i = 0; i < __kmp_threads_capacity; ++i) {
3649 if (KMP_UBER_GTID(i) &&
3650 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3651 !__kmp_root[i]
3652 ->r.r_active) { // AC: reclaim only roots died in non-active state
3653 r += __kmp_unregister_root_other_thread(i);
3654 }
3655 }
3656 return r;
3657}
3658#endif
3659
3660/* This function attempts to create free entries in __kmp_threads and
3661 __kmp_root, and returns the number of free entries generated.
3662
3663 For Windows* OS static library, the first mechanism used is to reclaim array
3664 entries for root threads that are already dead.
3665
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
   capacity is increased by doubling with clipping to __kmp_sys_max_nth. If
   the new capacity exceeds __kmp_tp_capacity, the threadprivate cache array
   is resized if it has been created; otherwise __kmp_tp_capacity is raised to
   the new capacity. Synchronization with __kmpc_threadprivate_cached is done
   using __kmp_tp_cached_lock.
3671
3672 After any dead root reclamation, if the clipping value allows array expansion
3673 to result in the generation of a total of nNeed free slots, the function does
3674 that expansion. If not, nothing is done beyond the possible initial root
3675 thread reclamation.
3676
3677 If any argument is negative, the behavior is undefined. */
// nNeed: number of additional free slots the caller wants.
// Returns the number of slots gained: entries reclaimed from dead roots
// (Windows static library only) plus the capacity added by growing the arrays.
// Returns without growing when nNeed <= 0 or when __kmp_sys_max_nth leaves
// less than nNeed slots of headroom.
static int __kmp_expand_threads(int nNeed) {
  int added = 0;
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  // Slots obtained by reclamation count against the request.
  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
  // may not be reused by another thread, so we may need to increase
  // __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered. We always register new foreign
  // roots. This may cause a smaller # of threads to be allocated at
  // subsequent parallel regions, but the worker threads hang around (and
  // eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  // Double the capacity until it covers the request; once doubling would pass
  // __kmp_sys_max_nth, jump straight to __kmp_sys_max_nth instead.
  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  // One allocation holds both arrays: newCapacity thread pointers followed by
  // newCapacity root pointers (plus CACHE_LINE extra bytes).
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  // Carry the existing entries over to the new arrays.
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
  // Put old __kmp_threads array on a list. Any ongoing references to the old
  // list will be valid. This list is cleaned up at library shutdown.
  kmp_old_threads_list_t *node =
      (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
  node->threads = __kmp_threads;
  node->next = __kmp_old_threads_list;
  __kmp_old_threads_list = node;

  // Publish the new arrays first and the new capacity last, using volatile
  // stores. `added` is computed in between because it still needs the old
  // capacity.
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  // Keep the threadprivate capacity in step with the thread table. The
  // condition is re-checked after taking __kmp_tp_cached_lock.
  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}
3768
/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3772int __kmp_register_root(int initial_thread) {
3773 kmp_info_t *root_thread;
3774 kmp_root_t *root;
3775 int gtid;
3776 int capacity;
3777 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3778 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3779 KMP_MB();
3780
3781 /* 2007-03-02:
3782 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3783 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3784 work as expected -- it may return false (that means there is at least one
3785 empty slot in __kmp_threads array), but it is possible the only free slot
3786 is #0, which is reserved for initial thread and so cannot be used for this
3787 one. Following code workarounds this bug.
3788
3789 However, right solution seems to be not reserving slot #0 for initial
3790 thread because:
3791 (1) there is no magic in slot #0,
3792 (2) we cannot detect initial thread reliably (the first thread which does
3793 serial initialization may be not a real initial thread).
3794 */
3795 capacity = __kmp_threads_capacity;
3796 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3797 --capacity;
3798 }
3799
3800 // If it is not for initializing the hidden helper team, we need to take
3801 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3802 // in __kmp_threads_capacity.
3803 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3804 capacity -= __kmp_hidden_helper_threads_num;
3805 }
3806
3807 /* see if there are too many threads */
3808 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3809 if (__kmp_tp_cached) {
3810 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3811 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3812 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3813 } else {
3814 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3815 __kmp_msg_null);
3816 }
3817 }
3818
3819 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3820 // 0: initial thread, also a regular OpenMP thread.
3821 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3822 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3823 // regular OpenMP threads.
3824 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3825 // Find an available thread slot for hidden helper thread. Slots for hidden
3826 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3827 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3828 gtid <= __kmp_hidden_helper_threads_num;
3829 gtid++)
3830 ;
3831 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3832 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3833 "hidden helper thread: T#%d\n",
3834 gtid));
3835 } else {
3836 /* find an available thread slot */
3837 // Don't reassign the zero slot since we need that to only be used by
3838 // initial thread. Slots for hidden helper threads should also be skipped.
3839 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3840 gtid = 0;
3841 } else {
3842 for (gtid = __kmp_hidden_helper_threads_num + 1;
3843 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3844 ;
3845 }
3846 KA_TRACE(
3847 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3848 KMP_ASSERT(gtid < __kmp_threads_capacity);
3849 }
3850
3851 /* update global accounting */
3852 __kmp_all_nth++;
3853 TCW_4(__kmp_nth, __kmp_nth + 1);
3854
3855 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3856 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3857 if (__kmp_adjust_gtid_mode) {
3858 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3859 if (TCR_4(__kmp_gtid_mode) != 2) {
3860 TCW_4(__kmp_gtid_mode, 2);
3861 }
3862 } else {
3863 if (TCR_4(__kmp_gtid_mode) != 1) {
3864 TCW_4(__kmp_gtid_mode, 1);
3865 }
3866 }
3867 }
3868
3869#ifdef KMP_ADJUST_BLOCKTIME
3870 /* Adjust blocktime to zero if necessary */
3871 /* Middle initialization might not have occurred yet */
3872 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3873 if (__kmp_nth > __kmp_avail_proc) {
3874 __kmp_zero_bt = TRUE;
3875 }
3876 }
3877#endif /* KMP_ADJUST_BLOCKTIME */
3878
3879 /* setup this new hierarchy */
3880 if (!(root = __kmp_root[gtid])) {
3881 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3882 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3883 }
3884
3885#if KMP_STATS_ENABLED
3886 // Initialize stats as soon as possible (right after gtid assignment).
3887 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3888 __kmp_stats_thread_ptr->startLife();
3889 KMP_SET_THREAD_STATE(SERIAL_REGION);
3890 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3891#endif
3892 __kmp_initialize_root(root);
3893
3894 /* setup new root thread structure */
3895 if (root->r.r_uber_thread) {
3896 root_thread = root->r.r_uber_thread;
3897 } else {
3898 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3899 if (__kmp_storage_map) {
3900 __kmp_print_thread_storage_map(root_thread, gtid);
3901 }
3902 root_thread->th.th_info.ds.ds_gtid = gtid;
3903#if OMPT_SUPPORT
3904 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3905#endif
3906 root_thread->th.th_root = root;
3907 if (__kmp_env_consistency_check) {
3908 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3909 }
3910#if USE_FAST_MEMORY
3911 __kmp_initialize_fast_memory(root_thread);
3912#endif /* USE_FAST_MEMORY */
3913
3914#if KMP_USE_BGET
3915 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3916 __kmp_initialize_bget(root_thread);
3917#endif
3918 __kmp_init_random(root_thread); // Initialize random number generator
3919 }
3920
3921 /* setup the serial team held in reserve by the root thread */
3922 if (!root_thread->th.th_serial_team) {
3923 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3924 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3925 root_thread->th.th_serial_team = __kmp_allocate_team(
3926 root, 1, 1,
3927#if OMPT_SUPPORT
3928 ompt_data_none, // root parallel id
3929#endif
3930 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3931 }
3932 KMP_ASSERT(root_thread->th.th_serial_team);
3933 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3934 root_thread->th.th_serial_team));
3935
3936 /* drop root_thread into place */
3937 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3938
3939 root->r.r_root_team->t.t_threads[0] = root_thread;
3940 root->r.r_hot_team->t.t_threads[0] = root_thread;
3941 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3942 // AC: the team created in reserve, not for execution (it is unused for now).
3943 root_thread->th.th_serial_team->t.t_serialized = 0;
3944 root->r.r_uber_thread = root_thread;
3945
3946 /* initialize the thread, get it ready to go */
3947 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3948 TCW_4(__kmp_init_gtid, TRUE);
3949
3950 /* prepare the primary thread for get_gtid() */
3951 __kmp_gtid_set_specific(gtid);
3952
3953#if USE_ITT_BUILD
3954 __kmp_itt_thread_name(gtid);
3955#endif /* USE_ITT_BUILD */
3956
3957#ifdef KMP_TDATA_GTID
3958 __kmp_gtid = gtid;
3959#endif
3960 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3961 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3962
3963 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3964 "plain=%u\n",
3965 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3966 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3967 KMP_INIT_BARRIER_STATE));
3968 { // Initialize barrier data.
3969 int b;
3970 for (b = 0; b < bs_last_barrier; ++b) {
3971 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3972#if USE_DEBUGGER
3973 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3974#endif
3975 }
3976 }
3977 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3978 KMP_INIT_BARRIER_STATE);
3979
3980#if KMP_AFFINITY_SUPPORTED
3981 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3982 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3983 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3984 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3985#endif /* KMP_AFFINITY_SUPPORTED */
3986 root_thread->th.th_def_allocator = __kmp_def_allocator;
3987 root_thread->th.th_prev_level = 0;
3988 root_thread->th.th_prev_num_threads = 1;
3989
3990 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3991 tmp->cg_root = root_thread;
3992 tmp->cg_thread_limit = __kmp_cg_max_nth;
3993 tmp->cg_nthreads = 1;
3994 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3995 " cg_nthreads init to 1\n",
3996 root_thread, tmp));
3997 tmp->up = NULL;
3998 root_thread->th.th_cg_roots = tmp;
3999
4000 __kmp_root_counter++;
4001
4002#if OMPT_SUPPORT
4003 if (!initial_thread && ompt_enabled.enabled) {
4004
4005 kmp_info_t *root_thread = ompt_get_thread();
4006
4007 ompt_set_thread_state(root_thread, ompt_state_overhead);
4008
4009 if (ompt_enabled.ompt_callback_thread_begin) {
4010 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4011 ompt_thread_initial, __ompt_get_thread_data_internal());
4012 }
4013 ompt_data_t *task_data;
4014 ompt_data_t *parallel_data;
4015 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4016 NULL);
4017 if (ompt_enabled.ompt_callback_implicit_task) {
4018 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4019 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4020 }
4021
4022 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4023 }
4024#endif
4025#if OMPD_SUPPORT
4026 if (ompd_state & OMPD_ENABLE_BP)
4027 ompd_bp_thread_begin();
4028#endif
4029
4030 KMP_MB();
4031 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4032
4033 return gtid;
4034}
4035
4036#if KMP_NESTED_HOT_TEAMS
4037static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4038 const int max_level) {
4039 int i, n, nth;
4040 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4041 if (!hot_teams || !hot_teams[level].hot_team) {
4042 return 0;
4043 }
4044 KMP_DEBUG_ASSERT(level < max_level);
4045 kmp_team_t *team = hot_teams[level].hot_team;
4046 nth = hot_teams[level].hot_team_nth;
4047 n = nth - 1; // primary thread is not freed
4048 if (level < max_level - 1) {
4049 for (i = 0; i < nth; ++i) {
4050 kmp_info_t *th = team->t.t_threads[i];
4051 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4052 if (i > 0 && th->th.th_hot_teams) {
4053 __kmp_free(th->th.th_hot_teams);
4054 th->th.th_hot_teams = NULL;
4055 }
4056 }
4057 }
4058 __kmp_free_team(root, team, NULL);
4059 return n;
4060}
4061#endif
4062
4063// Resets a root thread and clear its root and hot teams.
4064// Returns the number of __kmp_threads entries directly and indirectly freed.
4065static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4066 kmp_team_t *root_team = root->r.r_root_team;
4067 kmp_team_t *hot_team = root->r.r_hot_team;
4068 int n = hot_team->t.t_nproc;
4069 int i;
4070
4071 KMP_DEBUG_ASSERT(!root->r.r_active);
4072
4073 root->r.r_root_team = NULL;
4074 root->r.r_hot_team = NULL;
4075 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4076 // before call to __kmp_free_team().
4077 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4078#if KMP_NESTED_HOT_TEAMS
4079 if (__kmp_hot_teams_max_level >
4080 0) { // need to free nested hot teams and their threads if any
4081 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4082 kmp_info_t *th = hot_team->t.t_threads[i];
4083 if (__kmp_hot_teams_max_level > 1) {
4084 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4085 }
4086 if (th->th.th_hot_teams) {
4087 __kmp_free(th->th.th_hot_teams);
4088 th->th.th_hot_teams = NULL;
4089 }
4090 }
4091 }
4092#endif
4093 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4094
4095 // Before we can reap the thread, we need to make certain that all other
4096 // threads in the teams that had this root as ancestor have stopped trying to
4097 // steal tasks.
4098 if (__kmp_tasking_mode != tskm_immediate_exec) {
4099 __kmp_wait_to_unref_task_teams();
4100 }
4101
4102#if KMP_OS_WINDOWS
4103 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4104 KA_TRACE(
4105 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4106 "\n",
4107 (LPVOID) & (root->r.r_uber_thread->th),
4108 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4109 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4110#endif /* KMP_OS_WINDOWS */
4111
4112#if OMPD_SUPPORT
4113 if (ompd_state & OMPD_ENABLE_BP)
4114 ompd_bp_thread_end();
4115#endif
4116
4117#if OMPT_SUPPORT
4118 ompt_data_t *task_data;
4119 ompt_data_t *parallel_data;
4120 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4121 NULL);
4122 if (ompt_enabled.ompt_callback_implicit_task) {
4123 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4124 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4125 }
4126 if (ompt_enabled.ompt_callback_thread_end) {
4127 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4128 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4129 }
4130#endif
4131
4132 TCW_4(__kmp_nth,
4133 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4134 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4135 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4136 " to %d\n",
4137 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4138 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4139 if (i == 1) {
4140 // need to free contention group structure
4141 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4142 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4143 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4144 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4145 root->r.r_uber_thread->th.th_cg_roots = NULL;
4146 }
4147 __kmp_reap_thread(root->r.r_uber_thread, 1);
4148
4149 // We canot put root thread to __kmp_thread_pool, so we have to reap it
4150 // instead of freeing.
4151 root->r.r_uber_thread = NULL;
4152 /* mark root as no longer in use */
4153 root->r.r_begin = FALSE;
4154
4155 return n;
4156}
4157
4158void __kmp_unregister_root_current_thread(int gtid) {
4159 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4160 /* this lock should be ok, since unregister_root_current_thread is never
4161 called during an abort, only during a normal close. furthermore, if you
4162 have the forkjoin lock, you should never try to get the initz lock */
4163 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4164 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4165 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4166 "exiting T#%d\n",
4167 gtid));
4168 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4169 return;
4170 }
4171 kmp_root_t *root = __kmp_root[gtid];
4172
4173 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4174 KMP_ASSERT(KMP_UBER_GTID(gtid));
4175 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4176 KMP_ASSERT(root->r.r_active == FALSE);
4177
4178 KMP_MB();
4179
4180 kmp_info_t *thread = __kmp_threads[gtid];
4181 kmp_team_t *team = thread->th.th_team;
4182 kmp_task_team_t *task_team = thread->th.th_task_team;
4183
4184 // we need to wait for the proxy tasks before finishing the thread
4185 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4186 task_team->tt.tt_hidden_helper_task_encountered)) {
4187#if OMPT_SUPPORT
4188 // the runtime is shutting down so we won't report any events
4189 thread->th.ompt_thread_info.state = ompt_state_undefined;
4190#endif
4191 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4192 }
4193
4194 __kmp_reset_root(gtid, root);
4195
4196 KMP_MB();
4197 KC_TRACE(10,
4198 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4199
4200 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4201}
4202
4203#if KMP_OS_WINDOWS
4204/* __kmp_forkjoin_lock must be already held
4205 Unregisters a root thread that is not the current thread. Returns the number
4206 of __kmp_threads entries freed as a result. */
4207static int __kmp_unregister_root_other_thread(int gtid) {
4208 kmp_root_t *root = __kmp_root[gtid];
4209 int r;
4210
4211 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4212 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4213 KMP_ASSERT(KMP_UBER_GTID(gtid));
4214 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4215 KMP_ASSERT(root->r.r_active == FALSE);
4216
4217 r = __kmp_reset_root(gtid, root);
4218 KC_TRACE(10,
4219 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4220 return r;
4221}
4222#endif
4223
4224#if KMP_DEBUG
4225void __kmp_task_info() {
4226
4227 kmp_int32 gtid = __kmp_entry_gtid();
4228 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4229 kmp_info_t *this_thr = __kmp_threads[gtid];
4230 kmp_team_t *steam = this_thr->th.th_serial_team;
4231 kmp_team_t *team = this_thr->th.th_team;
4232
4233 __kmp_printf(
4234 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4235 "ptask=%p\n",
4236 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4237 team->t.t_implicit_task_taskdata[tid].td_parent);
4238}
4239#endif // KMP_DEBUG
4240
4241/* TODO optimize with one big memclr, take out what isn't needed, split
4242 responsibility to workers as much as possible, and delay initialization of
4243 features as much as possible */
4244static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4245 int tid, int gtid) {
4246 /* this_thr->th.th_info.ds.ds_gtid is setup in
4247 kmp_allocate_thread/create_worker.
4248 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4249 KMP_DEBUG_ASSERT(this_thr != NULL);
4250 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4251 KMP_DEBUG_ASSERT(team);
4252 KMP_DEBUG_ASSERT(team->t.t_threads);
4253 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4254 kmp_info_t *master = team->t.t_threads[0];
4255 KMP_DEBUG_ASSERT(master);
4256 KMP_DEBUG_ASSERT(master->th.th_root);
4257
4258 KMP_MB();
4259
4260 TCW_SYNC_PTR(this_thr->th.th_team, team);
4261
4262 this_thr->th.th_info.ds.ds_tid = tid;
4263 this_thr->th.th_set_nproc = 0;
4264 if (__kmp_tasking_mode != tskm_immediate_exec)
4265 // When tasking is possible, threads are not safe to reap until they are
4266 // done tasking; this will be set when tasking code is exited in wait
4267 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4268 else // no tasking --> always safe to reap
4269 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4270 this_thr->th.th_set_proc_bind = proc_bind_default;
4271#if KMP_AFFINITY_SUPPORTED
4272 this_thr->th.th_new_place = this_thr->th.th_current_place;
4273#endif
4274 this_thr->th.th_root = master->th.th_root;
4275
4276 /* setup the thread's cache of the team structure */
4277 this_thr->th.th_team_nproc = team->t.t_nproc;
4278 this_thr->th.th_team_master = master;
4279 this_thr->th.th_team_serialized = team->t.t_serialized;
4280
4281 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4282
4283 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4284 tid, gtid, this_thr, this_thr->th.th_current_task));
4285
4286 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4287 team, tid, TRUE);
4288
4289 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4290 tid, gtid, this_thr, this_thr->th.th_current_task));
4291 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4292 // __kmp_initialize_team()?
4293
4294 /* TODO no worksharing in speculative threads */
4295 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4296
4297 this_thr->th.th_local.this_construct = 0;
4298
4299 if (!this_thr->th.th_pri_common) {
4300 this_thr->th.th_pri_common =
4301 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4302 if (__kmp_storage_map) {
4303 __kmp_print_storage_map_gtid(
4304 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4305 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4306 }
4307 this_thr->th.th_pri_head = NULL;
4308 }
4309
4310 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4311 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4312 // Make new thread's CG root same as primary thread's
4313 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4314 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4315 if (tmp) {
4316 // worker changes CG, need to check if old CG should be freed
4317 int i = tmp->cg_nthreads--;
4318 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4319 " on node %p of thread %p to %d\n",
4320 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4321 if (i == 1) {
4322 __kmp_free(tmp); // last thread left CG --> free it
4323 }
4324 }
4325 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4326 // Increment new thread's CG root's counter to add the new thread
4327 this_thr->th.th_cg_roots->cg_nthreads++;
4328 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4329 " node %p of thread %p to %d\n",
4330 this_thr, this_thr->th.th_cg_roots,
4331 this_thr->th.th_cg_roots->cg_root,
4332 this_thr->th.th_cg_roots->cg_nthreads));
4333 this_thr->th.th_current_task->td_icvs.thread_limit =
4334 this_thr->th.th_cg_roots->cg_thread_limit;
4335 }
4336
4337 /* Initialize dynamic dispatch */
4338 {
4339 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4340 // Use team max_nproc since this will never change for the team.
4341 size_t disp_size =
4342 sizeof(dispatch_private_info_t) *
4343 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4344 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4345 team->t.t_max_nproc));
4346 KMP_ASSERT(dispatch);
4347 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4348 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4349
4350 dispatch->th_disp_index = 0;
4351 dispatch->th_doacross_buf_idx = 0;
4352 if (!dispatch->th_disp_buffer) {
4353 dispatch->th_disp_buffer =
4354 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4355
4356 if (__kmp_storage_map) {
4357 __kmp_print_storage_map_gtid(
4358 gtid, &dispatch->th_disp_buffer[0],
4359 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4360 ? 1
4361 : __kmp_dispatch_num_buffers],
4362 disp_size,
4363 "th_%d.th_dispatch.th_disp_buffer "
4364 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4365 gtid, team->t.t_id, gtid);
4366 }
4367 } else {
4368 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4369 }
4370
4371 dispatch->th_dispatch_pr_current = 0;
4372 dispatch->th_dispatch_sh_current = 0;
4373
4374 dispatch->th_deo_fcn = 0; /* ORDERED */
4375 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4376 }
4377
4378 this_thr->th.th_next_pool = NULL;
4379
4380 if (!this_thr->th.th_task_state_memo_stack) {
4381 size_t i;
4382 this_thr->th.th_task_state_memo_stack =
4383 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4384 this_thr->th.th_task_state_top = 0;
4385 this_thr->th.th_task_state_stack_sz = 4;
4386 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4387 ++i) // zero init the stack
4388 this_thr->th.th_task_state_memo_stack[i] = 0;
4389 }
4390
4391 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4392 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4393
4394 KMP_MB();
4395}
4396
4397/* allocate a new thread for the requesting team. this is only called from
4398 within a forkjoin critical section. we will first try to get an available
4399 thread from the thread pool. if none is available, we will fork a new one
4400 assuming we are able to create a new one. this should be assured, as the
4401 caller should check on this first. */
4402kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4403 int new_tid) {
4404 kmp_team_t *serial_team;
4405 kmp_info_t *new_thr;
4406 int new_gtid;
4407
4408 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4409 KMP_DEBUG_ASSERT(root && team);
4410#if !KMP_NESTED_HOT_TEAMS
4411 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4412#endif
4413 KMP_MB();
4414
4415 /* first, try to get one from the thread pool */
4416 if (__kmp_thread_pool) {
4417 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4418 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4419 if (new_thr == __kmp_thread_pool_insert_pt) {
4420 __kmp_thread_pool_insert_pt = NULL;
4421 }
4422 TCW_4(new_thr->th.th_in_pool, FALSE);
4423 __kmp_suspend_initialize_thread(new_thr);
4424 __kmp_lock_suspend_mx(new_thr);
4425 if (new_thr->th.th_active_in_pool == TRUE) {
4426 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4427 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4428 new_thr->th.th_active_in_pool = FALSE;
4429 }
4430 __kmp_unlock_suspend_mx(new_thr);
4431
4432 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4433 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4434 KMP_ASSERT(!new_thr->th.th_team);
4435 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4436
4437 /* setup the thread structure */
4438 __kmp_initialize_info(new_thr, team, new_tid,
4439 new_thr->th.th_info.ds.ds_gtid);
4440 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4441
4442 TCW_4(__kmp_nth, __kmp_nth + 1);
4443
4444 new_thr->th.th_task_state = 0;
4445 new_thr->th.th_task_state_top = 0;
4446 new_thr->th.th_task_state_stack_sz = 4;
4447
4448 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4449 // Make sure pool thread has transitioned to waiting on own thread struct
4450 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4451 // Thread activated in __kmp_allocate_team when increasing team size
4452 }
4453
4454#ifdef KMP_ADJUST_BLOCKTIME
4455 /* Adjust blocktime back to zero if necessary */
4456 /* Middle initialization might not have occurred yet */
4457 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4458 if (__kmp_nth > __kmp_avail_proc) {
4459 __kmp_zero_bt = TRUE;
4460 }
4461 }
4462#endif /* KMP_ADJUST_BLOCKTIME */
4463
4464#if KMP_DEBUG
4465 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4466 // KMP_BARRIER_PARENT_FLAG.
4467 int b;
4468 kmp_balign_t *balign = new_thr->th.th_bar;
4469 for (b = 0; b < bs_last_barrier; ++b)
4470 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4471#endif
4472
4473 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4474 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4475
4476 KMP_MB();
4477 return new_thr;
4478 }
4479
4480 /* no, well fork a new one */
4481 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4482 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4483
4484#if KMP_USE_MONITOR
4485 // If this is the first worker thread the RTL is creating, then also
4486 // launch the monitor thread. We try to do this as early as possible.
4487 if (!TCR_4(__kmp_init_monitor)) {
4488 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4489 if (!TCR_4(__kmp_init_monitor)) {
4490 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4491 TCW_4(__kmp_init_monitor, 1);
4492 __kmp_create_monitor(&__kmp_monitor);
4493 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4494#if KMP_OS_WINDOWS
4495 // AC: wait until monitor has started. This is a fix for CQ232808.
4496 // The reason is that if the library is loaded/unloaded in a loop with
4497 // small (parallel) work in between, then there is high probability that
4498 // monitor thread started after the library shutdown. At shutdown it is
4499 // too late to cope with the problem, because when the primary thread is
4500 // in DllMain (process detach) the monitor has no chances to start (it is
4501 // blocked), and primary thread has no means to inform the monitor that
4502 // the library has gone, because all the memory which the monitor can
4503 // access is going to be released/reset.
4504 while (TCR_4(__kmp_init_monitor) < 2) {
4505 KMP_YIELD(TRUE);
4506 }
4507 KF_TRACE(10, ("after monitor thread has started\n"));
4508#endif
4509 }
4510 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4511 }
4512#endif
4513
4514 KMP_MB();
4515
4516 {
4517 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4518 ? 1
4519 : __kmp_hidden_helper_threads_num + 1;
4520
4521 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4522 ++new_gtid) {
4523 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4524 }
4525
4526 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4527 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4528 }
4529 }
4530
4531 /* allocate space for it. */
4532 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4533
4534 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4535
4536#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4537 // suppress race conditions detection on synchronization flags in debug mode
4538 // this helps to analyze library internals eliminating false positives
4539 __itt_suppress_mark_range(
4540 __itt_suppress_range, __itt_suppress_threading_errors,
4541 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4542 __itt_suppress_mark_range(
4543 __itt_suppress_range, __itt_suppress_threading_errors,
4544 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4545#if KMP_OS_WINDOWS
4546 __itt_suppress_mark_range(
4547 __itt_suppress_range, __itt_suppress_threading_errors,
4548 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4549#else
4550 __itt_suppress_mark_range(__itt_suppress_range,
4551 __itt_suppress_threading_errors,
4552 &new_thr->th.th_suspend_init_count,
4553 sizeof(new_thr->th.th_suspend_init_count));
4554#endif
4555 // TODO: check if we need to also suppress b_arrived flags
4556 __itt_suppress_mark_range(__itt_suppress_range,
4557 __itt_suppress_threading_errors,
4558 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4559 sizeof(new_thr->th.th_bar[0].bb.b_go));
4560 __itt_suppress_mark_range(__itt_suppress_range,
4561 __itt_suppress_threading_errors,
4562 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4563 sizeof(new_thr->th.th_bar[1].bb.b_go));
4564 __itt_suppress_mark_range(__itt_suppress_range,
4565 __itt_suppress_threading_errors,
4566 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4567 sizeof(new_thr->th.th_bar[2].bb.b_go));
4568#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4569 if (__kmp_storage_map) {
4570 __kmp_print_thread_storage_map(new_thr, new_gtid);
4571 }
4572
4573 // add the reserve serialized team, initialized from the team's primary thread
4574 {
4575 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4576 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4577 new_thr->th.th_serial_team = serial_team =
4578 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4579#if OMPT_SUPPORT
4580 ompt_data_none, // root parallel id
4581#endif
4582 proc_bind_default, &r_icvs,
4583 0 USE_NESTED_HOT_ARG(NULL));
4584 }
4585 KMP_ASSERT(serial_team);
4586 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4587 // execution (it is unused for now).
4588 serial_team->t.t_threads[0] = new_thr;
4589 KF_TRACE(10,
4590 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4591 new_thr));
4592
4593 /* setup the thread structures */
4594 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4595
4596#if USE_FAST_MEMORY
4597 __kmp_initialize_fast_memory(new_thr);
4598#endif /* USE_FAST_MEMORY */
4599
4600#if KMP_USE_BGET
4601 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4602 __kmp_initialize_bget(new_thr);
4603#endif
4604
4605 __kmp_init_random(new_thr); // Initialize random number generator
4606
4607 /* Initialize these only once when thread is grabbed for a team allocation */
4608 KA_TRACE(20,
4609 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4610 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4611
4612 int b;
4613 kmp_balign_t *balign = new_thr->th.th_bar;
4614 for (b = 0; b < bs_last_barrier; ++b) {
4615 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4616 balign[b].bb.team = NULL;
4617 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4618 balign[b].bb.use_oncore_barrier = 0;
4619 }
4620
4621 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4622 new_thr->th.th_sleep_loc_type = flag_unset;
4623
4624 new_thr->th.th_spin_here = FALSE;
4625 new_thr->th.th_next_waiting = 0;
4626#if KMP_OS_UNIX
4627 new_thr->th.th_blocking = false;
4628#endif
4629
4630#if KMP_AFFINITY_SUPPORTED
4631 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4632 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4633 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4634 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4635#endif
4636 new_thr->th.th_def_allocator = __kmp_def_allocator;
4637 new_thr->th.th_prev_level = 0;
4638 new_thr->th.th_prev_num_threads = 1;
4639
4640 TCW_4(new_thr->th.th_in_pool, FALSE);
4641 new_thr->th.th_active_in_pool = FALSE;
4642 TCW_4(new_thr->th.th_active, TRUE);
4643
4644 /* adjust the global counters */
4645 __kmp_all_nth++;
4646 __kmp_nth++;
4647
4648 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4649 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4650 if (__kmp_adjust_gtid_mode) {
4651 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4652 if (TCR_4(__kmp_gtid_mode) != 2) {
4653 TCW_4(__kmp_gtid_mode, 2);
4654 }
4655 } else {
4656 if (TCR_4(__kmp_gtid_mode) != 1) {
4657 TCW_4(__kmp_gtid_mode, 1);
4658 }
4659 }
4660 }
4661
4662#ifdef KMP_ADJUST_BLOCKTIME
4663 /* Adjust blocktime back to zero if necessary */
4664 /* Middle initialization might not have occurred yet */
4665 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4666 if (__kmp_nth > __kmp_avail_proc) {
4667 __kmp_zero_bt = TRUE;
4668 }
4669 }
4670#endif /* KMP_ADJUST_BLOCKTIME */
4671
4672 /* actually fork it and create the new worker thread */
4673 KF_TRACE(
4674 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4675 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4676 KF_TRACE(10,
4677 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4678
4679 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4680 new_gtid));
4681 KMP_MB();
4682 return new_thr;
4683}
4684
4685/* Reinitialize team for reuse.
4686 The hot team code calls this case at every fork barrier, so EPCC barrier
4687 test are extremely sensitive to changes in it, esp. writes to the team
4688 struct, which cause a cache invalidation in all threads.
4689 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4690static void __kmp_reinitialize_team(kmp_team_t *team,
4691 kmp_internal_control_t *new_icvs,
4692 ident_t *loc) {
4693 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4694 team->t.t_threads[0], team));
4695 KMP_DEBUG_ASSERT(team && new_icvs);
4696 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4697 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4698
4699 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4700 // Copy ICVs to the primary thread's implicit taskdata
4701 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4702 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4703
4704 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4705 team->t.t_threads[0], team));
4706}
4707
4708/* Initialize the team data structure.
4709 This assumes the t_threads and t_max_nproc are already set.
4710 Also, we don't touch the arguments */
4711static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4712 kmp_internal_control_t *new_icvs,
4713 ident_t *loc) {
4714 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4715
4716 /* verify */
4717 KMP_DEBUG_ASSERT(team);
4718 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4719 KMP_DEBUG_ASSERT(team->t.t_threads);
4720 KMP_MB();
4721
4722 team->t.t_master_tid = 0; /* not needed */
4723 /* team->t.t_master_bar; not needed */
4724 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4725 team->t.t_nproc = new_nproc;
4726
4727 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4728 team->t.t_next_pool = NULL;
4729 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4730 * up hot team */
4731
4732 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4733 team->t.t_invoke = NULL; /* not needed */
4734
4735 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4736 team->t.t_sched.sched = new_icvs->sched.sched;
4737
4738#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4739 team->t.t_fp_control_saved = FALSE; /* not needed */
4740 team->t.t_x87_fpu_control_word = 0; /* not needed */
4741 team->t.t_mxcsr = 0; /* not needed */
4742#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4743
4744 team->t.t_construct = 0;
4745
4746 team->t.t_ordered.dt.t_value = 0;
4747 team->t.t_master_active = FALSE;
4748
4749#ifdef KMP_DEBUG
4750 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4751#endif
4752#if KMP_OS_WINDOWS
4753 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4754#endif
4755
4756 team->t.t_control_stack_top = NULL;
4757
4758 __kmp_reinitialize_team(team, new_icvs, loc);
4759
4760 KMP_MB();
4761 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4762}
4763
4764#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4765/* Sets full mask for thread and returns old mask, no changes to structures. */
4766static void
4767__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4768 if (KMP_AFFINITY_CAPABLE()) {
4769 int status;
4770 if (old_mask != NULL) {
4771 status = __kmp_get_system_affinity(old_mask, TRUE);
4772 int error = errno;
4773 if (status != 0) {
4774 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4775 __kmp_msg_null);
4776 }
4777 }
4778 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4779 }
4780}
4781#endif
4782
4783#if KMP_AFFINITY_SUPPORTED
4784
4785// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4786// It calculates the worker + primary thread's partition based upon the parent
4787// thread's partition, and binds each worker to a thread in their partition.
4788// The primary thread's partition should already include its current binding.
4789static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4790 // Do not partition places for the hidden helper team
4791 if (KMP_HIDDEN_HELPER_TEAM(team))
4792 return;
4793 // Copy the primary thread's place partition to the team struct
4794 kmp_info_t *master_th = team->t.t_threads[0];
4795 KMP_DEBUG_ASSERT(master_th != NULL);
4796 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4797 int first_place = master_th->th.th_first_place;
4798 int last_place = master_th->th.th_last_place;
4799 int masters_place = master_th->th.th_current_place;
4800 int num_masks = __kmp_affinity.num_masks;
4801 team->t.t_first_place = first_place;
4802 team->t.t_last_place = last_place;
4803
4804 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4805 "bound to place %d partition = [%d,%d]\n",
4806 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4807 team->t.t_id, masters_place, first_place, last_place));
4808
4809 switch (proc_bind) {
4810
4811 case proc_bind_default:
4812 // Serial teams might have the proc_bind policy set to proc_bind_default.
4813 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4814 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4815 break;
4816
4817 case proc_bind_primary: {
4818 int f;
4819 int n_th = team->t.t_nproc;
4820 for (f = 1; f < n_th; f++) {
4821 kmp_info_t *th = team->t.t_threads[f];
4822 KMP_DEBUG_ASSERT(th != NULL);
4823 th->th.th_first_place = first_place;
4824 th->th.th_last_place = last_place;
4825 th->th.th_new_place = masters_place;
4826 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4827 team->t.t_display_affinity != 1) {
4828 team->t.t_display_affinity = 1;
4829 }
4830
4831 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4832 "partition = [%d,%d]\n",
4833 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4834 f, masters_place, first_place, last_place));
4835 }
4836 } break;
4837
4838 case proc_bind_close: {
4839 int f;
4840 int n_th = team->t.t_nproc;
4841 int n_places;
4842 if (first_place <= last_place) {
4843 n_places = last_place - first_place + 1;
4844 } else {
4845 n_places = num_masks - first_place + last_place + 1;
4846 }
4847 if (n_th <= n_places) {
4848 int place = masters_place;
4849 for (f = 1; f < n_th; f++) {
4850 kmp_info_t *th = team->t.t_threads[f];
4851 KMP_DEBUG_ASSERT(th != NULL);
4852
4853 if (place == last_place) {
4854 place = first_place;
4855 } else if (place == (num_masks - 1)) {
4856 place = 0;
4857 } else {
4858 place++;
4859 }
4860 th->th.th_first_place = first_place;
4861 th->th.th_last_place = last_place;
4862 th->th.th_new_place = place;
4863 if (__kmp_display_affinity && place != th->th.th_current_place &&
4864 team->t.t_display_affinity != 1) {
4865 team->t.t_display_affinity = 1;
4866 }
4867
4868 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4869 "partition = [%d,%d]\n",
4870 __kmp_gtid_from_thread(team->t.t_threads[f]),
4871 team->t.t_id, f, place, first_place, last_place));
4872 }
4873 } else {
4874 int S, rem, gap, s_count;
4875 S = n_th / n_places;
4876 s_count = 0;
4877 rem = n_th - (S * n_places);
4878 gap = rem > 0 ? n_places / rem : n_places;
4879 int place = masters_place;
4880 int gap_ct = gap;
4881 for (f = 0; f < n_th; f++) {
4882 kmp_info_t *th = team->t.t_threads[f];
4883 KMP_DEBUG_ASSERT(th != NULL);
4884
4885 th->th.th_first_place = first_place;
4886 th->th.th_last_place = last_place;
4887 th->th.th_new_place = place;
4888 if (__kmp_display_affinity && place != th->th.th_current_place &&
4889 team->t.t_display_affinity != 1) {
4890 team->t.t_display_affinity = 1;
4891 }
4892 s_count++;
4893
4894 if ((s_count == S) && rem && (gap_ct == gap)) {
4895 // do nothing, add an extra thread to place on next iteration
4896 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4897 // we added an extra thread to this place; move to next place
4898 if (place == last_place) {
4899 place = first_place;
4900 } else if (place == (num_masks - 1)) {
4901 place = 0;
4902 } else {
4903 place++;
4904 }
4905 s_count = 0;
4906 gap_ct = 1;
4907 rem--;
4908 } else if (s_count == S) { // place full; don't add extra
4909 if (place == last_place) {
4910 place = first_place;
4911 } else if (place == (num_masks - 1)) {
4912 place = 0;
4913 } else {
4914 place++;
4915 }
4916 gap_ct++;
4917 s_count = 0;
4918 }
4919
4920 KA_TRACE(100,
4921 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4922 "partition = [%d,%d]\n",
4923 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4924 th->th.th_new_place, first_place, last_place));
4925 }
4926 KMP_DEBUG_ASSERT(place == masters_place);
4927 }
4928 } break;
4929
4930 case proc_bind_spread: {
4931 int f;
4932 int n_th = team->t.t_nproc;
4933 int n_places;
4934 int thidx;
4935 if (first_place <= last_place) {
4936 n_places = last_place - first_place + 1;
4937 } else {
4938 n_places = num_masks - first_place + last_place + 1;
4939 }
4940 if (n_th <= n_places) {
4941 int place = -1;
4942
4943 if (n_places != num_masks) {
4944 int S = n_places / n_th;
4945 int s_count, rem, gap, gap_ct;
4946
4947 place = masters_place;
4948 rem = n_places - n_th * S;
4949 gap = rem ? n_th / rem : 1;
4950 gap_ct = gap;
4951 thidx = n_th;
4952 if (update_master_only == 1)
4953 thidx = 1;
4954 for (f = 0; f < thidx; f++) {
4955 kmp_info_t *th = team->t.t_threads[f];
4956 KMP_DEBUG_ASSERT(th != NULL);
4957
4958 th->th.th_first_place = place;
4959 th->th.th_new_place = place;
4960 if (__kmp_display_affinity && place != th->th.th_current_place &&
4961 team->t.t_display_affinity != 1) {
4962 team->t.t_display_affinity = 1;
4963 }
4964 s_count = 1;
4965 while (s_count < S) {
4966 if (place == last_place) {
4967 place = first_place;
4968 } else if (place == (num_masks - 1)) {
4969 place = 0;
4970 } else {
4971 place++;
4972 }
4973 s_count++;
4974 }
4975 if (rem && (gap_ct == gap)) {
4976 if (place == last_place) {
4977 place = first_place;
4978 } else if (place == (num_masks - 1)) {
4979 place = 0;
4980 } else {
4981 place++;
4982 }
4983 rem--;
4984 gap_ct = 0;
4985 }
4986 th->th.th_last_place = place;
4987 gap_ct++;
4988
4989 if (place == last_place) {
4990 place = first_place;
4991 } else if (place == (num_masks - 1)) {
4992 place = 0;
4993 } else {
4994 place++;
4995 }
4996
4997 KA_TRACE(100,
4998 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4999 "partition = [%d,%d], num_masks: %u\n",
5000 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5001 f, th->th.th_new_place, th->th.th_first_place,
5002 th->th.th_last_place, num_masks));
5003 }
5004 } else {
5005 /* Having uniform space of available computation places I can create
5006 T partitions of round(P/T) size and put threads into the first
5007 place of each partition. */
5008 double current = static_cast<double>(masters_place);
5009 double spacing =
5010 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5011 int first, last;
5012 kmp_info_t *th;
5013
5014 thidx = n_th + 1;
5015 if (update_master_only == 1)
5016 thidx = 1;
5017 for (f = 0; f < thidx; f++) {
5018 first = static_cast<int>(current);
5019 last = static_cast<int>(current + spacing) - 1;
5020 KMP_DEBUG_ASSERT(last >= first);
5021 if (first >= n_places) {
5022 if (masters_place) {
5023 first -= n_places;
5024 last -= n_places;
5025 if (first == (masters_place + 1)) {
5026 KMP_DEBUG_ASSERT(f == n_th);
5027 first--;
5028 }
5029 if (last == masters_place) {
5030 KMP_DEBUG_ASSERT(f == (n_th - 1));
5031 last--;
5032 }
5033 } else {
5034 KMP_DEBUG_ASSERT(f == n_th);
5035 first = 0;
5036 last = 0;
5037 }
5038 }
5039 if (last >= n_places) {
5040 last = (n_places - 1);
5041 }
5042 place = first;
5043 current += spacing;
5044 if (f < n_th) {
5045 KMP_DEBUG_ASSERT(0 <= first);
5046 KMP_DEBUG_ASSERT(n_places > first);
5047 KMP_DEBUG_ASSERT(0 <= last);
5048 KMP_DEBUG_ASSERT(n_places > last);
5049 KMP_DEBUG_ASSERT(last_place >= first_place);
5050 th = team->t.t_threads[f];
5051 KMP_DEBUG_ASSERT(th);
5052 th->th.th_first_place = first;
5053 th->th.th_new_place = place;
5054 th->th.th_last_place = last;
5055 if (__kmp_display_affinity && place != th->th.th_current_place &&
5056 team->t.t_display_affinity != 1) {
5057 team->t.t_display_affinity = 1;
5058 }
5059 KA_TRACE(100,
5060 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5061 "partition = [%d,%d], spacing = %.4f\n",
5062 __kmp_gtid_from_thread(team->t.t_threads[f]),
5063 team->t.t_id, f, th->th.th_new_place,
5064 th->th.th_first_place, th->th.th_last_place, spacing));
5065 }
5066 }
5067 }
5068 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5069 } else {
5070 int S, rem, gap, s_count;
5071 S = n_th / n_places;
5072 s_count = 0;
5073 rem = n_th - (S * n_places);
5074 gap = rem > 0 ? n_places / rem : n_places;
5075 int place = masters_place;
5076 int gap_ct = gap;
5077 thidx = n_th;
5078 if (update_master_only == 1)
5079 thidx = 1;
5080 for (f = 0; f < thidx; f++) {
5081 kmp_info_t *th = team->t.t_threads[f];
5082 KMP_DEBUG_ASSERT(th != NULL);
5083
5084 th->th.th_first_place = place;
5085 th->th.th_last_place = place;
5086 th->th.th_new_place = place;
5087 if (__kmp_display_affinity && place != th->th.th_current_place &&
5088 team->t.t_display_affinity != 1) {
5089 team->t.t_display_affinity = 1;
5090 }
5091 s_count++;
5092
5093 if ((s_count == S) && rem && (gap_ct == gap)) {
5094 // do nothing, add an extra thread to place on next iteration
5095 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5096 // we added an extra thread to this place; move on to next place
5097 if (place == last_place) {
5098 place = first_place;
5099 } else if (place == (num_masks - 1)) {
5100 place = 0;
5101 } else {
5102 place++;
5103 }
5104 s_count = 0;
5105 gap_ct = 1;
5106 rem--;
5107 } else if (s_count == S) { // place is full; don't add extra thread
5108 if (place == last_place) {
5109 place = first_place;
5110 } else if (place == (num_masks - 1)) {
5111 place = 0;
5112 } else {
5113 place++;
5114 }
5115 gap_ct++;
5116 s_count = 0;
5117 }
5118
5119 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5120 "partition = [%d,%d]\n",
5121 __kmp_gtid_from_thread(team->t.t_threads[f]),
5122 team->t.t_id, f, th->th.th_new_place,
5123 th->th.th_first_place, th->th.th_last_place));
5124 }
5125 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5126 }
5127 } break;
5128
5129 default:
5130 break;
5131 }
5132
5133 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5134}
5135
5136#endif // KMP_AFFINITY_SUPPORTED
5137
5138/* allocate a new team data structure to use. take one off of the free pool if
5139 available */
5140kmp_team_t *
5141__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5142#if OMPT_SUPPORT
5143 ompt_data_t ompt_parallel_data,
5144#endif
5145 kmp_proc_bind_t new_proc_bind,
5146 kmp_internal_control_t *new_icvs,
5147 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5148 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5149 int f;
5150 kmp_team_t *team;
5151 int use_hot_team = !root->r.r_active;
5152 int level = 0;
5153 int do_place_partition = 1;
5154
5155 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5156 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5157 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5158 KMP_MB();
5159
5160#if KMP_NESTED_HOT_TEAMS
5161 kmp_hot_team_ptr_t *hot_teams;
5162 if (master) {
5163 team = master->th.th_team;
5164 level = team->t.t_active_level;
5165 if (master->th.th_teams_microtask) { // in teams construct?
5166 if (master->th.th_teams_size.nteams > 1 &&
5167 ( // #teams > 1
5168 team->t.t_pkfn ==
5169 (microtask_t)__kmp_teams_master || // inner fork of the teams
5170 master->th.th_teams_level <
5171 team->t.t_level)) { // or nested parallel inside the teams
5172 ++level; // not increment if #teams==1, or for outer fork of the teams;
5173 // increment otherwise
5174 }
5175 // Do not perform the place partition if inner fork of the teams
5176 // Wait until nested parallel region encountered inside teams construct
5177 if ((master->th.th_teams_size.nteams == 1 &&
5178 master->th.th_teams_level >= team->t.t_level) ||
5179 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5180 do_place_partition = 0;
5181 }
5182 hot_teams = master->th.th_hot_teams;
5183 if (level < __kmp_hot_teams_max_level && hot_teams &&
5184 hot_teams[level].hot_team) {
5185 // hot team has already been allocated for given level
5186 use_hot_team = 1;
5187 } else {
5188 use_hot_team = 0;
5189 }
5190 } else {
5191 // check we won't access uninitialized hot_teams, just in case
5192 KMP_DEBUG_ASSERT(new_nproc == 1);
5193 }
5194#endif
5195 // Optimization to use a "hot" team
5196 if (use_hot_team && new_nproc > 1) {
5197 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5198#if KMP_NESTED_HOT_TEAMS
5199 team = hot_teams[level].hot_team;
5200#else
5201 team = root->r.r_hot_team;
5202#endif
5203#if KMP_DEBUG
5204 if (__kmp_tasking_mode != tskm_immediate_exec) {
5205 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5206 "task_team[1] = %p before reinit\n",
5207 team->t.t_task_team[0], team->t.t_task_team[1]));
5208 }
5209#endif
5210
5211 if (team->t.t_nproc != new_nproc &&
5212 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5213 // Distributed barrier may need a resize
5214 int old_nthr = team->t.t_nproc;
5215 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5216 }
5217
5218 // If not doing the place partition, then reset the team's proc bind
5219 // to indicate that partitioning of all threads still needs to take place
5220 if (do_place_partition == 0)
5221 team->t.t_proc_bind = proc_bind_default;
5222 // Has the number of threads changed?
5223 /* Let's assume the most common case is that the number of threads is
5224 unchanged, and put that case first. */
5225 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5226 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5227 // This case can mean that omp_set_num_threads() was called and the hot
5228 // team size was already reduced, so we check the special flag
5229 if (team->t.t_size_changed == -1) {
5230 team->t.t_size_changed = 1;
5231 } else {
5232 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5233 }
5234
5235 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5236 kmp_r_sched_t new_sched = new_icvs->sched;
5237 // set primary thread's schedule as new run-time schedule
5238 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5239
5240 __kmp_reinitialize_team(team, new_icvs,
5241 root->r.r_uber_thread->th.th_ident);
5242
5243 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5244 team->t.t_threads[0], team));
5245 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5246
5247#if KMP_AFFINITY_SUPPORTED
5248 if ((team->t.t_size_changed == 0) &&
5249 (team->t.t_proc_bind == new_proc_bind)) {
5250 if (new_proc_bind == proc_bind_spread) {
5251 if (do_place_partition) {
5252 // add flag to update only master for spread
5253 __kmp_partition_places(team, 1);
5254 }
5255 }
5256 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5257 "proc_bind = %d, partition = [%d,%d]\n",
5258 team->t.t_id, new_proc_bind, team->t.t_first_place,
5259 team->t.t_last_place));
5260 } else {
5261 if (do_place_partition) {
5262 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5263 __kmp_partition_places(team);
5264 }
5265 }
5266#else
5267 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5268#endif /* KMP_AFFINITY_SUPPORTED */
5269 } else if (team->t.t_nproc > new_nproc) {
5270 KA_TRACE(20,
5271 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5272 new_nproc));
5273
5274 team->t.t_size_changed = 1;
5275 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5276 // Barrier size already reduced earlier in this function
5277 // Activate team threads via th_used_in_team
5278 __kmp_add_threads_to_team(team, new_nproc);
5279 }
5280#if KMP_NESTED_HOT_TEAMS
5281 if (__kmp_hot_teams_mode == 0) {
5282 // AC: saved number of threads should correspond to team's value in this
5283 // mode, can be bigger in mode 1, when hot team has threads in reserve
5284 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5285 hot_teams[level].hot_team_nth = new_nproc;
5286#endif // KMP_NESTED_HOT_TEAMS
5287 /* release the extra threads we don't need any more */
5288 for (f = new_nproc; f < team->t.t_nproc; f++) {
5289 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5290 if (__kmp_tasking_mode != tskm_immediate_exec) {
5291 // When decreasing team size, threads no longer in the team should
5292 // unref task team.
5293 team->t.t_threads[f]->th.th_task_team = NULL;
5294 }
5295 __kmp_free_thread(team->t.t_threads[f]);
5296 team->t.t_threads[f] = NULL;
5297 }
5298#if KMP_NESTED_HOT_TEAMS
5299 } // (__kmp_hot_teams_mode == 0)
5300 else {
5301 // When keeping extra threads in team, switch threads to wait on own
5302 // b_go flag
5303 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5304 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5305 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5306 for (int b = 0; b < bs_last_barrier; ++b) {
5307 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5308 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5309 }
5310 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5311 }
5312 }
5313 }
5314#endif // KMP_NESTED_HOT_TEAMS
5315 team->t.t_nproc = new_nproc;
5316 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5317 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5318 __kmp_reinitialize_team(team, new_icvs,
5319 root->r.r_uber_thread->th.th_ident);
5320
5321 // Update remaining threads
5322 for (f = 0; f < new_nproc; ++f) {
5323 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5324 }
5325
5326 // restore the current task state of the primary thread: should be the
5327 // implicit task
5328 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5329 team->t.t_threads[0], team));
5330
5331 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5332
5333#ifdef KMP_DEBUG
5334 for (f = 0; f < team->t.t_nproc; f++) {
5335 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5336 team->t.t_threads[f]->th.th_team_nproc ==
5337 team->t.t_nproc);
5338 }
5339#endif
5340
5341 if (do_place_partition) {
5342 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5343#if KMP_AFFINITY_SUPPORTED
5344 __kmp_partition_places(team);
5345#endif
5346 }
5347 } else { // team->t.t_nproc < new_nproc
5348#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5349 kmp_affin_mask_t *old_mask;
5350 if (KMP_AFFINITY_CAPABLE()) {
5351 KMP_CPU_ALLOC(old_mask);
5352 }
5353#endif
5354
5355 KA_TRACE(20,
5356 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5357 new_nproc));
5358 int old_nproc = team->t.t_nproc; // save old value and use to update only
5359 team->t.t_size_changed = 1;
5360
5361#if KMP_NESTED_HOT_TEAMS
5362 int avail_threads = hot_teams[level].hot_team_nth;
5363 if (new_nproc < avail_threads)
5364 avail_threads = new_nproc;
5365 kmp_info_t **other_threads = team->t.t_threads;
5366 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5367 // Adjust barrier data of reserved threads (if any) of the team
5368 // Other data will be set in __kmp_initialize_info() below.
5369 int b;
5370 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5371 for (b = 0; b < bs_last_barrier; ++b) {
5372 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5373 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5374#if USE_DEBUGGER
5375 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5376#endif
5377 }
5378 }
5379 if (hot_teams[level].hot_team_nth >= new_nproc) {
5380 // we have all needed threads in reserve, no need to allocate any
5381 // this only possible in mode 1, cannot have reserved threads in mode 0
5382 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5383 team->t.t_nproc = new_nproc; // just get reserved threads involved
5384 } else {
5385 // We may have some threads in reserve, but not enough;
5386 // get reserved threads involved if any.
5387 team->t.t_nproc = hot_teams[level].hot_team_nth;
5388 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5389#endif // KMP_NESTED_HOT_TEAMS
5390 if (team->t.t_max_nproc < new_nproc) {
5391 /* reallocate larger arrays */
5392 __kmp_reallocate_team_arrays(team, new_nproc);
5393 __kmp_reinitialize_team(team, new_icvs, NULL);
5394 }
5395
5396#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5397 /* Temporarily set full mask for primary thread before creation of
5398 workers. The reason is that workers inherit the affinity from the
5399 primary thread, so if a lot of workers are created on the single
5400 core quickly, they don't get a chance to set their own affinity for
5401 a long time. */
5402 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5403#endif
5404
5405 /* allocate new threads for the hot team */
5406 for (f = team->t.t_nproc; f < new_nproc; f++) {
5407 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5408 KMP_DEBUG_ASSERT(new_worker);
5409 team->t.t_threads[f] = new_worker;
5410
5411 KA_TRACE(20,
5412 ("__kmp_allocate_team: team %d init T#%d arrived: "
5413 "join=%llu, plain=%llu\n",
5414 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5415 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5416 team->t.t_bar[bs_plain_barrier].b_arrived));
5417
5418 { // Initialize barrier data for new threads.
5419 int b;
5420 kmp_balign_t *balign = new_worker->th.th_bar;
5421 for (b = 0; b < bs_last_barrier; ++b) {
5422 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5423 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5424 KMP_BARRIER_PARENT_FLAG);
5425#if USE_DEBUGGER
5426 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5427#endif
5428 }
5429 }
5430 }
5431
5432#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5433 if (KMP_AFFINITY_CAPABLE()) {
5434 /* Restore initial primary thread's affinity mask */
5435 __kmp_set_system_affinity(old_mask, TRUE);
5436 KMP_CPU_FREE(old_mask);
5437 }
5438#endif
5439#if KMP_NESTED_HOT_TEAMS
5440 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5441#endif // KMP_NESTED_HOT_TEAMS
5442 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5443 // Barrier size already increased earlier in this function
5444 // Activate team threads via th_used_in_team
5445 __kmp_add_threads_to_team(team, new_nproc);
5446 }
5447 /* make sure everyone is synchronized */
5448 // new threads below
5449 __kmp_initialize_team(team, new_nproc, new_icvs,
5450 root->r.r_uber_thread->th.th_ident);
5451
5452 /* reinitialize the threads */
5453 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5454 for (f = 0; f < team->t.t_nproc; ++f)
5455 __kmp_initialize_info(team->t.t_threads[f], team, f,
5456 __kmp_gtid_from_tid(f, team));
5457
5458 // set th_task_state for new threads in hot team with older thread's state
5459 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5460 for (f = old_nproc; f < team->t.t_nproc; ++f)
5461 team->t.t_threads[f]->th.th_task_state = old_state;
5462
5463#ifdef KMP_DEBUG
5464 for (f = 0; f < team->t.t_nproc; ++f) {
5465 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5466 team->t.t_threads[f]->th.th_team_nproc ==
5467 team->t.t_nproc);
5468 }
5469#endif
5470
5471 if (do_place_partition) {
5472 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5473#if KMP_AFFINITY_SUPPORTED
5474 __kmp_partition_places(team);
5475#endif
5476 }
5477 } // Check changes in number of threads
5478
5479 kmp_info_t *master = team->t.t_threads[0];
5480 if (master->th.th_teams_microtask) {
5481 for (f = 1; f < new_nproc; ++f) {
5482 // propagate teams construct specific info to workers
5483 kmp_info_t *thr = team->t.t_threads[f];
5484 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5485 thr->th.th_teams_level = master->th.th_teams_level;
5486 thr->th.th_teams_size = master->th.th_teams_size;
5487 }
5488 }
5489#if KMP_NESTED_HOT_TEAMS
5490 if (level) {
5491 // Sync barrier state for nested hot teams, not needed for outermost hot
5492 // team.
5493 for (f = 1; f < new_nproc; ++f) {
5494 kmp_info_t *thr = team->t.t_threads[f];
5495 int b;
5496 kmp_balign_t *balign = thr->th.th_bar;
5497 for (b = 0; b < bs_last_barrier; ++b) {
5498 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5499 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5500#if USE_DEBUGGER
5501 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5502#endif
5503 }
5504 }
5505 }
5506#endif // KMP_NESTED_HOT_TEAMS
5507
5508 /* reallocate space for arguments if necessary */
5509 __kmp_alloc_argv_entries(argc, team, TRUE);
5510 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5511 // The hot team re-uses the previous task team,
5512 // if untouched during the previous release->gather phase.
5513
5514 KF_TRACE(10, (" hot_team = %p\n", team));
5515
5516#if KMP_DEBUG
5517 if (__kmp_tasking_mode != tskm_immediate_exec) {
5518 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5519 "task_team[1] = %p after reinit\n",
5520 team->t.t_task_team[0], team->t.t_task_team[1]));
5521 }
5522#endif
5523
5524#if OMPT_SUPPORT
5525 __ompt_team_assign_id(team, ompt_parallel_data);
5526#endif
5527
5528 KMP_MB();
5529
5530 return team;
5531 }
5532
5533 /* next, let's try to take one from the team pool */
5534 KMP_MB();
5535 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5536 /* TODO: consider resizing undersized teams instead of reaping them, now
5537 that we have a resizing mechanism */
5538 if (team->t.t_max_nproc >= max_nproc) {
5539 /* take this team from the team pool */
5540 __kmp_team_pool = team->t.t_next_pool;
5541
5542 if (max_nproc > 1 &&
5543 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5544 if (!team->t.b) { // Allocate barrier structure
5545 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5546 }
5547 }
5548
5549 /* setup the team for fresh use */
5550 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5551
5552 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5553 "task_team[1] %p to NULL\n",
5554 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5555 team->t.t_task_team[0] = NULL;
5556 team->t.t_task_team[1] = NULL;
5557
5558 /* reallocate space for arguments if necessary */
5559 __kmp_alloc_argv_entries(argc, team, TRUE);
5560 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5561
5562 KA_TRACE(
5563 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5564 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5565 { // Initialize barrier data.
5566 int b;
5567 for (b = 0; b < bs_last_barrier; ++b) {
5568 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5569#if USE_DEBUGGER
5570 team->t.t_bar[b].b_master_arrived = 0;
5571 team->t.t_bar[b].b_team_arrived = 0;
5572#endif
5573 }
5574 }
5575
5576 team->t.t_proc_bind = new_proc_bind;
5577
5578 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5579 team->t.t_id));
5580
5581#if OMPT_SUPPORT
5582 __ompt_team_assign_id(team, ompt_parallel_data);
5583#endif
5584
5585 KMP_MB();
5586
5587 return team;
5588 }
5589
5590 /* reap team if it is too small, then loop back and check the next one */
5591 // not sure if this is wise, but, will be redone during the hot-teams
5592 // rewrite.
5593 /* TODO: Use technique to find the right size hot-team, don't reap them */
5594 team = __kmp_reap_team(team);
5595 __kmp_team_pool = team;
5596 }
5597
5598 /* nothing available in the pool, no matter, make a new team! */
5599 KMP_MB();
5600 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5601
5602 /* and set it up */
5603 team->t.t_max_nproc = max_nproc;
5604 if (max_nproc > 1 &&
5605 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5606 // Allocate barrier structure
5607 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5608 }
5609
5610 /* NOTE well, for some reason allocating one big buffer and dividing it up
5611 seems to really hurt performance a lot on the P4, so, let's not use this */
5612 __kmp_allocate_team_arrays(team, max_nproc);
5613
5614 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5615 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5616
5617 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5618 "%p to NULL\n",
5619 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5620 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5621 // memory, no need to duplicate
5622 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5623 // memory, no need to duplicate
5624
5625 if (__kmp_storage_map) {
5626 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5627 }
5628
5629 /* allocate space for arguments */
5630 __kmp_alloc_argv_entries(argc, team, FALSE);
5631 team->t.t_argc = argc;
5632
5633 KA_TRACE(20,
5634 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5635 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5636 { // Initialize barrier data.
5637 int b;
5638 for (b = 0; b < bs_last_barrier; ++b) {
5639 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5640#if USE_DEBUGGER
5641 team->t.t_bar[b].b_master_arrived = 0;
5642 team->t.t_bar[b].b_team_arrived = 0;
5643#endif
5644 }
5645 }
5646
5647 team->t.t_proc_bind = new_proc_bind;
5648
5649#if OMPT_SUPPORT
5650 __ompt_team_assign_id(team, ompt_parallel_data);
5651 team->t.ompt_serialized_team_info = NULL;
5652#endif
5653
5654 KMP_MB();
5655
5656 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5657 team->t.t_id));
5658
5659 return team;
5660}
5661
5662/* TODO implement hot-teams at all levels */
5663/* TODO implement lazy thread release on demand (disband request) */
5664
5665/* free the team. return it to the team pool. release all the threads
5666 * associated with it */
5667void __kmp_free_team(kmp_root_t *root,
5668 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5669 int f;
5670 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5671 team->t.t_id));
5672
5673 /* verify state */
5674 KMP_DEBUG_ASSERT(root);
5675 KMP_DEBUG_ASSERT(team);
5676 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5677 KMP_DEBUG_ASSERT(team->t.t_threads);
5678
5679 int use_hot_team = team == root->r.r_hot_team;
5680#if KMP_NESTED_HOT_TEAMS
5681 int level;
5682 if (master) {
5683 level = team->t.t_active_level - 1;
5684 if (master->th.th_teams_microtask) { // in teams construct?
5685 if (master->th.th_teams_size.nteams > 1) {
5686 ++level; // level was not increased in teams construct for
5687 // team_of_masters
5688 }
5689 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5690 master->th.th_teams_level == team->t.t_level) {
5691 ++level; // level was not increased in teams construct for
5692 // team_of_workers before the parallel
5693 } // team->t.t_level will be increased inside parallel
5694 }
5695#if KMP_DEBUG
5696 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5697#endif
5698 if (level < __kmp_hot_teams_max_level) {
5699 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5700 use_hot_team = 1;
5701 }
5702 }
5703#endif // KMP_NESTED_HOT_TEAMS
5704
5705 /* team is done working */
5706 TCW_SYNC_PTR(team->t.t_pkfn,
5707 NULL); // Important for Debugging Support Library.
5708#if KMP_OS_WINDOWS
5709 team->t.t_copyin_counter = 0; // init counter for possible reuse
5710#endif
5711 // Do not reset pointer to parent team to NULL for hot teams.
5712
5713 /* if we are non-hot team, release our threads */
5714 if (!use_hot_team) {
5715 if (__kmp_tasking_mode != tskm_immediate_exec) {
5716 // Wait for threads to reach reapable state
5717 for (f = 1; f < team->t.t_nproc; ++f) {
5718 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5719 kmp_info_t *th = team->t.t_threads[f];
5720 volatile kmp_uint32 *state = &th->th.th_reap_state;
5721 while (*state != KMP_SAFE_TO_REAP) {
5722#if KMP_OS_WINDOWS
5723 // On Windows a thread can be killed at any time, check this
5724 DWORD ecode;
5725 if (!__kmp_is_thread_alive(th, &ecode)) {
5726 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5727 break;
5728 }
5729#endif
5730 // first check if thread is sleeping
5731 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5732 if (fl.is_sleeping())
5733 fl.resume(__kmp_gtid_from_thread(th));
5734 KMP_CPU_PAUSE();
5735 }
5736 }
5737
5738 // Delete task teams
5739 int tt_idx;
5740 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5741 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5742 if (task_team != NULL) {
5743 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5744 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745 team->t.t_threads[f]->th.th_task_team = NULL;
5746 }
5747 KA_TRACE(
5748 20,
5749 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5750 __kmp_get_gtid(), task_team, team->t.t_id));
5751#if KMP_NESTED_HOT_TEAMS
5752 __kmp_free_task_team(master, task_team);
5753#endif
5754 team->t.t_task_team[tt_idx] = NULL;
5755 }
5756 }
5757 }
5758
5759 // Reset pointer to parent team only for non-hot teams.
5760 team->t.t_parent = NULL;
5761 team->t.t_level = 0;
5762 team->t.t_active_level = 0;
5763
5764 /* free the worker threads */
5765 for (f = 1; f < team->t.t_nproc; ++f) {
5766 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769 1, 2);
5770 }
5771 __kmp_free_thread(team->t.t_threads[f]);
5772 }
5773
5774 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775 if (team->t.b) {
5776 // wake up thread at old location
5777 team->t.b->go_release();
5778 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779 for (f = 1; f < team->t.t_nproc; ++f) {
5780 if (team->t.b->sleep[f].sleep) {
5781 __kmp_atomic_resume_64(
5782 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783 (kmp_atomic_flag_64<> *)NULL);
5784 }
5785 }
5786 }
5787 // Wait for threads to be removed from team
5788 for (int f = 1; f < team->t.t_nproc; ++f) {
5789 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790 KMP_CPU_PAUSE();
5791 }
5792 }
5793 }
5794
5795 for (f = 1; f < team->t.t_nproc; ++f) {
5796 team->t.t_threads[f] = NULL;
5797 }
5798
5799 if (team->t.t_max_nproc > 1 &&
5800 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801 distributedBarrier::deallocate(team->t.b);
5802 team->t.b = NULL;
5803 }
5804 /* put the team back in the team pool */
5805 /* TODO limit size of team pool, call reap_team if pool too large */
5806 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807 __kmp_team_pool = (volatile kmp_team_t *)team;
5808 } else { // Check if team was created for primary threads in teams construct
5809 // See if first worker is a CG root
5810 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811 team->t.t_threads[1]->th.th_cg_roots);
5812 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813 // Clean up the CG root nodes on workers so that this team can be re-used
5814 for (f = 1; f < team->t.t_nproc; ++f) {
5815 kmp_info_t *thr = team->t.t_threads[f];
5816 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817 thr->th.th_cg_roots->cg_root == thr);
5818 // Pop current CG root off list
5819 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820 thr->th.th_cg_roots = tmp->up;
5821 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822 " up to node %p. cg_nthreads was %d\n",
5823 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824 int i = tmp->cg_nthreads--;
5825 if (i == 1) {
5826 __kmp_free(tmp); // free CG if we are the last thread in it
5827 }
5828 // Restore current task's thread_limit from CG root
5829 if (thr->th.th_cg_roots)
5830 thr->th.th_current_task->td_icvs.thread_limit =
5831 thr->th.th_cg_roots->cg_thread_limit;
5832 }
5833 }
5834 }
5835
5836 KMP_MB();
5837}
5838
5839/* reap the team. destroy it, reclaim all its resources and free its memory */
5840kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841 kmp_team_t *next_pool = team->t.t_next_pool;
5842
5843 KMP_DEBUG_ASSERT(team);
5844 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846 KMP_DEBUG_ASSERT(team->t.t_threads);
5847 KMP_DEBUG_ASSERT(team->t.t_argv);
5848
5849 /* TODO clean the threads that are a part of this? */
5850
5851 /* free stuff */
5852 __kmp_free_team_arrays(team);
5853 if (team->t.t_argv != &team->t.t_inline_argv[0])
5854 __kmp_free((void *)team->t.t_argv);
5855 __kmp_free(team);
5856
5857 KMP_MB();
5858 return next_pool;
5859}
5860
5861// Free the thread. Don't reap it, just place it on the pool of available
5862// threads.
5863//
5864// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865// binding for the affinity mechanism to be useful.
5866//
5867// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868// However, we want to avoid a potential performance problem by always
5869// scanning through the list to find the correct point at which to insert
5870// the thread (potential N**2 behavior). To do this we keep track of the
5871// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872// With single-level parallelism, threads will always be added to the tail
5873// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874// parallelism, all bets are off and we may need to scan through the entire
5875// free list.
5876//
5877// This change also has a potentially large performance benefit, for some
5878// applications. Previously, as threads were freed from the hot team, they
5879// would be placed back on the free list in inverse order. If the hot team
5880// grew back to it's original size, then the freed thread would be placed
5881// back on the hot team in reverse order. This could cause bad cache
5882// locality problems on programs where the size of the hot team regularly
5883// grew and shrunk.
5884//
5885// Now, for single-level parallelism, the OMP tid is always == gtid.
5886void __kmp_free_thread(kmp_info_t *this_th) {
5887 int gtid;
5888 kmp_info_t **scan;
5889
5890 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892
5893 KMP_DEBUG_ASSERT(this_th);
5894
5895 // When moving thread to pool, switch thread to wait on own b_go flag, and
5896 // uninitialized (NULL team).
5897 int b;
5898 kmp_balign_t *balign = this_th->th.th_bar;
5899 for (b = 0; b < bs_last_barrier; ++b) {
5900 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902 balign[b].bb.team = NULL;
5903 balign[b].bb.leaf_kids = 0;
5904 }
5905 this_th->th.th_task_state = 0;
5906 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907
5908 /* put thread back on the free pool */
5909 TCW_PTR(this_th->th.th_team, NULL);
5910 TCW_PTR(this_th->th.th_root, NULL);
5911 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912
5913 while (this_th->th.th_cg_roots) {
5914 this_th->th.th_cg_roots->cg_nthreads--;
5915 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916 " %p of thread %p to %d\n",
5917 this_th, this_th->th.th_cg_roots,
5918 this_th->th.th_cg_roots->cg_root,
5919 this_th->th.th_cg_roots->cg_nthreads));
5920 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921 if (tmp->cg_root == this_th) { // Thread is a cg_root
5922 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923 KA_TRACE(
5924 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925 this_th->th.th_cg_roots = tmp->up;
5926 __kmp_free(tmp);
5927 } else { // Worker thread
5928 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929 __kmp_free(tmp);
5930 }
5931 this_th->th.th_cg_roots = NULL;
5932 break;
5933 }
5934 }
5935
5936 /* If the implicit task assigned to this thread can be used by other threads
5937 * -> multiple threads can share the data and try to free the task at
5938 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5939 * with higher probability when hot team is disabled but can occurs even when
5940 * the hot team is enabled */
5941 __kmp_free_implicit_task(this_th);
5942 this_th->th.th_current_task = NULL;
5943
5944 // If the __kmp_thread_pool_insert_pt is already past the new insert
5945 // point, then we need to re-scan the entire list.
5946 gtid = this_th->th.th_info.ds.ds_gtid;
5947 if (__kmp_thread_pool_insert_pt != NULL) {
5948 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950 __kmp_thread_pool_insert_pt = NULL;
5951 }
5952 }
5953
5954 // Scan down the list to find the place to insert the thread.
5955 // scan is the address of a link in the list, possibly the address of
5956 // __kmp_thread_pool itself.
5957 //
5958 // In the absence of nested parallelism, the for loop will have 0 iterations.
5959 if (__kmp_thread_pool_insert_pt != NULL) {
5960 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961 } else {
5962 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963 }
5964 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965 scan = &((*scan)->th.th_next_pool))
5966 ;
5967
5968 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969 // to its address.
5970 TCW_PTR(this_th->th.th_next_pool, *scan);
5971 __kmp_thread_pool_insert_pt = *scan = this_th;
5972 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973 (this_th->th.th_info.ds.ds_gtid <
5974 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975 TCW_4(this_th->th.th_in_pool, TRUE);
5976 __kmp_suspend_initialize_thread(this_th);
5977 __kmp_lock_suspend_mx(this_th);
5978 if (this_th->th.th_active == TRUE) {
5979 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980 this_th->th.th_active_in_pool = TRUE;
5981 }
5982#if KMP_DEBUG
5983 else {
5984 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985 }
5986#endif
5987 __kmp_unlock_suspend_mx(this_th);
5988
5989 TCW_4(__kmp_nth, __kmp_nth - 1);
5990
5991#ifdef KMP_ADJUST_BLOCKTIME
5992 /* Adjust blocktime back to user setting or default if necessary */
5993 /* Middle initialization might never have occurred */
5994 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996 if (__kmp_nth <= __kmp_avail_proc) {
5997 __kmp_zero_bt = FALSE;
5998 }
5999 }
6000#endif /* KMP_ADJUST_BLOCKTIME */
6001
6002 KMP_MB();
6003}
6004
6005/* ------------------------------------------------------------------------ */
6006
// Main routine of a worker thread.
//
// The thread loops until __kmp_global.g.g_done is set. Each iteration waits
// at the fork barrier; if the thread then has a team whose t_pkfn is
// non-NULL it runs the team's t_invoke routine, and in either case (as long
// as it has a team and shutdown has not started) it waits at the join
// barrier. After the loop it fires the OMPD/OMPT thread-end hooks, clears
// th_task_team and destroys the thread's threadprivate data.
//
// this_thr - descriptor of the calling worker thread.
// Returns this_thr.
6007void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008#if OMP_PROFILING_SUPPORT
// Time-trace profiling is enabled when LIBOMPTARGET_PROFILE is set in the
// environment.
6009 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010 // TODO: add a configuration option for time granularity
6011 if (ProfileTraceFile)
6012 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013#endif
6014
6015 int gtid = this_thr->th.th_info.ds.ds_gtid;
6016 /* void *stack_data;*/
6017 kmp_team_t **volatile pteam;
6018
6019 KMP_MB();
6020 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021
6022 if (__kmp_env_consistency_check) {
6023 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024 }
6025
6026#if OMPD_SUPPORT
6027 if (ompd_state & OMPD_ENABLE_BP)
6028 ompd_bp_thread_begin();
6029#endif
6030
6031#if OMPT_SUPPORT
// thread_data stays nullptr unless OMPT is enabled here; the same pointer is
// passed to the thread-end callback at the bottom of this function.
6032 ompt_data_t *thread_data = nullptr;
6033 if (ompt_enabled.enabled) {
6034 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035 *thread_data = ompt_data_none;
6036
6037 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038 this_thr->th.ompt_thread_info.wait_id = 0;
6039 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040 this_thr->th.ompt_thread_info.parallel_flags = 0;
6041 if (ompt_enabled.ompt_callback_thread_begin) {
6042 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043 ompt_thread_worker, thread_data);
6044 }
6045 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046 }
6047#endif
6048
6049 /* This is the place where threads wait for work */
6050 while (!TCR_4(__kmp_global.g.g_done)) {
6051 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052 KMP_MB();
6053
6054 /* wait for work to do */
6055 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056
6057 /* No tid yet since not part of a team */
6058 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059
6060#if OMPT_SUPPORT
6061 if (ompt_enabled.enabled) {
6062 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063 }
6064#endif
6065
// Re-read the team pointer through pteam after the barrier.
6066 pteam = &this_thr->th.th_team;
6067
6068 /* have we been allocated? */
6069 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070 /* we were just woken up, so run our new task */
6071 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072 int rc;
6073 KA_TRACE(20,
6074 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076 (*pteam)->t.t_pkfn));
6077
6078 updateHWFPControl(*pteam);
6079
6080#if OMPT_SUPPORT
6081 if (ompt_enabled.enabled) {
6082 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083 }
6084#endif
6085
// Run the team's invoke routine; a zero return is treated as fatal by
// KMP_ASSERT.
6086 rc = (*pteam)->t.t_invoke(gtid);
6087 KMP_ASSERT(rc);
6088
6089 KMP_MB();
6090 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092 (*pteam)->t.t_pkfn));
6093 }
6094#if OMPT_SUPPORT
6095 if (ompt_enabled.enabled) {
6096 /* no frame set while outside task */
6097 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098
6099 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100 }
6101#endif
6102 /* join barrier after parallel region */
6103 __kmp_join_barrier(gtid);
6104 }
6105 }
// NOTE(review): the value read here is discarded; presumably kept only as a
// synchronizing read of g_done -- confirm.
6106 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6107
6108#if OMPD_SUPPORT
6109 if (ompd_state & OMPD_ENABLE_BP)
6110 ompd_bp_thread_end();
6111#endif
6112
6113#if OMPT_SUPPORT
6114 if (ompt_enabled.ompt_callback_thread_end) {
6115 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6116 }
6117#endif
6118
6119 this_thr->th.th_task_team = NULL;
6120 /* run the destructors for the threadprivate data for this thread */
6121 __kmp_common_destroy_gtid(gtid);
6122
6123 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6124 KMP_MB();
6125
6126#if OMP_PROFILING_SUPPORT
6127 llvm::timeTraceProfilerFinishThread();
6128#endif
6129 return this_thr;
6130}
6131
6132/* ------------------------------------------------------------------------ */
6133
6134void __kmp_internal_end_dest(void *specific_gtid) {
6135 // Make sure no significant bits are lost
6136 int gtid;
6137 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6138
6139 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6140 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6141 * this is because 0 is reserved for the nothing-stored case */
6142
6143 __kmp_internal_end_thread(gtid);
6144}
6145
6146#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6147
// Library destructor (only built for the Unix dynamic library): forwards to
// the same handler that is used for atexit processing.
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}
6151
6152#endif
6153
/* [Windows] josh: when the atexit handler is called, there may still be more
   than one thread alive */
// Exit-time handler: shuts the library down through
// __kmp_internal_end_library(-1) (a negative request makes the callee look up
// the caller's gtid itself) and, on Windows, closes the console afterwards.
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shut down the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shut down the library.

     NOTE(review): the paragraph above looks stale -- the code below already
     calls __kmp_internal_end_library(-1), not __kmp_internal_end_thread.

     TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread. For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing. Thus, the workaround is applicable only for Windows static
     stat library. */
  __kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}
6186
// Destroys one thread descriptor and everything it owns.
//   thread  - descriptor to reap; must be non-NULL. It is freed before this
//             function returns, so the caller must not touch it afterwards.
//   is_root - when zero, the thread is first released from the fork barrier
//             (unless blocktime is KMP_MAX_BLOCKTIME) and its OS thread is
//             reaped; when nonzero that whole step is skipped.
// Side effects: clears __kmp_threads[gtid] and decrements __kmp_all_nth.
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Spin until th_used_in_team can be switched from 0 to 3, then resume
        // the thread.
        while (
            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
          KMP_CPU_PAUSE();
        __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
      } else {
        /* Need release fence here to prevent seg faults for tree forkjoin
           barrier (GEH) */
        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                           thread);
        __kmp_release_64(&flag);
      }
    }

    // Terminate OS thread.
    __kmp_reap_worker(thread);

    // The thread was killed asynchronously.  If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
    //
    // Currently, this can only happen as the library is unloaded,
    // so there are no harmful side effects.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
    }
  }

  __kmp_free_implicit_task(thread);

// Free the fast memory for tasking
#if USE_FAST_MEMORY
  __kmp_free_fast_memory(thread);
#endif /* USE_FAST_MEMORY */

  __kmp_suspend_uninitialize_thread(thread);

  // Remove the descriptor from the global thread table before freeing it.
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  --__kmp_all_nth;
  // __kmp_nth was decremented when thread is added to the pool.

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred                */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* free the memory being used */
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(thread->th.th_cons);
      thread->th.th_cons = NULL;
    }
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }

  if (thread->th.th_task_state_memo_stack != NULL) {
    __kmp_free(thread->th.th_task_state_memo_stack);
    thread->th.th_task_state_memo_stack = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(thread);
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_HIER_SCHED
  if (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  // The serial team is reaped last, then the descriptor itself is released.
  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread
6307
6308static void __kmp_itthash_clean(kmp_info_t *th) {
6309#if USE_ITT_NOTIFY
6310 if (__kmp_itt_region_domains.count > 0) {
6311 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6312 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6313 while (bucket) {
6314 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6315 __kmp_thread_free(th, bucket);
6316 bucket = next;
6317 }
6318 }
6319 }
6320 if (__kmp_itt_barrier_domains.count > 0) {
6321 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6322 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6323 while (bucket) {
6324 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6325 __kmp_thread_free(th, bucket);
6326 bucket = next;
6327 }
6328 }
6329 }
6330#endif
6331}
6332
// Performs the actual runtime shutdown. The callers visible in this file
// (__kmp_internal_end_library / __kmp_internal_end_thread) invoke it while
// holding both __kmp_initz_lock and __kmp_forkjoin_lock.
//
// Sequence:
//   1. unregister the library (and, on Windows, reclaim dead roots);
//   2. look for a root that is still active and set g_done;
//   3. if an active root was found, only the monitor thread is reaped (when
//      KMP_USE_MONITOR is set); otherwise pooled worker threads, pooled teams
//      and task teams are reaped, followed by the monitor;
//   4. clear __kmp_init_gtid, run __kmp_cleanup() and finalize OMPT.
static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies.  */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
// dead roots
#endif

  // Stop at the first root that is still active; i == __kmp_threads_capacity
  // afterwards means no active root exists.
  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates.  */
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
    // __kmp_monitor will appear to contain valid data, but it is only valid in
    // the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
/* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
        //                    there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, 0);
    }
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }

    __kmp_reap_task_teams();

#if KMP_OS_UNIX
    // Threads that are not reaped should not access any resources since they
    // are going to be deallocated soon, so the shutdown sequence should wait
    // until all threads either exit the final spin-waiting loop or begin
    // sleeping after the given blocktime.
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thr = __kmp_threads[i];
      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
        KMP_CPU_PAUSE();
    }
#endif

    // Placeholder loop: intentionally empty until the check below is written.
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
  } /* else !__kmp_global.t_active */
  TCW_4(__kmp_init_gtid, FALSE);
  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_cleanup();
#if OMPT_SUPPORT
  ompt_fini();
#endif
}
6462
// Library-level shutdown entry point.
//   gtid_req - gtid of the calling thread if known (>= 0); a negative value
//              makes the function obtain it from __kmp_gtid_get_specific().
//
// Behaviour by caller identity:
//   * KMP_GTID_SHUTDOWN / KMP_GTID_MONITOR: return immediately;
//   * KMP_GTID_DNE: identity unknown, but the library is still shut down;
//   * uber (root) thread with an active root: mark abort + done, unregister
//     the library and return;
//   * uber thread with an inactive root: clean ITT hashes, unregister the
//     root, then continue with the shutdown;
//   * worker thread: only unregister the library (and optionally dump the
//     debug buffer) and return.
// The actual termination runs in __kmp_internal_end() with __kmp_initz_lock
// and __kmp_forkjoin_lock held.
void __kmp_internal_end_library(int gtid_req) {
  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */
  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* we don't know who we are, but we may still shutdown the library */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // A parallel region is still running under this root: flag abort and
        // done, drop the registration, and leave without tearing anything down.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        __kmp_unregister_library();
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        __kmp_itthash_clean(__kmp_threads[gtid]);
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
/* worker threads may call this function through the atexit handler, if they
 * call exit() */
/* For now, skip the usual subsequent processing and just dump the debug buffer.
   TODO: do a thorough shutdown instead */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      // added unregister library call here when we switch to shm linux
      // if we don't, it will leave lots of files in /dev/shm
      // cleanup shared memory file before exiting.
      __kmp_unregister_library();
      return;
    }
  }
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  /* now we can safely conduct the actual termination */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();

} // __kmp_internal_end_library
6583
// Thread-level shutdown entry point, run when a thread goes away.
//   gtid_req - gtid of the thread if known (>= 0); a negative value makes the
//              function obtain it from __kmp_gtid_get_specific().
//
// Behaviour by caller identity:
//   * KMP_GTID_SHUTDOWN / KMP_GTID_MONITOR / KMP_GTID_DNE: return immediately;
//   * uber (root) thread with an active root: mark abort + done and return;
//   * uber thread with an inactive root: unregister the root and continue;
//   * worker thread: clear its th_task_team and return.
// For the dynamic library the function then returns unless the runtime is
// hard-paused. Otherwise, with __kmp_initz_lock and __kmp_forkjoin_lock held,
// the runtime is terminated through __kmp_internal_end() only when no uber
// thread remains registered.
void __kmp_internal_end_thread(int gtid_req) {
  int i;

  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
      /* we don't know who we are */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // A parallel region is still running under this root: flag abort and
        // done, and leave without tearing anything down.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* just a worker thread, let's leave */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));

      if (gtid >= 0) {
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }

      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if KMP_DYNAMIC_LIB
  if (__kmp_pause_status != kmp_hard_paused)
  // AC: lets not shutdown the dynamic library at the exit of uber thread,
  // because we will better shutdown later in the library destructor.
  {
    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
    return;
  }
#endif
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */

  /* should we finish the run-time?  are all siblings done? */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  // Any remaining uber thread means another root is still registered: release
  // both locks and leave the runtime alive.
  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i)) {
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
      return;
    }
  }

  /* now we can safely conduct the actual termination */

  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
} // __kmp_internal_end_thread
6719
6720// -----------------------------------------------------------------------------
6721// Library registration stuff.
6722
// Random value used to indicate library initialization. Its address and value
// are both encoded into __kmp_registration_str; zero means "not registered".
static long __kmp_registration_flag = 0;
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid> (or in the shared
// memory / temporary file when KMP_USE_SHM is defined). Allocated by
// __kmp_register_library_startup and released by __kmp_unregister_library.
static char *__kmp_registration_str = NULL;
6727
6728static inline char *__kmp_reg_status_name() {
6729/* On RHEL 3u5 if linked statically, getpid() returns different values in
6730 each thread. If registration and unregistration go in different threads
6731 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6732 env var can not be found, because the name will contain different pid. */
6733// macOS* complains about name being too long with additional getuid()
6734#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6735 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6736 (int)getuid());
6737#else
6738 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6739#endif
6740} // __kmp_reg_status_get
6741
6742#if defined(KMP_USE_SHM)
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
// This holds that file's path: it is set by __kmp_register_library_startup
// when the mkstemp() fallback is taken, and is read, unlinked and freed by
// __kmp_unregister_library. It stays nullptr while shared memory is in use.
char *temp_reg_status_file_name = nullptr;
6745#endif
6746
6747void __kmp_register_library_startup(void) {
6748
6749 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6750 int done = 0;
6751 union {
6752 double dtime;
6753 long ltime;
6754 } time;
6755#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6756 __kmp_initialize_system_tick();
6757#endif
6758 __kmp_read_system_time(&time.dtime);
6759 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6760 __kmp_registration_str =
6761 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6762 __kmp_registration_flag, KMP_LIBRARY_FILE);
6763
6764 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6765 __kmp_registration_str));
6766
6767 while (!done) {
6768
6769 char *value = NULL; // Actual value of the environment variable.
6770
6771#if defined(KMP_USE_SHM)
6772 char *shm_name = __kmp_str_format("/%s", name);
6773 int shm_preexist = 0;
6774 char *data1;
6775 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6776 if ((fd1 == -1) && (errno == EEXIST)) {
6777 // file didn't open because it already exists.
6778 // try opening existing file
6779 fd1 = shm_open(shm_name, O_RDWR, 0666);
6780 if (fd1 == -1) { // file didn't open
6781 // error out here
6782 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6783 __kmp_msg_null);
6784 } else {
6785 // able to open existing file
6786 shm_preexist = 1;
6787 }
6788 } else if (fd1 == -1) {
6789 // SHM didn't open; it was due to error other than already exists. Try to
6790 // create a temp file under /tmp.
6791 // TODO: /tmp might not always be the temporary directory. For now we will
6792 // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6793 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6794 fd1 = mkstemp(temp_file_name);
6795 if (fd1 == -1) {
6796 // error out here.
6797 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6798 __kmp_msg_null);
6799 }
6800 temp_reg_status_file_name = temp_file_name;
6801 }
6802 if (shm_preexist == 0) {
6803 // we created SHM now set size
6804 if (ftruncate(fd1, SHM_SIZE) == -1) {
6805 // error occured setting size;
6806 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6807 KMP_ERR(errno), __kmp_msg_null);
6808 }
6809 }
6810 data1 =
6811 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6812 if (data1 == MAP_FAILED) {
6813 // failed to map shared memory
6814 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6815 __kmp_msg_null);
6816 }
6817 if (shm_preexist == 0) { // set data to SHM, set value
6818 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6819 }
6820 // Read value from either what we just wrote or existing file.
6821 value = __kmp_str_format("%s", data1); // read value from SHM
6822 munmap(data1, SHM_SIZE);
6823 close(fd1);
6824#else // Windows and unix with static library
6825 // Set environment variable, but do not overwrite if it is exist.
6826 __kmp_env_set(name, __kmp_registration_str, 0);
6827 // read value to see if it got set
6828 value = __kmp_env_get(name);
6829#endif
6830
6831 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6832 done = 1; // Ok, environment variable set successfully, exit the loop.
6833 } else {
6834 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6835 // Check whether it alive or dead.
6836 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6837 char *tail = value;
6838 char *flag_addr_str = NULL;
6839 char *flag_val_str = NULL;
6840 char const *file_name = NULL;
6841 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6842 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6843 file_name = tail;
6844 if (tail != NULL) {
6845 unsigned long *flag_addr = 0;
6846 unsigned long flag_val = 0;
6847 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6848 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6849 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6850 // First, check whether environment-encoded address is mapped into
6851 // addr space.
6852 // If so, dereference it to see if it still has the right value.
6853 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6854 neighbor = 1;
6855 } else {
6856 // If not, then we know the other copy of the library is no longer
6857 // running.
6858 neighbor = 2;
6859 }
6860 }
6861 }
6862 switch (neighbor) {
6863 case 0: // Cannot parse environment variable -- neighbor status unknown.
6864 // Assume it is the incompatible format of future version of the
6865 // library. Assume the other library is alive.
6866 // WARN( ... ); // TODO: Issue a warning.
6867 file_name = "unknown library";
6868 KMP_FALLTHROUGH();
6869 // Attention! Falling to the next case. That's intentional.
6870 case 1: { // Neighbor is alive.
6871 // Check it is allowed.
6872 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6873 if (!__kmp_str_match_true(duplicate_ok)) {
6874 // That's not allowed. Issue fatal error.
6875 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6876 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6877 }
6878 KMP_INTERNAL_FREE(duplicate_ok);
6879 __kmp_duplicate_library_ok = 1;
6880 done = 1; // Exit the loop.
6881 } break;
6882 case 2: { // Neighbor is dead.
6883
6884#if defined(KMP_USE_SHM)
6885 // close shared memory.
6886 shm_unlink(shm_name); // this removes file in /dev/shm
6887#else
6888 // Clear the variable and try to register library again.
6889 __kmp_env_unset(name);
6890#endif
6891 } break;
6892 default: {
6893 KMP_DEBUG_ASSERT(0);
6894 } break;
6895 }
6896 }
6897 KMP_INTERNAL_FREE((void *)value);
6898#if defined(KMP_USE_SHM)
6899 KMP_INTERNAL_FREE((void *)shm_name);
6900#endif
6901 } // while
6902 KMP_INTERNAL_FREE((void *)name);
6903
6904} // func __kmp_register_library_startup
6905
6906void __kmp_unregister_library(void) {
6907
6908 char *name = __kmp_reg_status_name();
6909 char *value = NULL;
6910
6911#if defined(KMP_USE_SHM)
6912 bool use_shm = true;
6913 char *shm_name = __kmp_str_format("/%s", name);
6914 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6915 if (fd1 == -1) {
6916 // File did not open. Try the temporary file.
6917 use_shm = false;
6918 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6919 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6920 if (fd1 == -1) {
6921 // give it up now.
6922 return;
6923 }
6924 }
6925 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6926 if (data1 != MAP_FAILED) {
6927 value = __kmp_str_format("%s", data1); // read value from SHM
6928 munmap(data1, SHM_SIZE);
6929 }
6930 close(fd1);
6931#else
6932 value = __kmp_env_get(name);
6933#endif
6934
6935 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6936 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6937 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6938// Ok, this is our variable. Delete it.
6939#if defined(KMP_USE_SHM)
6940 if (use_shm) {
6941 shm_unlink(shm_name); // this removes file in /dev/shm
6942 } else {
6943 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6944 unlink(temp_reg_status_file_name); // this removes the temp file
6945 }
6946#else
6947 __kmp_env_unset(name);
6948#endif
6949 }
6950
6951#if defined(KMP_USE_SHM)
6952 KMP_INTERNAL_FREE(shm_name);
6953 if (!use_shm) {
6954 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6955 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6956 }
6957#endif
6958
6959 KMP_INTERNAL_FREE(__kmp_registration_str);
6960 KMP_INTERNAL_FREE(value);
6961 KMP_INTERNAL_FREE(name);
6962
6963 __kmp_registration_flag = 0;
6964 __kmp_registration_str = NULL;
6965
6966} // __kmp_unregister_library
6967
6968// End of Library registration stuff.
6969// -----------------------------------------------------------------------------
6970
6971#if KMP_MIC_SUPPORTED
6972
6973static void __kmp_check_mic_type() {
6974 kmp_cpuid_t cpuid_state = {0};
6975 kmp_cpuid_t *cs_p = &cpuid_state;
6976 __kmp_x86_cpuid(1, 0, cs_p);
6977 // We don't support mic1 at the moment
6978 if ((cs_p->eax & 0xff0) == 0xB10) {
6979 __kmp_mic_type = mic2;
6980 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6981 __kmp_mic_type = mic3;
6982 } else {
6983 __kmp_mic_type = non_mic;
6984 }
6985}
6986
6987#endif /* KMP_MIC_SUPPORTED */
6988
6989#if KMP_HAVE_UMWAIT
6990static void __kmp_user_level_mwait_init() {
6991 struct kmp_cpuid buf;
6992 __kmp_x86_cpuid(7, 0, &buf);
6993 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6994 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6995 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6996 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6997 __kmp_umwait_enabled));
6998}
6999#elif KMP_HAVE_MWAIT
#ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value when we know that.
#define AT_INTELPHIUSERMWAIT 10000
#endif
// getauxval() function is available in RHEL7 and SLES12. If a system with an
// earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
// The declaration carries KMP_WEAK_ATTRIBUTE_EXTERNAL; the fallback definition
// below always returns 0, i.e. reports "no such auxiliary entry".
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
7010
7011static void __kmp_user_level_mwait_init() {
7012 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7013 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7014 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7015 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7016 if (__kmp_mic_type == mic3) {
7017 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7018 if ((res & 0x1) || __kmp_user_level_mwait) {
7019 __kmp_mwait_enabled = TRUE;
7020 if (__kmp_user_level_mwait) {
7021 KMP_INFORM(EnvMwaitWarn);
7022 }
7023 } else {
7024 __kmp_mwait_enabled = FALSE;
7025 }
7026 }
7027 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7028 "__kmp_mwait_enabled = %d\n",
7029 __kmp_mic_type, __kmp_mwait_enabled));
7030}
7031#endif /* KMP_HAVE_UMWAIT */
7032
// Performs the serial phase of runtime start-up. In order, it:
//  - runs the OMPT/OMPD pre-initialization hooks when they are compiled in;
//  - validates the lock implementation and initializes the internal allocator;
//  - registers the library when __kmp_need_register_serial is set (otherwise
//    registration is left to middle initialization);
//  - clears the global abort/done flags and initializes the global, atomic and
//    bootstrap locks;
//  - seeds default values (thread limits, blocktime, library mode, schedule,
//    barrier patterns, consistency checks) and then calls
//    __kmp_env_initialize();
//  - allocates the __kmp_threads/__kmp_root tables and registers the initial
//    root thread;
//  - installs the atfork, atexit and signal handlers where configured;
//  - bumps __kmp_init_counter and sets __kmp_init_serial.
// The callers visible in this file invoke it while holding __kmp_initz_lock.
7033static void __kmp_do_serial_initialize(void) {
7034 int i, gtid;
7035 size_t size;
7036
7037 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7038
 // Sanity-check the fixed-width typedefs the runtime relies on (debug only).
7039 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7040 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7041 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7042 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7043 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7044
7045#if OMPT_SUPPORT
7046 ompt_pre_init();
7047#endif
7048#if OMPD_SUPPORT
7049 __kmp_env_dump();
7050 ompd_init();
7051#endif
7052
7053 __kmp_validate_locks();
7054
7055#if ENABLE_LIBOMPTARGET
7056 /* Initialize functions from libomptarget */
7057 __kmp_init_omptarget();
7058#endif
7059
7060 /* Initialize internal memory allocator */
7061 __kmp_init_allocator();
7062
7063 /* Register the library startup via an environment variable or via mapped
7064 shared memory file and check to see whether another copy of the library is
7065 already registered. Since forked child process is often terminated, we
7066 postpone the registration till middle initialization in the child */
7067 if (__kmp_need_register_serial)
7068 __kmp_register_library_startup();
7069
7070 /* TODO reinitialization of library */
7071 if (TCR_4(__kmp_global.g.g_done)) {
7072 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7073 }
7074
7075 __kmp_global.g.g_abort = 0;
7076 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7077
7078/* initialize the locks */
7079#if KMP_USE_ADAPTIVE_LOCKS
7080#if KMP_DEBUG_ADAPTIVE_LOCKS
7081 __kmp_init_speculative_stats();
7082#endif
7083#endif
7084#if KMP_STATS_ENABLED
7085 __kmp_stats_init();
7086#endif
7087 __kmp_init_lock(&__kmp_global_lock);
7088 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7089 __kmp_init_lock(&__kmp_debug_lock);
7090 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7091 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7092 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7093 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7094 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7095 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7096 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7097 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7098 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7099 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7100 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7101 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7102 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7103 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7104 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7105#if KMP_USE_MONITOR
7106 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7107#endif
7108 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7109
7110 /* conduct initialization and initial setup of configuration */
7111
7112 __kmp_runtime_initialize();
7113
7114#if KMP_MIC_SUPPORTED
7115 __kmp_check_mic_type();
7116#endif
7117
7118// Some global variable initialization moved here from kmp_env_initialize()
7119#ifdef KMP_DEBUG
7120 kmp_diag = 0;
7121#endif
7122 __kmp_abort_delay = 0;
7123
7124 // From __kmp_init_dflt_team_nth()
7125 /* assume the entire machine will be used */
 // The default team-size upper bound is __kmp_xproc clamped into
 // [KMP_MIN_NTH, __kmp_sys_max_nth].
7126 __kmp_dflt_team_nth_ub = __kmp_xproc;
7127 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7128 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7129 }
7130 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7131 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7132 }
7133 __kmp_max_nth = __kmp_sys_max_nth;
7134 __kmp_cg_max_nth = __kmp_sys_max_nth;
7135 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7136 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7137 __kmp_teams_max_nth = __kmp_sys_max_nth;
7138 }
7139
7140 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7141 // part
7142 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7143#if KMP_USE_MONITOR
7144 __kmp_monitor_wakeups =
7145 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7146 __kmp_bt_intervals =
7147 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7148#endif
7149 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7150 __kmp_library = library_throughput;
7151 // From KMP_SCHEDULE initialization
7152 __kmp_static = kmp_sch_static_balanced;
7153// AC: do not use analytical here, because it is non-monotonous
7154//__kmp_guided = kmp_sch_guided_iterative_chunked;
7155//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7156// need to repeat assignment
7157// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7158// bit control and barrier method control parts
7159#if KMP_FAST_REDUCTION_BARRIER
7160#define kmp_reduction_barrier_gather_bb ((int)1)
7161#define kmp_reduction_barrier_release_bb ((int)1)
7162#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7163#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7164#endif // KMP_FAST_REDUCTION_BARRIER
 // Give every barrier type the default branch bits and patterns; the
 // reduction barrier is then overridden with the local macros defined above
 // when KMP_FAST_REDUCTION_BARRIER is enabled.
7165 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7166 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7167 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7168 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7169 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7170#if KMP_FAST_REDUCTION_BARRIER
7171 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7172 // lin_64 ): hyper,1
7173 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7174 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7175 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7176 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7177 }
7178#endif // KMP_FAST_REDUCTION_BARRIER
7179 }
7180#if KMP_FAST_REDUCTION_BARRIER
7181#undef kmp_reduction_barrier_release_pat
7182#undef kmp_reduction_barrier_gather_pat
7183#undef kmp_reduction_barrier_release_bb
7184#undef kmp_reduction_barrier_gather_bb
7185#endif // KMP_FAST_REDUCTION_BARRIER
7186#if KMP_MIC_SUPPORTED
7187 if (__kmp_mic_type == mic2) { // KNC
7188 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7189 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7190 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7191 1; // forkjoin release
7192 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7193 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7194 }
7195#if KMP_FAST_REDUCTION_BARRIER
7196 if (__kmp_mic_type == mic2) { // KNC
7197 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7198 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7199 }
7200#endif // KMP_FAST_REDUCTION_BARRIER
7201#endif // KMP_MIC_SUPPORTED
7202
7203// From KMP_CHECKS initialization
7204#ifdef KMP_DEBUG
7205 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7206#else
7207 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7208#endif
7209
7210 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7211 __kmp_foreign_tp = TRUE;
7212
7213 __kmp_global.g.g_dynamic = FALSE;
7214 __kmp_global.g.g_dynamic_mode = dynamic_default;
7215
7216 __kmp_init_nesting_mode();
7217
 // All defaults above are assigned before the environment is processed.
 // NOTE(review): presumably so that environment settings can override them --
 // confirm in __kmp_env_initialize().
7218 __kmp_env_initialize(NULL);
7219
7220#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7221 __kmp_user_level_mwait_init();
7222#endif
7223// Print all messages in message catalog for testing purposes.
7224#ifdef KMP_DEBUG
7225 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7226 if (__kmp_str_match_true(val)) {
7227 kmp_str_buf_t buffer;
7228 __kmp_str_buf_init(&buffer);
7229 __kmp_i18n_dump_catalog(&buffer);
7230 __kmp_printf("%s", buffer.str);
7231 __kmp_str_buf_free(&buffer);
7232 }
7233 __kmp_env_free(&val);
7234#endif
7235
7236 __kmp_threads_capacity =
7237 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7238 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7239 __kmp_tp_capacity = __kmp_default_tp_capacity(
7240 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7241
7242 // If the library is shut down properly, both pools must be NULL. Just in
7243 // case, set them to NULL -- some memory may leak, but subsequent code will
7244 // work even if pools are not freed.
7245 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7246 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7247 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7248 __kmp_thread_pool = NULL;
7249 __kmp_thread_pool_insert_pt = NULL;
7250 __kmp_team_pool = NULL;
7251
7252 /* Allocate all of the variable sized records */
7253 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7254 * expandable */
7255 /* Since allocation is cache-aligned, just add extra padding at the end */
 // One allocation backs both tables: __kmp_root starts in the same buffer
 // immediately after the __kmp_threads_capacity thread-pointer slots.
7256 size =
7257 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7258 CACHE_LINE;
7259 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7260 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7261 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7262
7263 /* init thread counts */
7264 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7265 0); // Asserts fail if the library is reinitializing and
7266 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7267 __kmp_all_nth = 0;
7268 __kmp_nth = 0;
7269
7270 /* setup the uber master thread and hierarchy */
7271 gtid = __kmp_register_root(TRUE);
7272 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7273 KMP_ASSERT(KMP_UBER_GTID(gtid));
7274 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7275
7276 KMP_MB(); /* Flush all pending memory write invalidates. */
7277
7278 __kmp_common_initialize();
7279
7280#if KMP_OS_UNIX
7281 /* invoke the child fork handler */
7282 __kmp_register_atfork();
7283#endif
7284
7285#if !KMP_DYNAMIC_LIB || \
7286 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7287 {
7288 /* Invoke the exit handler when the program finishes, only for static
7289 library and macOS* dynamic. For other dynamic libraries, we already
7290 have _fini and DllMain. */
7291 int rc = atexit(__kmp_internal_end_atexit);
7292 if (rc != 0) {
7293 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7294 __kmp_msg_null);
7295 }
7296 }
7297#endif
7298
7299#if KMP_HANDLE_SIGNALS
7300#if KMP_OS_UNIX
7301 /* NOTE: make sure that this is called before the user installs their own
7302 signal handlers so that the user handlers are called first. this way they
7303 can return false, not call our handler, avoid terminating the library, and
7304 continue execution where they left off. */
7305 __kmp_install_signals(FALSE);
7306#endif /* KMP_OS_UNIX */
7307#if KMP_OS_WINDOWS
7308 __kmp_install_signals(TRUE);
7309#endif /* KMP_OS_WINDOWS */
7310#endif
7311
7312 /* we have finished the serial initialization */
7313 __kmp_init_counter++;
7314
7315 __kmp_init_serial = TRUE;
7316
7317 if (__kmp_settings) {
7318 __kmp_env_print();
7319 }
7320
7321 if (__kmp_display_env || __kmp_display_env_verbose) {
7322 __kmp_env_print_2();
7323 }
7324
7325#if OMPT_SUPPORT
7326 ompt_post_init();
7327#endif
7328
7329 KMP_MB();
7330
7331 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7332}
7333
7334void __kmp_serial_initialize(void) {
7335 if (__kmp_init_serial) {
7336 return;
7337 }
7338 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7339 if (__kmp_init_serial) {
7340 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7341 return;
7342 }
7343 __kmp_do_serial_initialize();
7344 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7345}
7346
7347static void __kmp_do_middle_initialize(void) {
7348 int i, j;
7349 int prev_dflt_team_nth;
7350
7351 if (!__kmp_init_serial) {
7352 __kmp_do_serial_initialize();
7353 }
7354
7355 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7356
7357 if (UNLIKELY(!__kmp_need_register_serial)) {
7358 // We are in a forked child process. The registration was skipped during
7359 // serial initialization in __kmp_atfork_child handler. Do it here.
7360 __kmp_register_library_startup();
7361 }
7362
7363 // Save the previous value for the __kmp_dflt_team_nth so that
7364 // we can avoid some reinitialization if it hasn't changed.
7365 prev_dflt_team_nth = __kmp_dflt_team_nth;
7366
7367#if KMP_AFFINITY_SUPPORTED
7368 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7369 // number of cores on the machine.
7370 __kmp_affinity_initialize(__kmp_affinity);
7371
7372#endif /* KMP_AFFINITY_SUPPORTED */
7373
7374 KMP_ASSERT(__kmp_xproc > 0);
7375 if (__kmp_avail_proc == 0) {
7376 __kmp_avail_proc = __kmp_xproc;
7377 }
7378
7379 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7380 // correct them now
7381 j = 0;
7382 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7383 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7384 __kmp_avail_proc;
7385 j++;
7386 }
7387
7388 if (__kmp_dflt_team_nth == 0) {
7389#ifdef KMP_DFLT_NTH_CORES
7390 // Default #threads = #cores
7391 __kmp_dflt_team_nth = __kmp_ncores;
7392 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7393 "__kmp_ncores (%d)\n",
7394 __kmp_dflt_team_nth));
7395#else
7396 // Default #threads = #available OS procs
7397 __kmp_dflt_team_nth = __kmp_avail_proc;
7398 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7399 "__kmp_avail_proc(%d)\n",
7400 __kmp_dflt_team_nth));
7401#endif /* KMP_DFLT_NTH_CORES */
7402 }
7403
7404 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7405 __kmp_dflt_team_nth = KMP_MIN_NTH;
7406 }
7407 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7408 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7409 }
7410
7411 if (__kmp_nesting_mode > 0)
7412 __kmp_set_nesting_mode_threads();
7413
7414 // There's no harm in continuing if the following check fails,
7415 // but it indicates an error in the previous logic.
7416 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7417
7418 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7419 // Run through the __kmp_threads array and set the num threads icv for each
7420 // root thread that is currently registered with the RTL (which has not
7421 // already explicitly set its nthreads-var with a call to
7422 // omp_set_num_threads()).
7423 for (i = 0; i < __kmp_threads_capacity; i++) {
7424 kmp_info_t *thread = __kmp_threads[i];
7425 if (thread == NULL)
7426 continue;
7427 if (thread->th.th_current_task->td_icvs.nproc != 0)
7428 continue;
7429
7430 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7431 }
7432 }
7433 KA_TRACE(
7434 20,
7435 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7436 __kmp_dflt_team_nth));
7437
7438#ifdef KMP_ADJUST_BLOCKTIME
7439 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7440 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7441 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7442 if (__kmp_nth > __kmp_avail_proc) {
7443 __kmp_zero_bt = TRUE;
7444 }
7445 }
7446#endif /* KMP_ADJUST_BLOCKTIME */
7447
7448 /* we have finished middle initialization */
7449 TCW_SYNC_4(__kmp_init_middle, TRUE);
7450
7451 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7452}
7453
7454void __kmp_middle_initialize(void) {
7455 if (__kmp_init_middle) {
7456 return;
7457 }
7458 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7459 if (__kmp_init_middle) {
7460 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7461 return;
7462 }
7463 __kmp_do_middle_initialize();
7464 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7465}
7466
7467void __kmp_parallel_initialize(void) {
7468 int gtid = __kmp_entry_gtid(); // this might be a new root
7469
7470 /* synchronize parallel initialization (for sibling) */
7471 if (TCR_4(__kmp_init_parallel))
7472 return;
7473 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7474 if (TCR_4(__kmp_init_parallel)) {
7475 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7476 return;
7477 }
7478
7479 /* TODO reinitialization after we have already shut down */
7480 if (TCR_4(__kmp_global.g.g_done)) {
7481 KA_TRACE(
7482 10,
7483 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7484 __kmp_infinite_loop();
7485 }
7486
7487 /* jc: The lock __kmp_initz_lock is already held, so calling
7488 __kmp_serial_initialize would cause a deadlock. So we call
7489 __kmp_do_serial_initialize directly. */
7490 if (!__kmp_init_middle) {
7491 __kmp_do_middle_initialize();
7492 }
7493 __kmp_assign_root_init_mask();
7494 __kmp_resume_if_hard_paused();
7495
7496 /* begin initialization */
7497 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7498 KMP_ASSERT(KMP_UBER_GTID(gtid));
7499
7500#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7501 // Save the FP control regs.
7502 // Worker threads will set theirs to these values at thread startup.
7503 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7504 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7505 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7506#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7507
7508#if KMP_OS_UNIX
7509#if KMP_HANDLE_SIGNALS
7510 /* must be after __kmp_serial_initialize */
7511 __kmp_install_signals(TRUE);
7512#endif
7513#endif
7514
7515 __kmp_suspend_initialize();
7516
7517#if defined(USE_LOAD_BALANCE)
7518 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7519 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7520 }
7521#else
7522 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7523 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7524 }
7525#endif
7526
7527 if (__kmp_version) {
7528 __kmp_print_version_2();
7529 }
7530
7531 /* we have finished parallel initialization */
7532 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7533
7534 KMP_MB();
7535 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7536
7537 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538}
7539
7540void __kmp_hidden_helper_initialize() {
7541 if (TCR_4(__kmp_init_hidden_helper))
7542 return;
7543
7544 // __kmp_parallel_initialize is required before we initialize hidden helper
7545 if (!TCR_4(__kmp_init_parallel))
7546 __kmp_parallel_initialize();
7547
7548 // Double check. Note that this double check should not be placed before
7549 // __kmp_parallel_initialize as it will cause dead lock.
7550 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7551 if (TCR_4(__kmp_init_hidden_helper)) {
7552 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7553 return;
7554 }
7555
7556#if KMP_AFFINITY_SUPPORTED
7557 // Initialize hidden helper affinity settings.
7558 // The above __kmp_parallel_initialize() will initialize
7559 // regular affinity (and topology) if not already done.
7560 if (!__kmp_hh_affinity.flags.initialized)
7561 __kmp_affinity_initialize(__kmp_hh_affinity);
7562#endif
7563
7564 // Set the count of hidden helper tasks to be executed to zero
7565 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7566
7567 // Set the global variable indicating that we're initializing hidden helper
7568 // team/threads
7569 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7570
7571 // Platform independent initialization
7572 __kmp_do_initialize_hidden_helper_threads();
7573
7574 // Wait here for the finish of initialization of hidden helper teams
7575 __kmp_hidden_helper_threads_initz_wait();
7576
7577 // We have finished hidden helper initialization
7578 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7579
7580 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7581}
7582
7583/* ------------------------------------------------------------------------ */
7584
7585void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7586 kmp_team_t *team) {
7587 kmp_disp_t *dispatch;
7588
7589 KMP_MB();
7590
7591 /* none of the threads have encountered any constructs, yet. */
7592 this_thr->th.th_local.this_construct = 0;
7593#if KMP_CACHE_MANAGE
7594 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7595#endif /* KMP_CACHE_MANAGE */
7596 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7597 KMP_DEBUG_ASSERT(dispatch);
7598 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7599 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7600 // this_thr->th.th_info.ds.ds_tid ] );
7601
7602 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7603 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7604 if (__kmp_env_consistency_check)
7605 __kmp_push_parallel(gtid, team->t.t_ident);
7606
7607 KMP_MB(); /* Flush all pending memory write invalidates. */
7608}
7609
7610void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7611 kmp_team_t *team) {
7612 if (__kmp_env_consistency_check)
7613 __kmp_pop_parallel(gtid, team->t.t_ident);
7614
7615 __kmp_finish_implicit_task(this_thr);
7616}
7617
7618int __kmp_invoke_task_func(int gtid) {
7619 int rc;
7620 int tid = __kmp_tid_from_gtid(gtid);
7621 kmp_info_t *this_thr = __kmp_threads[gtid];
7622 kmp_team_t *team = this_thr->th.th_team;
7623
7624 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7625#if USE_ITT_BUILD
7626 if (__itt_stack_caller_create_ptr) {
7627 // inform ittnotify about entering user's code
7628 if (team->t.t_stack_id != NULL) {
7629 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7630 } else {
7631 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7632 __kmp_itt_stack_callee_enter(
7633 (__itt_caller)team->t.t_parent->t.t_stack_id);
7634 }
7635 }
7636#endif /* USE_ITT_BUILD */
7637#if INCLUDE_SSC_MARKS
7638 SSC_MARK_INVOKING();
7639#endif
7640
7641#if OMPT_SUPPORT
7642 void *dummy;
7643 void **exit_frame_p;
7644 ompt_data_t *my_task_data;
7645 ompt_data_t *my_parallel_data;
7646 int ompt_team_size;
7647
7648 if (ompt_enabled.enabled) {
7649 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7650 .ompt_task_info.frame.exit_frame.ptr);
7651 } else {
7652 exit_frame_p = &dummy;
7653 }
7654
7655 my_task_data =
7656 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7657 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7658 if (ompt_enabled.ompt_callback_implicit_task) {
7659 ompt_team_size = team->t.t_nproc;
7660 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7661 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7662 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7663 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7664 }
7665#endif
7666
7667#if KMP_STATS_ENABLED
7668 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7669 if (previous_state == stats_state_e::TEAMS_REGION) {
7670 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7671 } else {
7672 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7673 }
7674 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7675#endif
7676
7677 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7678 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7679#if OMPT_SUPPORT
7680 ,
7681 exit_frame_p
7682#endif
7683 );
7684#if OMPT_SUPPORT
7685 *exit_frame_p = NULL;
7686 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7687#endif
7688
7689#if KMP_STATS_ENABLED
7690 if (previous_state == stats_state_e::TEAMS_REGION) {
7691 KMP_SET_THREAD_STATE(previous_state);
7692 }
7693 KMP_POP_PARTITIONED_TIMER();
7694#endif
7695
7696#if USE_ITT_BUILD
7697 if (__itt_stack_caller_create_ptr) {
7698 // inform ittnotify about leaving user's code
7699 if (team->t.t_stack_id != NULL) {
7700 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7701 } else {
7702 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7703 __kmp_itt_stack_callee_leave(
7704 (__itt_caller)team->t.t_parent->t.t_stack_id);
7705 }
7706 }
7707#endif /* USE_ITT_BUILD */
7708 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7709
7710 return rc;
7711}
7712
7713void __kmp_teams_master(int gtid) {
7714 // This routine is called by all primary threads in teams construct
7715 kmp_info_t *thr = __kmp_threads[gtid];
7716 kmp_team_t *team = thr->th.th_team;
7717 ident_t *loc = team->t.t_ident;
7718 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7719 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7720 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7721 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7722 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7723
7724 // This thread is a new CG root. Set up the proper variables.
7725 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7726 tmp->cg_root = thr; // Make thr the CG root
7727 // Init to thread limit stored when league primary threads were forked
7728 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7729 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7730 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7731 " cg_nthreads to 1\n",
7732 thr, tmp));
7733 tmp->up = thr->th.th_cg_roots;
7734 thr->th.th_cg_roots = tmp;
7735
7736// Launch league of teams now, but not let workers execute
7737// (they hang on fork barrier until next parallel)
7738#if INCLUDE_SSC_MARKS
7739 SSC_MARK_FORKING();
7740#endif
7741 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7742 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7743 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7744#if INCLUDE_SSC_MARKS
7745 SSC_MARK_JOINING();
7746#endif
7747 // If the team size was reduced from the limit, set it to the new size
7748 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7749 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7750 // AC: last parameter "1" eliminates join barrier which won't work because
7751 // worker threads are in a fork barrier waiting for more parallel regions
7752 __kmp_join_call(loc, gtid
7753#if OMPT_SUPPORT
7754 ,
7755 fork_context_intel
7756#endif
7757 ,
7758 1);
7759}
7760
7761int __kmp_invoke_teams_master(int gtid) {
7762 kmp_info_t *this_thr = __kmp_threads[gtid];
7763 kmp_team_t *team = this_thr->th.th_team;
7764#if KMP_DEBUG
7765 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7766 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7767 (void *)__kmp_teams_master);
7768#endif
7769 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7770#if OMPT_SUPPORT
7771 int tid = __kmp_tid_from_gtid(gtid);
7772 ompt_data_t *task_data =
7773 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7774 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7775 if (ompt_enabled.ompt_callback_implicit_task) {
7776 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7777 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7778 ompt_task_initial);
7779 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7780 }
7781#endif
7782 __kmp_teams_master(gtid);
7783#if OMPT_SUPPORT
7784 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7785#endif
7786 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7787 return 1;
7788}
7789
7790/* this sets the requested number of threads for the next parallel region
7791 encountered by this team. since this should be enclosed in the forkjoin
7792 critical section it should avoid race conditions with asymmetrical nested
7793 parallelism */
7794
7795void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7796 kmp_info_t *thr = __kmp_threads[gtid];
7797
7798 if (num_threads > 0)
7799 thr->th.th_set_nproc = num_threads;
7800}
7801
7802static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7803 int num_threads) {
7804 KMP_DEBUG_ASSERT(thr);
7805 // Remember the number of threads for inner parallel regions
7806 if (!TCR_4(__kmp_init_middle))
7807 __kmp_middle_initialize(); // get internal globals calculated
7808 __kmp_assign_root_init_mask();
7809 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7810 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7811
7812 if (num_threads == 0) {
7813 if (__kmp_teams_thread_limit > 0) {
7814 num_threads = __kmp_teams_thread_limit;
7815 } else {
7816 num_threads = __kmp_avail_proc / num_teams;
7817 }
7818 // adjust num_threads w/o warning as it is not user setting
7819 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7820 // no thread_limit clause specified - do not change thread-limit-var ICV
7821 if (num_threads > __kmp_dflt_team_nth) {
7822 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7823 }
7824 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7825 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7826 } // prevent team size to exceed thread-limit-var
7827 if (num_teams * num_threads > __kmp_teams_max_nth) {
7828 num_threads = __kmp_teams_max_nth / num_teams;
7829 }
7830 if (num_threads == 0) {
7831 num_threads = 1;
7832 }
7833 } else {
7834 if (num_threads < 0) {
7835 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7836 __kmp_msg_null);
7837 num_threads = 1;
7838 }
7839 // This thread will be the primary thread of the league primary threads
7840 // Store new thread limit; old limit is saved in th_cg_roots list
7841 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7842 // num_threads = min(num_threads, nthreads-var)
7843 if (num_threads > __kmp_dflt_team_nth) {
7844 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7845 }
7846 if (num_teams * num_threads > __kmp_teams_max_nth) {
7847 int new_threads = __kmp_teams_max_nth / num_teams;
7848 if (new_threads == 0) {
7849 new_threads = 1;
7850 }
7851 if (new_threads != num_threads) {
7852 if (!__kmp_reserve_warn) { // user asked for too many threads
7853 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7854 __kmp_msg(kmp_ms_warning,
7855 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7856 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7857 }
7858 }
7859 num_threads = new_threads;
7860 }
7861 }
7862 thr->th.th_teams_size.nth = num_threads;
7863}
7864
7865/* this sets the requested number of teams for the teams region and/or
7866 the number of threads for the next parallel region encountered */
7867void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7868 int num_threads) {
7869 kmp_info_t *thr = __kmp_threads[gtid];
7870 if (num_teams < 0) {
7871 // OpenMP specification requires requested values to be positive,
7872 // but people can send us any value, so we'd better check
7873 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7874 __kmp_msg_null);
7875 num_teams = 1;
7876 }
7877 if (num_teams == 0) {
7878 if (__kmp_nteams > 0) {
7879 num_teams = __kmp_nteams;
7880 } else {
7881 num_teams = 1; // default number of teams is 1.
7882 }
7883 }
7884 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7885 if (!__kmp_reserve_warn) {
7886 __kmp_reserve_warn = 1;
7887 __kmp_msg(kmp_ms_warning,
7888 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7889 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7890 }
7891 num_teams = __kmp_teams_max_nth;
7892 }
7893 // Set number of teams (number of threads in the outer "parallel" of the
7894 // teams)
7895 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7896
7897 __kmp_push_thread_limit(thr, num_teams, num_threads);
7898}
7899
7900/* This sets the requested number of teams for the teams region and/or
7901 the number of threads for the next parallel region encountered */
7902void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7903 int num_teams_ub, int num_threads) {
7904 kmp_info_t *thr = __kmp_threads[gtid];
7905 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7906 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7907 KMP_DEBUG_ASSERT(num_threads >= 0);
7908
7909 if (num_teams_lb > num_teams_ub) {
7910 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7911 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7912 }
7913
7914 int num_teams = 1; // defalt number of teams is 1.
7915
7916 if (num_teams_lb == 0 && num_teams_ub > 0)
7917 num_teams_lb = num_teams_ub;
7918
7919 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7920 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7921 if (num_teams > __kmp_teams_max_nth) {
7922 if (!__kmp_reserve_warn) {
7923 __kmp_reserve_warn = 1;
7924 __kmp_msg(kmp_ms_warning,
7925 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7926 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7927 }
7928 num_teams = __kmp_teams_max_nth;
7929 }
7930 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7931 num_teams = num_teams_ub;
7932 } else { // num_teams_lb <= num_teams <= num_teams_ub
7933 if (num_threads <= 0) {
7934 if (num_teams_ub > __kmp_teams_max_nth) {
7935 num_teams = num_teams_lb;
7936 } else {
7937 num_teams = num_teams_ub;
7938 }
7939 } else {
7940 num_teams = (num_threads > __kmp_teams_max_nth)
7941 ? num_teams
7942 : __kmp_teams_max_nth / num_threads;
7943 if (num_teams < num_teams_lb) {
7944 num_teams = num_teams_lb;
7945 } else if (num_teams > num_teams_ub) {
7946 num_teams = num_teams_ub;
7947 }
7948 }
7949 }
7950 // Set number of teams (number of threads in the outer "parallel" of the
7951 // teams)
7952 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7953
7954 __kmp_push_thread_limit(thr, num_teams, num_threads);
7955}
7956
7957// Set the proc_bind var to use in the following parallel region.
7958void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7959 kmp_info_t *thr = __kmp_threads[gtid];
7960 thr->th.th_set_proc_bind = proc_bind;
7961}
7962
7963/* Launch the worker threads into the microtask. */
7964
7965void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7966 kmp_info_t *this_thr = __kmp_threads[gtid];
7967
7968#ifdef KMP_DEBUG
7969 int f;
7970#endif /* KMP_DEBUG */
7971
7972 KMP_DEBUG_ASSERT(team);
7973 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7974 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7975 KMP_MB(); /* Flush all pending memory write invalidates. */
7976
7977 team->t.t_construct = 0; /* no single directives seen yet */
7978 team->t.t_ordered.dt.t_value =
7979 0; /* thread 0 enters the ordered section first */
7980
7981 /* Reset the identifiers on the dispatch buffer */
7982 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7983 if (team->t.t_max_nproc > 1) {
7984 int i;
7985 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7986 team->t.t_disp_buffer[i].buffer_index = i;
7987 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7988 }
7989 } else {
7990 team->t.t_disp_buffer[0].buffer_index = 0;
7991 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7992 }
7993
7994 KMP_MB(); /* Flush all pending memory write invalidates. */
7995 KMP_ASSERT(this_thr->th.th_team == team);
7996
7997#ifdef KMP_DEBUG
7998 for (f = 0; f < team->t.t_nproc; f++) {
7999 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8000 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8001 }
8002#endif /* KMP_DEBUG */
8003
8004 /* release the worker threads so they may begin working */
8005 __kmp_fork_barrier(gtid, 0);
8006}
8007
/* Wait in the join barrier until every thread of the team has finished the
   parallel region. Must be called by the primary thread of the team (gtid).
   With OMPT enabled, and if the thread is still marked as waiting in the
   implicit barrier afterwards, the end-of-barrier (and, for non-primary
   threads, end-of-implicit-task) callbacks are dispatched here. */
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* Join barrier after fork */

#ifdef KMP_DEBUG
  // Dump diagnostic state before the assertion below fires, so that a team
  // size mismatch can be analysed from the output.
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
  // Only act if the thread state still says "waiting in implicit barrier".
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    // The return address is only looked up for the primary thread, and only
    // when at least one of the two sync-region callbacks is registered.
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;

    // Report the end of the wait first, then the end of the barrier region.
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    // Threads other than the team's primary thread also end their implicit
    // task here.
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);
}
8069
8070/* ------------------------------------------------------------------------ */
8071
8072#ifdef USE_LOAD_BALANCE
8073
8074// Return the worker threads actively spinning in the hot team, if we
8075// are at the outermost level of parallelism. Otherwise, return 0.
8076static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8077 int i;
8078 int retval;
8079 kmp_team_t *hot_team;
8080
8081 if (root->r.r_active) {
8082 return 0;
8083 }
8084 hot_team = root->r.r_hot_team;
8085 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8086 return hot_team->t.t_nproc - 1; // Don't count primary thread
8087 }
8088
8089 // Skip the primary thread - it is accounted for elsewhere.
8090 retval = 0;
8091 for (i = 1; i < hot_team->t.t_nproc; i++) {
8092 if (hot_team->t.t_threads[i]->th.th_active) {
8093 retval++;
8094 }
8095 }
8096 return retval;
8097}
8098
8099// Perform an automatic adjustment to the number of
8100// threads used by the next parallel region.
8101static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8102 int retval;
8103 int pool_active;
8104 int hot_team_active;
8105 int team_curr_active;
8106 int system_active;
8107
8108 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8109 set_nproc));
8110 KMP_DEBUG_ASSERT(root);
8111 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8112 ->th.th_current_task->td_icvs.dynamic == TRUE);
8113 KMP_DEBUG_ASSERT(set_nproc > 1);
8114
8115 if (set_nproc == 1) {
8116 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8117 return 1;
8118 }
8119
8120 // Threads that are active in the thread pool, active in the hot team for this
8121 // particular root (if we are at the outer par level), and the currently
8122 // executing thread (to become the primary thread) are available to add to the
8123 // new team, but are currently contributing to the system load, and must be
8124 // accounted for.
8125 pool_active = __kmp_thread_pool_active_nth;
8126 hot_team_active = __kmp_active_hot_team_nproc(root);
8127 team_curr_active = pool_active + hot_team_active + 1;
8128
8129 // Check the system load.
8130 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8131 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8132 "hot team active = %d\n",
8133 system_active, pool_active, hot_team_active));
8134
8135 if (system_active < 0) {
8136 // There was an error reading the necessary info from /proc, so use the
8137 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8138 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8139 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8140 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8141
8142 // Make this call behave like the thread limit algorithm.
8143 retval = __kmp_avail_proc - __kmp_nth +
8144 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8145 if (retval > set_nproc) {
8146 retval = set_nproc;
8147 }
8148 if (retval < KMP_MIN_NTH) {
8149 retval = KMP_MIN_NTH;
8150 }
8151
8152 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8153 retval));
8154 return retval;
8155 }
8156
8157 // There is a slight delay in the load balance algorithm in detecting new
8158 // running procs. The real system load at this instant should be at least as
8159 // large as the #active omp thread that are available to add to the team.
8160 if (system_active < team_curr_active) {
8161 system_active = team_curr_active;
8162 }
8163 retval = __kmp_avail_proc - system_active + team_curr_active;
8164 if (retval > set_nproc) {
8165 retval = set_nproc;
8166 }
8167 if (retval < KMP_MIN_NTH) {
8168 retval = KMP_MIN_NTH;
8169 }
8170
8171 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8172 return retval;
8173} // __kmp_load_balance_nproc()
8174
8175#endif /* USE_LOAD_BALANCE */
8176
8177/* ------------------------------------------------------------------------ */
8178
8179/* NOTE: this is called with the __kmp_init_lock held */
8180void __kmp_cleanup(void) {
8181 int f;
8182
8183 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8184
8185 if (TCR_4(__kmp_init_parallel)) {
8186#if KMP_HANDLE_SIGNALS
8187 __kmp_remove_signals();
8188#endif
8189 TCW_4(__kmp_init_parallel, FALSE);
8190 }
8191
8192 if (TCR_4(__kmp_init_middle)) {
8193#if KMP_AFFINITY_SUPPORTED
8194 __kmp_affinity_uninitialize();
8195#endif /* KMP_AFFINITY_SUPPORTED */
8196 __kmp_cleanup_hierarchy();
8197 TCW_4(__kmp_init_middle, FALSE);
8198 }
8199
8200 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8201
8202 if (__kmp_init_serial) {
8203 __kmp_runtime_destroy();
8204 __kmp_init_serial = FALSE;
8205 }
8206
8207 __kmp_cleanup_threadprivate_caches();
8208
8209 for (f = 0; f < __kmp_threads_capacity; f++) {
8210 if (__kmp_root[f] != NULL) {
8211 __kmp_free(__kmp_root[f]);
8212 __kmp_root[f] = NULL;
8213 }
8214 }
8215 __kmp_free(__kmp_threads);
8216 // __kmp_threads and __kmp_root were allocated at once, as single block, so
8217 // there is no need in freeing __kmp_root.
8218 __kmp_threads = NULL;
8219 __kmp_root = NULL;
8220 __kmp_threads_capacity = 0;
8221
8222 // Free old __kmp_threads arrays if they exist.
8223 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8224 while (ptr) {
8225 kmp_old_threads_list_t *next = ptr->next;
8226 __kmp_free(ptr->threads);
8227 __kmp_free(ptr);
8228 ptr = next;
8229 }
8230
8231#if KMP_USE_DYNAMIC_LOCK
8232 __kmp_cleanup_indirect_user_locks();
8233#else
8234 __kmp_cleanup_user_locks();
8235#endif
8236#if OMPD_SUPPORT
8237 if (ompd_state) {
8238 __kmp_free(ompd_env_block);
8239 ompd_env_block = NULL;
8240 ompd_env_block_size = 0;
8241 }
8242#endif
8243
8244#if KMP_AFFINITY_SUPPORTED
8245 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8246 __kmp_cpuinfo_file = NULL;
8247#endif /* KMP_AFFINITY_SUPPORTED */
8248
8249#if KMP_USE_ADAPTIVE_LOCKS
8250#if KMP_DEBUG_ADAPTIVE_LOCKS
8251 __kmp_print_speculative_stats();
8252#endif
8253#endif
8254 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8255 __kmp_nested_nth.nth = NULL;
8256 __kmp_nested_nth.size = 0;
8257 __kmp_nested_nth.used = 0;
8258 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8259 __kmp_nested_proc_bind.bind_types = NULL;
8260 __kmp_nested_proc_bind.size = 0;
8261 __kmp_nested_proc_bind.used = 0;
8262 if (__kmp_affinity_format) {
8263 KMP_INTERNAL_FREE(__kmp_affinity_format);
8264 __kmp_affinity_format = NULL;
8265 }
8266
8267 __kmp_i18n_catclose();
8268
8269#if KMP_USE_HIER_SCHED
8270 __kmp_hier_scheds.deallocate();
8271#endif
8272
8273#if KMP_STATS_ENABLED
8274 __kmp_stats_fini();
8275#endif
8276
8277 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8278}
8279
8280/* ------------------------------------------------------------------------ */
8281
8282int __kmp_ignore_mppbeg(void) {
8283 char *env;
8284
8285 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8286 if (__kmp_str_match_false(env))
8287 return FALSE;
8288 }
8289 // By default __kmpc_begin() is no-op.
8290 return TRUE;
8291}
8292
8293int __kmp_ignore_mppend(void) {
8294 char *env;
8295
8296 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8297 if (__kmp_str_match_false(env))
8298 return FALSE;
8299 }
8300 // By default __kmpc_end() is no-op.
8301 return TRUE;
8302}
8303
8304void __kmp_internal_begin(void) {
8305 int gtid;
8306 kmp_root_t *root;
8307
8308 /* this is a very important step as it will register new sibling threads
8309 and assign these new uber threads a new gtid */
8310 gtid = __kmp_entry_gtid();
8311 root = __kmp_threads[gtid]->th.th_root;
8312 KMP_ASSERT(KMP_UBER_GTID(gtid));
8313
8314 if (root->r.r_begin)
8315 return;
8316 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8317 if (root->r.r_begin) {
8318 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8319 return;
8320 }
8321
8322 root->r.r_begin = TRUE;
8323
8324 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8325}
8326
8327/* ------------------------------------------------------------------------ */
8328
8329void __kmp_user_set_library(enum library_type arg) {
8330 int gtid;
8331 kmp_root_t *root;
8332 kmp_info_t *thread;
8333
8334 /* first, make sure we are initialized so we can get our gtid */
8335
8336 gtid = __kmp_entry_gtid();
8337 thread = __kmp_threads[gtid];
8338
8339 root = thread->th.th_root;
8340
8341 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8342 library_serial));
8343 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8344 thread */
8345 KMP_WARNING(SetLibraryIncorrectCall);
8346 return;
8347 }
8348
8349 switch (arg) {
8350 case library_serial:
8351 thread->th.th_set_nproc = 0;
8352 set__nproc(thread, 1);
8353 break;
8354 case library_turnaround:
8355 thread->th.th_set_nproc = 0;
8356 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8357 : __kmp_dflt_team_nth_ub);
8358 break;
8359 case library_throughput:
8360 thread->th.th_set_nproc = 0;
8361 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8362 : __kmp_dflt_team_nth_ub);
8363 break;
8364 default:
8365 KMP_FATAL(UnknownLibraryType, arg);
8366 }
8367
8368 __kmp_aux_set_library(arg);
8369}
8370
8371void __kmp_aux_set_stacksize(size_t arg) {
8372 if (!__kmp_init_serial)
8373 __kmp_serial_initialize();
8374
8375#if KMP_OS_DARWIN
8376 if (arg & (0x1000 - 1)) {
8377 arg &= ~(0x1000 - 1);
8378 if (arg + 0x1000) /* check for overflow if we round up */
8379 arg += 0x1000;
8380 }
8381#endif
8382 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8383
8384 /* only change the default stacksize before the first parallel region */
8385 if (!TCR_4(__kmp_init_parallel)) {
8386 size_t value = arg; /* argument is in bytes */
8387
8388 if (value < __kmp_sys_min_stksize)
8389 value = __kmp_sys_min_stksize;
8390 else if (value > KMP_MAX_STKSIZE)
8391 value = KMP_MAX_STKSIZE;
8392
8393 __kmp_stksize = value;
8394
8395 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8396 }
8397
8398 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8399}
8400
8401/* set the behaviour of the runtime library */
8402/* TODO this can cause some odd behaviour with sibling parallelism... */
8403void __kmp_aux_set_library(enum library_type arg) {
8404 __kmp_library = arg;
8405
8406 switch (__kmp_library) {
8407 case library_serial: {
8408 KMP_INFORM(LibraryIsSerial);
8409 } break;
8410 case library_turnaround:
8411 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8412 __kmp_use_yield = 2; // only yield when oversubscribed
8413 break;
8414 case library_throughput:
8415 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8416 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8417 break;
8418 default:
8419 KMP_FATAL(UnknownLibraryType, arg);
8420 }
8421}
8422
8423/* Getting team information common for all team API */
8424// Returns NULL if not in teams construct
8425static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8426 kmp_info_t *thr = __kmp_entry_thread();
8427 teams_serialized = 0;
8428 if (thr->th.th_teams_microtask) {
8429 kmp_team_t *team = thr->th.th_team;
8430 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8431 int ii = team->t.t_level;
8432 teams_serialized = team->t.t_serialized;
8433 int level = tlevel + 1;
8434 KMP_DEBUG_ASSERT(ii >= tlevel);
8435 while (ii > level) {
8436 for (teams_serialized = team->t.t_serialized;
8437 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8438 }
8439 if (team->t.t_serialized && (!teams_serialized)) {
8440 team = team->t.t_parent;
8441 continue;
8442 }
8443 if (ii > level) {
8444 team = team->t.t_parent;
8445 ii--;
8446 }
8447 }
8448 return team;
8449 }
8450 return NULL;
8451}
8452
8453int __kmp_aux_get_team_num() {
8454 int serialized;
8455 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8456 if (team) {
8457 if (serialized > 1) {
8458 return 0; // teams region is serialized ( 1 team of 1 thread ).
8459 } else {
8460 return team->t.t_master_tid;
8461 }
8462 }
8463 return 0;
8464}
8465
8466int __kmp_aux_get_num_teams() {
8467 int serialized;
8468 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8469 if (team) {
8470 if (serialized > 1) {
8471 return 1;
8472 } else {
8473 return team->t.t_parent->t.t_nproc;
8474 }
8475 }
8476 return 1;
8477}
8478
8479/* ------------------------------------------------------------------------ */
8480
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types (these match __kmp_affinity_format_table below):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask); only available when
 *                         KMP_AFFINITY_SUPPORTED
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
8508
// One entry of the affinity-format keyword table: the short (single
// character) name, the long name that may be written inside {} in a format
// string, and the printf-style conversion character used to print the value.
// A table of these entries represents the entire set of valid field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec e.g., L -> thread level
  const char *long_name; // from spec thread_level -> thread level
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;
8518
// Table of all recognised affinity-format field types. It is searched
// linearly by __kmp_aux_capture_affinity_field(); for long names the first
// entry whose long_name is a prefix of the input is taken.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'}, // affinity mask rendered as a string
#endif
    {'t', "team_num", 'd'}, // team number within the teams construct
    {'T', "num_teams", 'd'}, // number of teams
    {'L', "nesting_level", 'd'}, // nesting level of the current team
    {'n', "thread_num", 'd'}, // thread number within the team
    {'N', "num_threads", 'd'}, // number of threads in the team
    {'a', "ancestor_tnum", 'd'}, // thread number of the ancestor one level up
    {'H', "host", 's'}, // host name
    {'P', "process_id", 'd'}, // process id
    {'i', "native_thread_id", 'd'}}; // native thread id
8532
8533// Return the number of characters it takes to hold field
8534static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8535 const char **ptr,
8536 kmp_str_buf_t *field_buffer) {
8537 int rc, format_index, field_value;
8538 const char *width_left, *width_right;
8539 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8540 static const int FORMAT_SIZE = 20;
8541 char format[FORMAT_SIZE] = {0};
8542 char absolute_short_name = 0;
8543
8544 KMP_DEBUG_ASSERT(gtid >= 0);
8545 KMP_DEBUG_ASSERT(th);
8546 KMP_DEBUG_ASSERT(**ptr == '%');
8547 KMP_DEBUG_ASSERT(field_buffer);
8548
8549 __kmp_str_buf_clear(field_buffer);
8550
8551 // Skip the initial %
8552 (*ptr)++;
8553
8554 // Check for %% first
8555 if (**ptr == '%') {
8556 __kmp_str_buf_cat(field_buffer, "%", 1);
8557 (*ptr)++; // skip over the second %
8558 return 1;
8559 }
8560
8561 // Parse field modifiers if they are present
8562 pad_zeros = false;
8563 if (**ptr == '0') {
8564 pad_zeros = true;
8565 (*ptr)++; // skip over 0
8566 }
8567 right_justify = false;
8568 if (**ptr == '.') {
8569 right_justify = true;
8570 (*ptr)++; // skip over .
8571 }
8572 // Parse width of field: [width_left, width_right)
8573 width_left = width_right = NULL;
8574 if (**ptr >= '0' && **ptr <= '9') {
8575 width_left = *ptr;
8576 SKIP_DIGITS(*ptr);
8577 width_right = *ptr;
8578 }
8579
8580 // Create the format for KMP_SNPRINTF based on flags parsed above
8581 format_index = 0;
8582 format[format_index++] = '%';
8583 if (!right_justify)
8584 format[format_index++] = '-';
8585 if (pad_zeros)
8586 format[format_index++] = '0';
8587 if (width_left && width_right) {
8588 int i = 0;
8589 // Only allow 8 digit number widths.
8590 // This also prevents overflowing format variable
8591 while (i < 8 && width_left < width_right) {
8592 format[format_index++] = *width_left;
8593 width_left++;
8594 i++;
8595 }
8596 }
8597
8598 // Parse a name (long or short)
8599 // Canonicalize the name into absolute_short_name
8600 found_valid_name = false;
8601 parse_long_name = (**ptr == '{');
8602 if (parse_long_name)
8603 (*ptr)++; // skip initial left brace
8604 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8605 sizeof(__kmp_affinity_format_table[0]);
8606 ++i) {
8607 char short_name = __kmp_affinity_format_table[i].short_name;
8608 const char *long_name = __kmp_affinity_format_table[i].long_name;
8609 char field_format = __kmp_affinity_format_table[i].field_format;
8610 if (parse_long_name) {
8611 size_t length = KMP_STRLEN(long_name);
8612 if (strncmp(*ptr, long_name, length) == 0) {
8613 found_valid_name = true;
8614 (*ptr) += length; // skip the long name
8615 }
8616 } else if (**ptr == short_name) {
8617 found_valid_name = true;
8618 (*ptr)++; // skip the short name
8619 }
8620 if (found_valid_name) {
8621 format[format_index++] = field_format;
8622 format[format_index++] = '\0';
8623 absolute_short_name = short_name;
8624 break;
8625 }
8626 }
8627 if (parse_long_name) {
8628 if (**ptr != '}') {
8629 absolute_short_name = 0;
8630 } else {
8631 (*ptr)++; // skip over the right brace
8632 }
8633 }
8634
8635 // Attempt to fill the buffer with the requested
8636 // value using snprintf within __kmp_str_buf_print()
8637 switch (absolute_short_name) {
8638 case 't':
8639 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8640 break;
8641 case 'T':
8642 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8643 break;
8644 case 'L':
8645 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8646 break;
8647 case 'n':
8648 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8649 break;
8650 case 'H': {
8651 static const int BUFFER_SIZE = 256;
8652 char buf[BUFFER_SIZE];
8653 __kmp_expand_host_name(buf, BUFFER_SIZE);
8654 rc = __kmp_str_buf_print(field_buffer, format, buf);
8655 } break;
8656 case 'P':
8657 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8658 break;
8659 case 'i':
8660 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8661 break;
8662 case 'N':
8663 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8664 break;
8665 case 'a':
8666 field_value =
8667 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8668 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8669 break;
8670#if KMP_AFFINITY_SUPPORTED
8671 case 'A': {
8672 kmp_str_buf_t buf;
8673 __kmp_str_buf_init(&buf);
8674 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8675 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8676 __kmp_str_buf_free(&buf);
8677 } break;
8678#endif
8679 default:
8680 // According to spec, If an implementation does not have info for field
8681 // type, then "undefined" is printed
8682 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8683 // Skip the field
8684 if (parse_long_name) {
8685 SKIP_TOKEN(*ptr);
8686 if (**ptr == '}')
8687 (*ptr)++;
8688 } else {
8689 (*ptr)++;
8690 }
8691 }
8692
8693 KMP_ASSERT(format_index <= FORMAT_SIZE);
8694 return rc;
8695}
8696
8697/*
8698 * Return number of characters needed to hold the affinity string
8699 * (not including null byte character)
8700 * The resultant string is printed to buffer, which the caller can then
8701 * handle afterwards
8702 */
8703size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8704 kmp_str_buf_t *buffer) {
8705 const char *parse_ptr;
8706 size_t retval;
8707 const kmp_info_t *th;
8708 kmp_str_buf_t field;
8709
8710 KMP_DEBUG_ASSERT(buffer);
8711 KMP_DEBUG_ASSERT(gtid >= 0);
8712
8713 __kmp_str_buf_init(&field);
8714 __kmp_str_buf_clear(buffer);
8715
8716 th = __kmp_threads[gtid];
8717 retval = 0;
8718
8719 // If format is NULL or zero-length string, then we use
8720 // affinity-format-var ICV
8721 parse_ptr = format;
8722 if (parse_ptr == NULL || *parse_ptr == '\0') {
8723 parse_ptr = __kmp_affinity_format;
8724 }
8725 KMP_DEBUG_ASSERT(parse_ptr);
8726
8727 while (*parse_ptr != '\0') {
8728 // Parse a field
8729 if (*parse_ptr == '%') {
8730 // Put field in the buffer
8731 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8732 __kmp_str_buf_catbuf(buffer, &field);
8733 retval += rc;
8734 } else {
8735 // Put literal character in buffer
8736 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8737 retval++;
8738 parse_ptr++;
8739 }
8740 }
8741 __kmp_str_buf_free(&field);
8742 return retval;
8743}
8744
8745// Displays the affinity string to stdout
8746void __kmp_aux_display_affinity(int gtid, const char *format) {
8747 kmp_str_buf_t buf;
8748 __kmp_str_buf_init(&buf);
8749 __kmp_aux_capture_affinity(gtid, format, &buf);
8750 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8751 __kmp_str_buf_free(&buf);
8752}
8753
8754/* ------------------------------------------------------------------------ */
8755
8756void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8757 int blocktime = arg; /* argument is in milliseconds */
8758#if KMP_USE_MONITOR
8759 int bt_intervals;
8760#endif
8761 kmp_int8 bt_set;
8762
8763 __kmp_save_internal_controls(thread);
8764
8765 /* Normalize and set blocktime for the teams */
8766 if (blocktime < KMP_MIN_BLOCKTIME)
8767 blocktime = KMP_MIN_BLOCKTIME;
8768 else if (blocktime > KMP_MAX_BLOCKTIME)
8769 blocktime = KMP_MAX_BLOCKTIME;
8770
8771 set__blocktime_team(thread->th.th_team, tid, blocktime);
8772 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8773
8774#if KMP_USE_MONITOR
8775 /* Calculate and set blocktime intervals for the teams */
8776 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8777
8778 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8779 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8780#endif
8781
8782 /* Set whether blocktime has been set to "TRUE" */
8783 bt_set = TRUE;
8784
8785 set__bt_set_team(thread->th.th_team, tid, bt_set);
8786 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8787#if KMP_USE_MONITOR
8788 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8789 "bt_intervals=%d, monitor_updates=%d\n",
8790 __kmp_gtid_from_tid(tid, thread->th.th_team),
8791 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8792 __kmp_monitor_wakeups));
8793#else
8794 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8795 __kmp_gtid_from_tid(tid, thread->th.th_team),
8796 thread->th.th_team->t.t_id, tid, blocktime));
8797#endif
8798}
8799
8800void __kmp_aux_set_defaults(char const *str, size_t len) {
8801 if (!__kmp_init_serial) {
8802 __kmp_serial_initialize();
8803 }
8804 __kmp_env_initialize(str);
8805
8806 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8807 __kmp_env_print();
8808 }
8809} // __kmp_aux_set_defaults
8810
8811/* ------------------------------------------------------------------------ */
8812/* internal fast reduction routines */
8813
8814PACKED_REDUCTION_METHOD_T
8815__kmp_determine_reduction_method(
8816 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8817 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8818 kmp_critical_name *lck) {
8819
8820 // Default reduction method: critical construct ( lck != NULL, like in current
8821 // PAROPT )
8822 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8823 // can be selected by RTL
8824 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8825 // can be selected by RTL
8826 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8827 // among generated by PAROPT.
8828
8829 PACKED_REDUCTION_METHOD_T retval;
8830
8831 int team_size;
8832
8833 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8834
8835#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8836 (loc && \
8837 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8838#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8839
8840 retval = critical_reduce_block;
8841
8842 // another choice of getting a team size (with 1 dynamic deference) is slower
8843 team_size = __kmp_get_team_num_threads(global_tid);
8844 if (team_size == 1) {
8845
8846 retval = empty_reduce_block;
8847
8848 } else {
8849
8850 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8851
8852#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8853 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8854
8855#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8856 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8857
8858 int teamsize_cutoff = 4;
8859
8860#if KMP_MIC_SUPPORTED
8861 if (__kmp_mic_type != non_mic) {
8862 teamsize_cutoff = 8;
8863 }
8864#endif
8865 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8866 if (tree_available) {
8867 if (team_size <= teamsize_cutoff) {
8868 if (atomic_available) {
8869 retval = atomic_reduce_block;
8870 }
8871 } else {
8872 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8873 }
8874 } else if (atomic_available) {
8875 retval = atomic_reduce_block;
8876 }
8877#else
8878#error "Unknown or unsupported OS"
8879#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8880 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8881
8882#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8883
8884#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8885
8886 // basic tuning
8887
8888 if (atomic_available) {
8889 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8890 retval = atomic_reduce_block;
8891 }
8892 } // otherwise: use critical section
8893
8894#elif KMP_OS_DARWIN
8895
8896 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8897 if (atomic_available && (num_vars <= 3)) {
8898 retval = atomic_reduce_block;
8899 } else if (tree_available) {
8900 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8901 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8902 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8903 }
8904 } // otherwise: use critical section
8905
8906#else
8907#error "Unknown or unsupported OS"
8908#endif
8909
8910#else
8911#error "Unknown or unsupported architecture"
8912#endif
8913 }
8914
8915 // KMP_FORCE_REDUCTION
8916
8917 // If the team is serialized (team_size == 1), ignore the forced reduction
8918 // method and stay with the unsynchronized method (empty_reduce_block)
8919 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8920 team_size != 1) {
8921
8922 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8923
8924 int atomic_available, tree_available;
8925
8926 switch ((forced_retval = __kmp_force_reduction_method)) {
8927 case critical_reduce_block:
8928 KMP_ASSERT(lck); // lck should be != 0
8929 break;
8930
8931 case atomic_reduce_block:
8932 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8933 if (!atomic_available) {
8934 KMP_WARNING(RedMethodNotSupported, "atomic");
8935 forced_retval = critical_reduce_block;
8936 }
8937 break;
8938
8939 case tree_reduce_block:
8940 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8941 if (!tree_available) {
8942 KMP_WARNING(RedMethodNotSupported, "tree");
8943 forced_retval = critical_reduce_block;
8944 } else {
8945#if KMP_FAST_REDUCTION_BARRIER
8946 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8947#endif
8948 }
8949 break;
8950
8951 default:
8952 KMP_ASSERT(0); // "unsupported method specified"
8953 }
8954
8955 retval = forced_retval;
8956 }
8957
8958 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8959
8960#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8961#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8962
8963 return (retval);
8964}
8965// this function is for testing set/get/determine reduce method
8966kmp_int32 __kmp_get_reduce_method(void) {
8967 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8968}
8969
8970// Soft pause sets up threads to ignore blocktime and just go to sleep.
8971// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8972void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8973
8974// Hard pause shuts down the runtime completely. Resume happens naturally when
8975// OpenMP is used subsequently.
8976void __kmp_hard_pause() {
8977 __kmp_pause_status = kmp_hard_paused;
8978 __kmp_internal_end_thread(-1);
8979}
8980
8981// Soft resume sets __kmp_pause_status, and wakes up all threads.
8982void __kmp_resume_if_soft_paused() {
8983 if (__kmp_pause_status == kmp_soft_paused) {
8984 __kmp_pause_status = kmp_not_paused;
8985
8986 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8987 kmp_info_t *thread = __kmp_threads[gtid];
8988 if (thread) { // Wake it if sleeping
8989 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8990 thread);
8991 if (fl.is_sleeping())
8992 fl.resume(gtid);
8993 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8994 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8995 } else { // thread holds the lock and may sleep soon
8996 do { // until either the thread sleeps, or we can get the lock
8997 if (fl.is_sleeping()) {
8998 fl.resume(gtid);
8999 break;
9000 } else if (__kmp_try_suspend_mx(thread)) {
9001 __kmp_unlock_suspend_mx(thread);
9002 break;
9003 }
9004 } while (1);
9005 }
9006 }
9007 }
9008 }
9009}
9010
9011// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9012// TODO: add warning messages
9013int __kmp_pause_resource(kmp_pause_status_t level) {
9014 if (level == kmp_not_paused) { // requesting resume
9015 if (__kmp_pause_status == kmp_not_paused) {
9016 // error message about runtime not being paused, so can't resume
9017 return 1;
9018 } else {
9019 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9020 __kmp_pause_status == kmp_hard_paused);
9021 __kmp_pause_status = kmp_not_paused;
9022 return 0;
9023 }
9024 } else if (level == kmp_soft_paused) { // requesting soft pause
9025 if (__kmp_pause_status != kmp_not_paused) {
9026 // error message about already being paused
9027 return 1;
9028 } else {
9029 __kmp_soft_pause();
9030 return 0;
9031 }
9032 } else if (level == kmp_hard_paused) { // requesting hard pause
9033 if (__kmp_pause_status != kmp_not_paused) {
9034 // error message about already being paused
9035 return 1;
9036 } else {
9037 __kmp_hard_pause();
9038 return 0;
9039 }
9040 } else {
9041 // error message about invalid level
9042 return 1;
9043 }
9044}
9045
9046void __kmp_omp_display_env(int verbose) {
9047 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9048 if (__kmp_init_serial == 0)
9049 __kmp_do_serial_initialize();
9050 __kmp_display_env_impl(!verbose, verbose);
9051 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9052}
9053
9054// The team size is changing, so distributed barrier must be modified
9055void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9056 int new_nthreads) {
9057 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9058 bp_dist_bar);
9059 kmp_info_t **other_threads = team->t.t_threads;
9060
9061 // We want all the workers to stop waiting on the barrier while we adjust the
9062 // size of the team.
9063 for (int f = 1; f < old_nthreads; ++f) {
9064 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9065 // Ignore threads that are already inactive or not present in the team
9066 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9067 // teams construct causes thread_limit to get passed in, and some of
9068 // those could be inactive; just ignore them
9069 continue;
9070 }
9071 // If thread is transitioning still to in_use state, wait for it
9072 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9073 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9074 KMP_CPU_PAUSE();
9075 }
9076 // The thread should be in_use now
9077 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9078 // Transition to unused state
9079 team->t.t_threads[f]->th.th_used_in_team.store(2);
9080 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9081 }
9082 // Release all the workers
9083 team->t.b->go_release();
9084
9085 KMP_MFENCE();
9086
9087 // Workers should see transition status 2 and move to 0; but may need to be
9088 // woken up first
9089 int count = old_nthreads - 1;
9090 while (count > 0) {
9091 count = old_nthreads - 1;
9092 for (int f = 1; f < old_nthreads; ++f) {
9093 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9094 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9095 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9096 void *, other_threads[f]->th.th_sleep_loc);
9097 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9098 }
9099 } else {
9100 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9101 count--;
9102 }
9103 }
9104 }
9105 // Now update the barrier size
9106 team->t.b->update_num_threads(new_nthreads);
9107 team->t.b->go_reset();
9108}
9109
9110void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9111 // Add the threads back to the team
9112 KMP_DEBUG_ASSERT(team);
9113 // Threads were paused and pointed at th_used_in_team temporarily during a
9114 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9115 // the thread that it should transition itself back into the team. Then, if
9116 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9117 // to wake it up.
9118 for (int f = 1; f < new_nthreads; ++f) {
9119 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9120 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9121 3);
9122 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9123 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9124 (kmp_flag_32<false, false> *)NULL);
9125 }
9126 }
9127 // The threads should be transitioning to the team; when they are done, they
9128 // should have set th_used_in_team to 1. This loop forces master to wait until
9129 // all threads have moved into the team and are waiting in the barrier.
9130 int count = new_nthreads - 1;
9131 while (count > 0) {
9132 count = new_nthreads - 1;
9133 for (int f = 1; f < new_nthreads; ++f) {
9134 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9135 count--;
9136 }
9137 }
9138 }
9139}
9140
9141// Globals and functions for hidden helper task
9142kmp_info_t **__kmp_hidden_helper_threads;
9143kmp_info_t *__kmp_hidden_helper_main_thread;
9144std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9145#if KMP_OS_LINUX
9146kmp_int32 __kmp_hidden_helper_threads_num = 8;
9147kmp_int32 __kmp_enable_hidden_helper = TRUE;
9148#else
9149kmp_int32 __kmp_hidden_helper_threads_num = 0;
9150kmp_int32 __kmp_enable_hidden_helper = FALSE;
9151#endif
9152
9153namespace {
9154std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9155
9156void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9157 // This is an explicit synchronization on all hidden helper threads in case
9158 // that when a regular thread pushes a hidden helper task to one hidden
9159 // helper thread, the thread has not been awaken once since they're released
9160 // by the main thread after creating the team.
9161 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9162 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9163 __kmp_hidden_helper_threads_num)
9164 ;
9165
9166 // If main thread, then wait for signal
9167 if (__kmpc_master(nullptr, *gtid)) {
9168 // First, unset the initial state and release the initial thread
9169 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9170 __kmp_hidden_helper_initz_release();
9171 __kmp_hidden_helper_main_thread_wait();
9172 // Now wake up all worker threads
9173 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9174 __kmp_hidden_helper_worker_thread_signal();
9175 }
9176 }
9177}
9178} // namespace
9179
9180void __kmp_hidden_helper_threads_initz_routine() {
9181 // Create a new root for hidden helper team/threads
9182 const int gtid = __kmp_register_root(TRUE);
9183 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9184 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9185 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9186 __kmp_hidden_helper_threads_num;
9187
9188 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9189
9190 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9191
9192 // Set the initialization flag to FALSE
9193 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9194
9195 __kmp_hidden_helper_threads_deinitz_release();
9196}
9197
9198/* Nesting Mode:
9199 Set via KMP_NESTING_MODE, which takes an integer.
9200 Note: we skip duplicate topology levels, and skip levels with only
9201 one entity.
9202 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9203 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9204 in the topology, and initializes the number of threads at each of those
9205 levels to the number of entities at each level, respectively, below the
9206 entity at the parent level.
9207 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9208 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9209 the user to turn nesting on explicitly. This is an even more experimental
9210 option to this experimental feature, and may change or go away in the
9211 future.
9212*/
9213
9214// Allocate space to store nesting levels
9215void __kmp_init_nesting_mode() {
9216 int levels = KMP_HW_LAST;
9217 __kmp_nesting_mode_nlevels = levels;
9218 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9219 for (int i = 0; i < levels; ++i)
9220 __kmp_nesting_nth_level[i] = 0;
9221 if (__kmp_nested_nth.size < levels) {
9222 __kmp_nested_nth.nth =
9223 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9224 __kmp_nested_nth.size = levels;
9225 }
9226}
9227
9228// Set # threads for top levels of nesting; must be called after topology set
9229void __kmp_set_nesting_mode_threads() {
9230 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9231
9232 if (__kmp_nesting_mode == 1)
9233 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9234 else if (__kmp_nesting_mode > 1)
9235 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9236
9237 if (__kmp_topology) { // use topology info
9238 int loc, hw_level;
9239 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9240 loc < __kmp_nesting_mode_nlevels;
9241 loc++, hw_level++) {
9242 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9243 if (__kmp_nesting_nth_level[loc] == 1)
9244 loc--;
9245 }
9246 // Make sure all cores are used
9247 if (__kmp_nesting_mode > 1 && loc > 1) {
9248 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9249 int num_cores = __kmp_topology->get_count(core_level);
9250 int upper_levels = 1;
9251 for (int level = 0; level < loc - 1; ++level)
9252 upper_levels *= __kmp_nesting_nth_level[level];
9253 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9254 __kmp_nesting_nth_level[loc - 1] =
9255 num_cores / __kmp_nesting_nth_level[loc - 2];
9256 }
9257 __kmp_nesting_mode_nlevels = loc;
9258 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9259 } else { // no topology info available; provide a reasonable guesstimation
9260 if (__kmp_avail_proc >= 4) {
9261 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9262 __kmp_nesting_nth_level[1] = 2;
9263 __kmp_nesting_mode_nlevels = 2;
9264 } else {
9265 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9266 __kmp_nesting_mode_nlevels = 1;
9267 }
9268 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9269 }
9270 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9271 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9272 }
9273 set__nproc(thread, __kmp_nesting_nth_level[0]);
9274 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9275 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9276 if (get__max_active_levels(thread) > 1) {
9277 // if max levels was set, set nesting mode levels to same
9278 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9279 }
9280 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9281 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9282}
9283
9284// Empty symbols to export (see exports_so.txt) when feature is disabled
9285extern "C" {
9286#if !KMP_STATS_ENABLED
9287void __kmp_reset_stats() {}
9288#endif
9289#if !USE_DEBUGGER
9290int __kmp_omp_debug_struct_info = FALSE;
9291int __kmp_debugging = FALSE;
9292#endif
9293#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9294void __kmp_itt_fini_ittlib() {}
9295void __kmp_itt_init_ittlib() {}
9296#endif
9297}
9298
9299// end of file
@ KMP_IDENT_AUTOPAR
Definition: kmp.h:199
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs, kmpc_micro microtask,...)
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
#define KMP_INIT_PARTITIONED_TIMERS(name)
Initializes the partitioned timers to begin with name.
Definition: kmp_stats.h:940
#define KMP_COUNT_VALUE(name, value)
Adds value to specified timer (name).
Definition: kmp_stats.h:898
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
sched_type
Definition: kmp.h:357
KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid)
@ kmp_sch_auto
Definition: kmp.h:364
@ kmp_sch_static
Definition: kmp.h:360
@ kmp_sch_guided_chunked
Definition: kmp.h:362
Definition: kmp.h:234
kmp_int32 flags
Definition: kmp.h:236