1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51// Windows does not need these include files as it doesn't use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113
114/* Calculate the identifier of the current thread */
115/* fast (and somewhat portable) way to get unique identifier of executing
116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
124
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
129
130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133 __kmp_init_gtid for this to work. */
134
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
137
138#ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
142 }
143#endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
147 }
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
152
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
163
164 for (i = 0; i < __kmp_threads_capacity; i++) {
165
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
169
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172
173 /* stack grows down -- search through all of the active threads */
174
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
177
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182 return i;
183 }
184 }
185 }
186
187 /* get specific to try and determine our gtid */
188 KA_TRACE(1000,
189 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190 "thread, using TLS\n"));
191 i = __kmp_gtid_get_specific();
192
193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194
195 /* if we haven't been assigned a gtid, then return that code */
196 if (i < 0)
197 return i;
198
199 /* dynamically updated stack window for uber threads to avoid get_specific
200 call */
201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202 KMP_FATAL(StackOverflow, i);
203 }
204
205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206 if (stack_addr > stack_base) {
207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210 stack_base);
211 } else {
212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213 stack_base - stack_addr);
214 }
215
216 /* Reprint stack bounds for ubermaster since they have been refined */
217 if (__kmp_storage_map) {
218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221 other_threads[i]->th.th_info.ds.ds_stacksize,
222 "th_%d stack (refinement)", i);
223 }
224 return i;
225}
226
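/* Illustrative sketch (not part of the runtime): the loop above identifies the
   executing thread by asking whether the address of a local variable falls
   inside the stack window recorded for thread i. Assuming the stack grows down
   (as the code does), the predicate in isolation looks like the helper below;
   the name addr_in_stack_window and its signature are invented for this
   example. */
#if 0
#include <stddef.h>

static int addr_in_stack_window(const char *addr, const char *stack_base,
                                size_t stack_size) {
  // Live automatics sit at or below the recorded base, within stack_size
  // bytes of it, i.e. inside [stack_base - stack_size, stack_base].
  return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
}
#endif // illustrative sketch
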
227int __kmp_get_global_thread_id_reg() {
228 int gtid;
229
230 if (!__kmp_init_serial) {
231 gtid = KMP_GTID_DNE;
232 } else
233#ifdef KMP_TDATA_GTID
234 if (TCR_4(__kmp_gtid_mode) >= 3) {
235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236 gtid = __kmp_gtid;
237 } else
238#endif
239 if (TCR_4(__kmp_gtid_mode) >= 2) {
240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241 gtid = __kmp_gtid_get_specific();
242 } else {
243 KA_TRACE(1000,
244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245 gtid = __kmp_get_global_thread_id();
246 }
247
248 /* we must be a new uber master sibling thread */
249 if (gtid == KMP_GTID_DNE) {
250 KA_TRACE(10,
251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252 "Registering a new gtid.\n"));
253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254 if (!__kmp_init_serial) {
255 __kmp_do_serial_initialize();
256 gtid = __kmp_gtid_get_specific();
257 } else {
258 gtid = __kmp_register_root(FALSE);
259 }
260 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262 }
263
264 KMP_DEBUG_ASSERT(gtid >= 0);
265
266 return gtid;
267}
268
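/* Illustrative usage sketch (not part of the runtime): a thread created
   outside the runtime takes the KMP_GTID_DNE path above the first time it
   enters OpenMP and is registered as a new root (or triggers serial
   initialization if that has not happened yet). The names foreign_thread and
   main below are invented for this example. */
#if 0
#include <omp.h>
#include <pthread.h>

static void *foreign_thread(void *arg) {
  (void)arg;
  // Entering a parallel region from a non-OpenMP thread causes
  // __kmp_get_global_thread_id_reg() to register this thread as a root.
  #pragma omp parallel num_threads(2)
  {
  }
  return NULL;
}

int main(void) {
  pthread_t t;
  pthread_create(&t, NULL, foreign_thread, NULL);
  pthread_join(t, NULL);
  return 0;
}
#endif // illustrative sketch
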
269/* caller must hold forkjoin_lock */
270void __kmp_check_stack_overlap(kmp_info_t *th) {
271 int f;
272 char *stack_beg = NULL;
273 char *stack_end = NULL;
274 int gtid;
275
276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277 if (__kmp_storage_map) {
278 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280
281 gtid = __kmp_gtid_from_thread(th);
282
283 if (gtid == KMP_GTID_MONITOR) {
284 __kmp_print_storage_map_gtid(
285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286 "th_%s stack (%s)", "mon",
287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288 } else {
289 __kmp_print_storage_map_gtid(
290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291 "th_%d stack (%s)", gtid,
292 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293 }
294 }
295
296 /* No point in checking ubermaster threads since they use refinement and
297 * cannot overlap */
298 gtid = __kmp_gtid_from_thread(th);
299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300 KA_TRACE(10,
301 ("__kmp_check_stack_overlap: performing extensive checking\n"));
302 if (stack_beg == NULL) {
303 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305 }
306
307 for (f = 0; f < __kmp_threads_capacity; f++) {
308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309
310 if (f_th && f_th != th) {
311 char *other_stack_end =
312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313 char *other_stack_beg =
314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317
318 /* Print the other stack values before the abort */
319 if (__kmp_storage_map)
320 __kmp_print_storage_map_gtid(
321 -1, other_stack_beg, other_stack_end,
322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324
325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326 __kmp_msg_null);
327 }
328 }
329 }
330 }
331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332}
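
/* Illustrative sketch (not part of the runtime): the classic overlap test for
   two address ranges [b1, e1) and [b2, e2) is shown below; the loop above
   applies an endpoint-based variant of the same idea to per-thread stack
   windows. The helper name ranges_overlap is invented for this example. */
#if 0
static int ranges_overlap(const char *b1, const char *e1, const char *b2,
                          const char *e2) {
  // Two half-open ranges intersect iff each one starts before the other ends.
  return b1 < e2 && b2 < e1;
}
#endif // illustrative sketch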
333
334/* ------------------------------------------------------------------------ */
335
336void __kmp_infinite_loop(void) {
337 static int done = FALSE;
338
339 while (!done) {
340 KMP_YIELD(TRUE);
341 }
342}
343
344#define MAX_MESSAGE 512
345
346void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347 char const *format, ...) {
348 char buffer[MAX_MESSAGE];
349 va_list ap;
350
351 va_start(ap, format);
352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353 p2, (unsigned long)size, format);
354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355 __kmp_vprintf(kmp_err, buffer, ap);
356#if KMP_PRINT_DATA_PLACEMENT
357 int node;
358 if (gtid >= 0) {
359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360 if (__kmp_storage_map_verbose) {
361 node = __kmp_get_host_node(p1);
362 if (node < 0) /* doesn't work, so don't try this next time */
363 __kmp_storage_map_verbose = FALSE;
364 else {
365 char *last;
366 int lastNode;
367 int localProc = __kmp_get_cpu_from_gtid(gtid);
368
369 const int page_size = KMP_GET_PAGE_SIZE();
370
371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373 if (localProc >= 0)
374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375 localProc >> 1);
376 else
377 __kmp_printf_no_lock(" GTID %d\n", gtid);
378#if KMP_USE_PRCTL
379 /* The more elaborate format is disabled for now because of the prctl
380 * hanging bug. */
381 do {
382 last = p1;
383 lastNode = node;
384 /* This loop collates adjacent pages with the same host node. */
385 do {
386 (char *)p1 += page_size;
387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389 lastNode);
390 } while (p1 <= p2);
391#else
392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393 (char *)p1 + (page_size - 1),
394 __kmp_get_host_node(p1));
395 if (p1 < p2) {
396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397 (char *)p2 + (page_size - 1),
398 __kmp_get_host_node(p2));
399 }
400#endif
401 }
402 }
403 } else
404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405 }
406#endif /* KMP_PRINT_DATA_PLACEMENT */
407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408}
409
410void __kmp_warn(char const *format, ...) {
411 char buffer[MAX_MESSAGE];
412 va_list ap;
413
414 if (__kmp_generate_warnings == kmp_warnings_off) {
415 return;
416 }
417
418 va_start(ap, format);
419
420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422 __kmp_vprintf(kmp_err, buffer, ap);
423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424
425 va_end(ap);
426}
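
/* Illustrative sketch (not part of the runtime): __kmp_warn() above first
   embeds the caller's format string into a prefixed format, then forwards the
   variadic arguments with a v-style printing call. A minimal stand-alone
   version of the same pattern, using plain stdio instead of the runtime's
   locked printing helpers, might look like this (example_warn is an invented
   name): */
#if 0
#include <stdarg.h>
#include <stdio.h>

static void example_warn(const char *format, ...) {
  char buffer[512];
  va_list ap;
  va_start(ap, format);
  // Wrap the user-supplied format, then let vfprintf expand its arguments.
  snprintf(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  vfprintf(stderr, buffer, ap);
  va_end(ap);
}
#endif // illustrative sketch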
427
428void __kmp_abort_process() {
429 // Later threads may stall here, but that's ok because abort() will kill them.
430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431
432 if (__kmp_debug_buf) {
433 __kmp_dump_debug_buffer();
434 }
435
436 if (KMP_OS_WINDOWS) {
437 // Let other threads know of abnormal termination and prevent deadlock
438 // if abort happened during library initialization or shutdown
439 __kmp_global.g.g_abort = SIGABRT;
440
441 /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443 boxes. _set_abort_behavior() works well, but this function is not
444 available in VS7 (this is not a problem for the DLL, but it is a problem for
445 the static OpenMP RTL). SetErrorMode (and so the timelimit utility) does not
446 help, at least in some versions of the MS C RTL.
447
448 It seems the following sequence is the only way to simulate abort() and
449 avoid the pop-up error box. */
450 raise(SIGABRT);
451 _exit(3); // Just in case, if signal ignored, exit anyway.
452 } else {
453 __kmp_unregister_library();
454 abort();
455 }
456
457 __kmp_infinite_loop();
458 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459
460} // __kmp_abort_process
461
462void __kmp_abort_thread(void) {
463 // TODO: Eliminate g_abort global variable and this function.
464 // In case of abort just call abort(), it will kill all the threads.
465 __kmp_infinite_loop();
466} // __kmp_abort_thread
467
468/* Print out the storage map for the major kmp_info_t thread data structures
469 that are allocated together. */
470
471static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473 gtid);
474
475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477
478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479 sizeof(kmp_local_t), "th_%d.th_local", gtid);
480
481 __kmp_print_storage_map_gtid(
482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484
485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486 &thr->th.th_bar[bs_plain_barrier + 1],
487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488 gtid);
489
490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491 &thr->th.th_bar[bs_forkjoin_barrier + 1],
492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493 gtid);
494
495#if KMP_FAST_REDUCTION_BARRIER
496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497 &thr->th.th_bar[bs_reduction_barrier + 1],
498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499 gtid);
500#endif // KMP_FAST_REDUCTION_BARRIER
501}
502
503/* Print out the storage map for the major kmp_team_t team data structures
504 that are allocated together. */
505
506static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507 int team_id, int num_thr) {
508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510 header, team_id);
511
512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513 &team->t.t_bar[bs_last_barrier],
514 sizeof(kmp_balign_team_t) * bs_last_barrier,
515 "%s_%d.t_bar", header, team_id);
516
517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518 &team->t.t_bar[bs_plain_barrier + 1],
519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520 header, team_id);
521
522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523 &team->t.t_bar[bs_forkjoin_barrier + 1],
524 sizeof(kmp_balign_team_t),
525 "%s_%d.t_bar[forkjoin]", header, team_id);
526
527#if KMP_FAST_REDUCTION_BARRIER
528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529 &team->t.t_bar[bs_reduction_barrier + 1],
530 sizeof(kmp_balign_team_t),
531 "%s_%d.t_bar[reduction]", header, team_id);
532#endif // KMP_FAST_REDUCTION_BARRIER
533
534 __kmp_print_storage_map_gtid(
535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537
538 __kmp_print_storage_map_gtid(
539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541
542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543 &team->t.t_disp_buffer[num_disp_buff],
544 sizeof(dispatch_shared_info_t) * num_disp_buff,
545 "%s_%d.t_disp_buffer", header, team_id);
546}
547
548static void __kmp_init_allocator() {
549 __kmp_init_memkind();
550 __kmp_init_target_mem();
551}
552static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553
554/* ------------------------------------------------------------------------ */
555
556#if KMP_DYNAMIC_LIB
557#if KMP_OS_WINDOWS
558
559BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561
562 switch (fdwReason) {
563
564 case DLL_PROCESS_ATTACH:
565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566
567 return TRUE;
568
569 case DLL_PROCESS_DETACH:
570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571
572 // According to Windows* documentation for DllMain entry point:
573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574 // lpReserved == NULL when FreeLibrary() is called,
575 // lpReserved != NULL when the process is terminated.
576 // When FreeLibrary() is called, worker threads remain alive. So the
577 // runtime's state is consistent and executing proper shutdown is OK.
578 // When the process is terminated, worker threads have exited or been
579 // forcefully terminated by the OS and only the shutdown thread remains.
580 // This can leave the runtime in an inconsistent state.
581 // Hence, only attempt proper cleanup when FreeLibrary() is called.
582 // Otherwise, rely on OS to reclaim resources.
583 if (lpReserved == NULL)
584 __kmp_internal_end_library(__kmp_gtid_get_specific());
585
586 return TRUE;
587
588 case DLL_THREAD_ATTACH:
589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590
591 /* if we wanted to register new sibling threads every time, we would call
592 * __kmp_get_gtid() here */
593 return TRUE;
594
595 case DLL_THREAD_DETACH:
596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597
598 __kmp_internal_end_thread(__kmp_gtid_get_specific());
599 return TRUE;
600 }
601
602 return TRUE;
603}
604
605#endif /* KMP_OS_WINDOWS */
606#endif /* KMP_DYNAMIC_LIB */
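
/* Illustrative sketch (not part of the runtime): the DLL_PROCESS_DETACH logic
   above relies on documented DllMain semantics: lpReserved is NULL when the
   DLL is unloaded via FreeLibrary() and non-NULL when the process terminates.
   A hypothetical host application exercising the FreeLibrary() path (the one
   that runs the full __kmp_internal_end_library() shutdown) could look like
   this; the DLL name is illustrative only. */
#if 0
#include <windows.h>

int main(void) {
  HMODULE omp_rtl = LoadLibraryA("libomp.dll");
  if (omp_rtl != NULL) {
    // ... resolve and call OpenMP entry points via GetProcAddress() ...
    FreeLibrary(omp_rtl); // DllMain sees DLL_PROCESS_DETACH, lpReserved == NULL
  }
  return 0; // normal process exit: DllMain sees lpReserved != NULL instead
}
#endif // illustrative sketch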
607
608/* __kmp_parallel_deo -- Wait until it's our turn. */
609void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610 int gtid = *gtid_ref;
611#ifdef BUILD_PARALLEL_ORDERED
612 kmp_team_t *team = __kmp_team_from_gtid(gtid);
613#endif /* BUILD_PARALLEL_ORDERED */
614
615 if (__kmp_env_consistency_check) {
616 if (__kmp_threads[gtid]->th.th_root->r.r_active)
617#if KMP_USE_DYNAMIC_LOCK
618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619#else
620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621#endif
622 }
623#ifdef BUILD_PARALLEL_ORDERED
624 if (!team->t.t_serialized) {
625 KMP_MB();
626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627 NULL);
628 KMP_MB();
629 }
630#endif /* BUILD_PARALLEL_ORDERED */
631}
632
633/* __kmp_parallel_dxo -- Signal the next task. */
634void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635 int gtid = *gtid_ref;
636#ifdef BUILD_PARALLEL_ORDERED
637 int tid = __kmp_tid_from_gtid(gtid);
638 kmp_team_t *team = __kmp_team_from_gtid(gtid);
639#endif /* BUILD_PARALLEL_ORDERED */
640
641 if (__kmp_env_consistency_check) {
642 if (__kmp_threads[gtid]->th.th_root->r.r_active)
643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644 }
645#ifdef BUILD_PARALLEL_ORDERED
646 if (!team->t.t_serialized) {
647 KMP_MB(); /* Flush all pending memory write invalidates. */
648
649 /* use the tid of the next thread in this team */
650 /* TODO replace with general release procedure */
651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652
653 KMP_MB(); /* Flush all pending memory write invalidates. */
654 }
655#endif /* BUILD_PARALLEL_ORDERED */
656}
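
/* Illustrative sketch (not part of the runtime): __kmp_parallel_deo/_dxo above
   implement a simple turn counter for "ordered" execution: each thread waits
   until the team's shared value equals its own tid, then hands the turn to
   (tid + 1) % nproc. The same shape with C++11 atomics (names invented for
   this example; the runtime uses KMP_WAIT and its own memory barriers): */
#if 0
#include <atomic>
#include <thread>

static std::atomic<int> turn{0}; // analogous to team->t.t_ordered.dt.t_value

static void ordered_enter(int tid) { // ~ __kmp_parallel_deo
  while (turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield();
}

static void ordered_exit(int tid, int nproc) { // ~ __kmp_parallel_dxo
  turn.store((tid + 1) % nproc, std::memory_order_release);
}
#endif // illustrative sketch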
657
658/* ------------------------------------------------------------------------ */
659/* The BARRIER for a SINGLE process section is always explicit */
660
661int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662 int status;
663 kmp_info_t *th;
664 kmp_team_t *team;
665
666 if (!TCR_4(__kmp_init_parallel))
667 __kmp_parallel_initialize();
668 __kmp_resume_if_soft_paused();
669
670 th = __kmp_threads[gtid];
671 team = th->th.th_team;
672 status = 0;
673
674 th->th.th_ident = id_ref;
675
676 if (team->t.t_serialized) {
677 status = 1;
678 } else {
679 kmp_int32 old_this = th->th.th_local.this_construct;
680
681 ++th->th.th_local.this_construct;
682 /* try to set team count to thread count--success means thread got the
683 single block */
684 /* TODO: Should this be acquire or release? */
685 if (team->t.t_construct == old_this) {
686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687 th->th.th_local.this_construct);
688 }
689#if USE_ITT_BUILD
690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692 team->t.t_active_level == 1) {
693 // Only report metadata by primary thread of active team at level 1
694 __kmp_itt_metadata_single(id_ref);
695 }
696#endif /* USE_ITT_BUILD */
697 }
698
699 if (__kmp_env_consistency_check) {
700 if (status && push_ws) {
701 __kmp_push_workshare(gtid, ct_psingle, id_ref);
702 } else {
703 __kmp_check_workshare(gtid, ct_psingle, id_ref);
704 }
705 }
706#if USE_ITT_BUILD
707 if (status) {
708 __kmp_itt_single_start(gtid);
709 }
710#endif /* USE_ITT_BUILD */
711 return status;
712}
713
714void __kmp_exit_single(int gtid) {
715#if USE_ITT_BUILD
716 __kmp_itt_single_end(gtid);
717#endif /* USE_ITT_BUILD */
718 if (__kmp_env_consistency_check)
719 __kmp_pop_workshare(gtid, ct_psingle, NULL);
720}
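
/* Illustrative sketch (not part of the runtime): in __kmp_enter_single() above
   every thread advances its private this_construct counter, and the first
   thread that successfully advances the shared team counter from the old value
   wins the single block. The same shape with a C++11 compare-exchange (names
   invented; the runtime uses __kmp_atomic_compare_store_acq): */
#if 0
#include <atomic>

static std::atomic<int> team_construct{0}; // analogous to team->t.t_construct

// my_construct plays the role of th->th.th_local.this_construct.
static bool try_enter_single(int &my_construct) {
  int expected = my_construct++;
  // Only the first thread to move the team counter forward executes the block.
  return team_construct.compare_exchange_strong(expected, my_construct,
                                                std::memory_order_acquire);
}
#endif // illustrative sketch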
721
722/* Determine whether we can go parallel or must use a serialized parallel
723 * region, and how many threads we can use.
724 * set_nthreads is the number of threads requested for the team.
725 * Returns 1 if we should serialize or use only one thread,
726 * otherwise the number of threads to use.
727 * The forkjoin lock is held by the caller. */
728static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729 int master_tid, int set_nthreads,
730 int enter_teams) {
731 int capacity;
732 int new_nthreads;
733 KMP_DEBUG_ASSERT(__kmp_init_serial);
734 KMP_DEBUG_ASSERT(root && parent_team);
735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736
737 // If dyn-var is set, dynamically adjust the number of desired threads,
738 // according to the method specified by dynamic_mode.
739 new_nthreads = set_nthreads;
740 if (!get__dynamic_2(parent_team, master_tid)) {
741 ;
742 }
743#ifdef USE_LOAD_BALANCE
744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746 if (new_nthreads == 1) {
747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748 "reservation to 1 thread\n",
749 master_tid));
750 return 1;
751 }
752 if (new_nthreads < set_nthreads) {
753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754 "reservation to %d threads\n",
755 master_tid, new_nthreads));
756 }
757 }
758#endif /* USE_LOAD_BALANCE */
759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760 new_nthreads = __kmp_avail_proc - __kmp_nth +
761 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762 if (new_nthreads <= 1) {
763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764 "reservation to 1 thread\n",
765 master_tid));
766 return 1;
767 }
768 if (new_nthreads < set_nthreads) {
769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770 "reservation to %d threads\n",
771 master_tid, new_nthreads));
772 } else {
773 new_nthreads = set_nthreads;
774 }
775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776 if (set_nthreads > 2) {
777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778 new_nthreads = (new_nthreads % set_nthreads) + 1;
779 if (new_nthreads == 1) {
780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781 "reservation to 1 thread\n",
782 master_tid));
783 return 1;
784 }
785 if (new_nthreads < set_nthreads) {
786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787 "reservation to %d threads\n",
788 master_tid, new_nthreads));
789 }
790 }
791 } else {
792 KMP_ASSERT(0);
793 }
794
795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796 if (__kmp_nth + new_nthreads -
797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798 __kmp_max_nth) {
799 int tl_nthreads = __kmp_max_nth - __kmp_nth +
800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801 if (tl_nthreads <= 0) {
802 tl_nthreads = 1;
803 }
804
805 // If dyn-var is false, emit a 1-time warning.
806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807 __kmp_reserve_warn = 1;
808 __kmp_msg(kmp_ms_warning,
809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811 }
812 if (tl_nthreads == 1) {
813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814 "reduced reservation to 1 thread\n",
815 master_tid));
816 return 1;
817 }
818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819 "reservation to %d threads\n",
820 master_tid, tl_nthreads));
821 new_nthreads = tl_nthreads;
822 }
823
824 // Respect OMP_THREAD_LIMIT
825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827 if (cg_nthreads + new_nthreads -
828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829 max_cg_threads) {
830 int tl_nthreads = max_cg_threads - cg_nthreads +
831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832 if (tl_nthreads <= 0) {
833 tl_nthreads = 1;
834 }
835
836 // If dyn-var is false, emit a 1-time warning.
837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838 __kmp_reserve_warn = 1;
839 __kmp_msg(kmp_ms_warning,
840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842 }
843 if (tl_nthreads == 1) {
844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845 "reduced reservation to 1 thread\n",
846 master_tid));
847 return 1;
848 }
849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850 "reservation to %d threads\n",
851 master_tid, tl_nthreads));
852 new_nthreads = tl_nthreads;
853 }
854
855 // Check if the threads array is large enough, or needs expanding.
856 // See comment in __kmp_register_root() about the adjustment if
857 // __kmp_threads[0] == NULL.
858 capacity = __kmp_threads_capacity;
859 if (TCR_PTR(__kmp_threads[0]) == NULL) {
860 --capacity;
861 }
862 // If it is not for initializing the hidden helper team, we need to take
863 // __kmp_hidden_helper_threads_num out of the capacity because it is included
864 // in __kmp_threads_capacity.
865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866 capacity -= __kmp_hidden_helper_threads_num;
867 }
868 if (__kmp_nth + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870 capacity) {
871 // Expand the threads array.
872 int slotsRequired = __kmp_nth + new_nthreads -
873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874 capacity;
875 int slotsAdded = __kmp_expand_threads(slotsRequired);
876 if (slotsAdded < slotsRequired) {
877 // The threads array was not expanded enough.
878 new_nthreads -= (slotsRequired - slotsAdded);
879 KMP_ASSERT(new_nthreads >= 1);
880
881 // If dyn-var is false, emit a 1-time warning.
882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883 __kmp_reserve_warn = 1;
884 if (__kmp_tp_cached) {
885 __kmp_msg(kmp_ms_warning,
886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889 } else {
890 __kmp_msg(kmp_ms_warning,
891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893 }
894 }
895 }
896 }
897
898#ifdef KMP_DEBUG
899 if (new_nthreads == 1) {
900 KC_TRACE(10,
901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902 "dead roots and rechecking; requested %d threads\n",
903 __kmp_get_gtid(), set_nthreads));
904 } else {
905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906 " %d threads\n",
907 __kmp_get_gtid(), new_nthreads, set_nthreads));
908 }
909#endif // KMP_DEBUG
910 return new_nthreads;
911}
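
/* Worked example with illustrative numbers: suppose __kmp_max_nth = 8,
   __kmp_nth = 6, the root is not active and its hot team has t_nproc = 4, and
   the parallel region requests set_nthreads = 8. The device-limit check above
   computes 6 + 8 - 4 = 10 > 8, so tl_nthreads = 8 - 6 + 4 = 6 and the
   reservation is trimmed to 6 threads (with a one-time warning if dyn-var is
   false). The OMP_THREAD_LIMIT and capacity checks that follow apply the same
   arithmetic against their own limits. */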
912
913/* Allocate threads from the thread pool and assign them to the new team. We are
914 assured that there are enough threads available, because we checked it
915 earlier while holding the forkjoin lock. */
916static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917 kmp_info_t *master_th, int master_gtid,
918 int fork_teams_workers) {
919 int i;
920 int use_hot_team;
921
922 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924 KMP_MB();
925
926 /* first, let's setup the primary thread */
927 master_th->th.th_info.ds.ds_tid = 0;
928 master_th->th.th_team = team;
929 master_th->th.th_team_nproc = team->t.t_nproc;
930 master_th->th.th_team_master = master_th;
931 master_th->th.th_team_serialized = FALSE;
932 master_th->th.th_dispatch = &team->t.t_dispatch[0];
933
934/* make sure we are not the optimized hot team */
935#if KMP_NESTED_HOT_TEAMS
936 use_hot_team = 0;
937 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938 if (hot_teams) { // hot teams array is not allocated if
939 // KMP_HOT_TEAMS_MAX_LEVEL=0
940 int level = team->t.t_active_level - 1; // index in array of hot teams
941 if (master_th->th.th_teams_microtask) { // are we inside the teams?
942 if (master_th->th.th_teams_size.nteams > 1) {
943 ++level; // level was not increased in teams construct for
944 // team_of_masters
945 }
946 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947 master_th->th.th_teams_level == team->t.t_level) {
948 ++level; // level was not increased in teams construct for
949 // team_of_workers before the parallel
950 } // team->t.t_level will be increased inside parallel
951 }
952 if (level < __kmp_hot_teams_max_level) {
953 if (hot_teams[level].hot_team) {
954 // hot team has already been allocated for given level
955 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956 use_hot_team = 1; // the team is ready to use
957 } else {
958 use_hot_team = 0; // AC: threads are not allocated yet
959 hot_teams[level].hot_team = team; // remember new hot team
960 hot_teams[level].hot_team_nth = team->t.t_nproc;
961 }
962 } else {
963 use_hot_team = 0;
964 }
965 }
966#else
967 use_hot_team = team == root->r.r_hot_team;
968#endif
969 if (!use_hot_team) {
970
971 /* install the primary thread */
972 team->t.t_threads[0] = master_th;
973 __kmp_initialize_info(master_th, team, 0, master_gtid);
974
975 /* now, install the worker threads */
976 for (i = 1; i < team->t.t_nproc; i++) {
977
978 /* fork or reallocate a new thread and install it in team */
979 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980 team->t.t_threads[i] = thr;
981 KMP_DEBUG_ASSERT(thr);
982 KMP_DEBUG_ASSERT(thr->th.th_team == team);
983 /* align team and thread arrived states */
984 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985 "T#%d(%d:%d) join =%llu, plain=%llu\n",
986 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989 team->t.t_bar[bs_plain_barrier].b_arrived));
990 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991 thr->th.th_teams_level = master_th->th.th_teams_level;
992 thr->th.th_teams_size = master_th->th.th_teams_size;
993 { // Initialize threads' barrier data.
994 int b;
995 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996 for (b = 0; b < bs_last_barrier; ++b) {
997 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999#if USE_DEBUGGER
1000 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001#endif
1002 }
1003 }
1004 }
1005
1006#if KMP_AFFINITY_SUPPORTED
1007 // Do not partition the places list for teams construct workers who
1008 // haven't actually been forked to do real work yet. This partitioning
1009 // will take place in the parallel region nested within the teams construct.
1010 if (!fork_teams_workers) {
1011 __kmp_partition_places(team);
1012 }
1013#endif
1014 }
1015
1016 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017 for (i = 0; i < team->t.t_nproc; i++) {
1018 kmp_info_t *thr = team->t.t_threads[i];
1019 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020 thr->th.th_prev_level != team->t.t_level) {
1021 team->t.t_display_affinity = 1;
1022 break;
1023 }
1024 }
1025 }
1026
1027 KMP_MB();
1028}
1029
1030#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031// Propagate any changes to the floating point control registers out to the team.
1032// We try to avoid unnecessary writes to the relevant cache line in the team
1033// structure, so we don't make changes unless they are needed.
1034inline static void propagateFPControl(kmp_team_t *team) {
1035 if (__kmp_inherit_fp_control) {
1036 kmp_int16 x87_fpu_control_word;
1037 kmp_uint32 mxcsr;
1038
1039 // Get primary thread's values of FPU control flags (both X87 and vector)
1040 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041 __kmp_store_mxcsr(&mxcsr);
1042 mxcsr &= KMP_X86_MXCSR_MASK;
1043
1044 // There is no point looking at t_fp_control_saved here.
1045 // If it is TRUE, we still have to update the values if they are different
1046 // from those we now have. If it is FALSE we didn't save anything yet, but
1047 // our objective is the same. We have to ensure that the values in the team
1048 // are the same as those we have.
1049 // So, this code achieves what we need whether or not t_fp_control_saved is
1050 // true. By checking whether the value needs updating we avoid unnecessary
1051 // writes that would put the cache-line into a written state, causing all
1052 // threads in the team to have to read it again.
1053 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055 // Although we don't use this value, other code in the runtime wants to know
1056 // whether it should restore them. So we must ensure it is correct.
1057 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058 } else {
1059 // Similarly here. Don't write to this cache-line in the team structure
1060 // unless we have to.
1061 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062 }
1063}
1064
1065// Do the opposite, setting the hardware registers to the updated values from
1066// the team.
1067inline static void updateHWFPControl(kmp_team_t *team) {
1068 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069 // Only restore the fp control regs if they have been changed in the team
1070 // structure by the parallel region that we are exiting.
1071 kmp_int16 x87_fpu_control_word;
1072 kmp_uint32 mxcsr;
1073 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074 __kmp_store_mxcsr(&mxcsr);
1075 mxcsr &= KMP_X86_MXCSR_MASK;
1076
1077 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078 __kmp_clear_x87_fpu_status_word();
1079 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080 }
1081
1082 if (team->t.t_mxcsr != mxcsr) {
1083 __kmp_load_mxcsr(&team->t.t_mxcsr);
1084 }
1085 }
1086}
1087#else
1088#define propagateFPControl(x) ((void)0)
1089#define updateHWFPControl(x) ((void)0)
1090#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
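
/* Illustrative sketch (not part of the runtime): propagateFPControl() captures
   the primary thread's x87 control word and MXCSR into the team, and
   updateHWFPControl() has a worker adopt them only if they differ from its own.
   The MXCSR half of that idea, using the standard SSE intrinsics and omitting
   the runtime's KMP_X86_MXCSR_MASK masking (helper names invented): */
#if 0
#include <xmmintrin.h> // _mm_getcsr / _mm_setcsr

static unsigned capture_team_mxcsr(void) { // ~ propagateFPControl()
  return _mm_getcsr();
}

static void adopt_team_mxcsr(unsigned team_mxcsr) { // ~ updateHWFPControl()
  if (_mm_getcsr() != team_mxcsr) // avoid a needless control-register write
    _mm_setcsr(team_mxcsr);
}
#endif // illustrative sketch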
1091
1092static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093 int realloc); // forward declaration
1094
1095/* Run a parallel region that has been serialized, so runs only in a team of the
1096 single primary thread. */
1097void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098 kmp_info_t *this_thr;
1099 kmp_team_t *serial_team;
1100
1101 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102
1103 /* Skip all this code for autopar serialized loops since it results in
1104 unacceptable overhead */
1105 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106 return;
1107
1108 if (!TCR_4(__kmp_init_parallel))
1109 __kmp_parallel_initialize();
1110 __kmp_resume_if_soft_paused();
1111
1112 this_thr = __kmp_threads[global_tid];
1113 serial_team = this_thr->th.th_serial_team;
1114
1115 /* utilize the serialized team held by this thread */
1116 KMP_DEBUG_ASSERT(serial_team);
1117 KMP_MB();
1118
1119 if (__kmp_tasking_mode != tskm_immediate_exec) {
1120 KMP_DEBUG_ASSERT(
1121 this_thr->th.th_task_team ==
1122 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124 NULL);
1125 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126 "team %p, new task_team = NULL\n",
1127 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128 this_thr->th.th_task_team = NULL;
1129 }
1130
1131 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133 proc_bind = proc_bind_false;
1134 } else if (proc_bind == proc_bind_default) {
1135 // No proc_bind clause was specified, so use the current value
1136 // of proc-bind-var for this parallel region.
1137 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138 }
1139 // Reset for next parallel region
1140 this_thr->th.th_set_proc_bind = proc_bind_default;
1141
1142#if OMPT_SUPPORT
1143 ompt_data_t ompt_parallel_data = ompt_data_none;
1144 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145 if (ompt_enabled.enabled &&
1146 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147
1148 ompt_task_info_t *parent_task_info;
1149 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150
1151 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152 if (ompt_enabled.ompt_callback_parallel_begin) {
1153 int team_size = 1;
1154
1155 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156 &(parent_task_info->task_data), &(parent_task_info->frame),
1157 &ompt_parallel_data, team_size,
1158 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159 }
1160 }
1161#endif // OMPT_SUPPORT
1162
1163 if (this_thr->th.th_team != serial_team) {
1164 // Nested level will be an index in the nested nthreads array
1165 int level = this_thr->th.th_team->t.t_level;
1166
1167 if (serial_team->t.t_serialized) {
1168 /* this serial team was already used
1169 TODO: increase performance by making these locks more specific */
1170 kmp_team_t *new_team;
1171
1172 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173
1174 new_team =
1175 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176#if OMPT_SUPPORT
1177 ompt_parallel_data,
1178#endif
1179 proc_bind, &this_thr->th.th_current_task->td_icvs,
1180 0 USE_NESTED_HOT_ARG(NULL));
1181 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182 KMP_ASSERT(new_team);
1183
1184 /* setup new serialized team and install it */
1185 new_team->t.t_threads[0] = this_thr;
1186 new_team->t.t_parent = this_thr->th.th_team;
1187 serial_team = new_team;
1188 this_thr->th.th_serial_team = serial_team;
1189
1190 KF_TRACE(
1191 10,
1192 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193 global_tid, serial_team));
1194
1195 /* TODO the above breaks the requirement that if we run out of resources,
1196 then we can still guarantee that serialized teams are ok, since we may
1197 need to allocate a new one */
1198 } else {
1199 KF_TRACE(
1200 10,
1201 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202 global_tid, serial_team));
1203 }
1204
1205 /* we have to initialize this serial team */
1206 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209 serial_team->t.t_ident = loc;
1210 serial_team->t.t_serialized = 1;
1211 serial_team->t.t_nproc = 1;
1212 serial_team->t.t_parent = this_thr->th.th_team;
1213 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214 this_thr->th.th_team = serial_team;
1215 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216
1217 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218 this_thr->th.th_current_task));
1219 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220 this_thr->th.th_current_task->td_flags.executing = 0;
1221
1222 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223
1224 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225 implicit task for each serialized task represented by
1226 team->t.t_serialized? */
1227 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228 &this_thr->th.th_current_task->td_parent->td_icvs);
1229
1230 // Thread value exists in the nested nthreads array for the next nested
1231 // level
1232 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233 this_thr->th.th_current_task->td_icvs.nproc =
1234 __kmp_nested_nth.nth[level + 1];
1235 }
1236
1237 if (__kmp_nested_proc_bind.used &&
1238 (level + 1 < __kmp_nested_proc_bind.used)) {
1239 this_thr->th.th_current_task->td_icvs.proc_bind =
1240 __kmp_nested_proc_bind.bind_types[level + 1];
1241 }
1242
1243#if USE_DEBUGGER
1244 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245#endif
1246 this_thr->th.th_info.ds.ds_tid = 0;
1247
1248 /* set thread cache values */
1249 this_thr->th.th_team_nproc = 1;
1250 this_thr->th.th_team_master = this_thr;
1251 this_thr->th.th_team_serialized = 1;
1252
1253 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256
1257 propagateFPControl(serial_team);
1258
1259 /* check if we need to allocate dispatch buffers stack */
1260 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262 serial_team->t.t_dispatch->th_disp_buffer =
1263 (dispatch_private_info_t *)__kmp_allocate(
1264 sizeof(dispatch_private_info_t));
1265 }
1266 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267
1268 KMP_MB();
1269
1270 } else {
1271 /* this serialized team is already being used,
1272 * that's fine, just add another nested level */
1273 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276 ++serial_team->t.t_serialized;
1277 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278
1279 // Nested level will be an index in the nested nthreads array
1280 int level = this_thr->th.th_team->t.t_level;
1281 // Thread value exists in the nested nthreads array for the next nested
1282 // level
1283 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284 this_thr->th.th_current_task->td_icvs.nproc =
1285 __kmp_nested_nth.nth[level + 1];
1286 }
1287 serial_team->t.t_level++;
1288 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289 "of serial team %p to %d\n",
1290 global_tid, serial_team, serial_team->t.t_level));
1291
1292 /* allocate/push dispatch buffers stack */
1293 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294 {
1295 dispatch_private_info_t *disp_buffer =
1296 (dispatch_private_info_t *)__kmp_allocate(
1297 sizeof(dispatch_private_info_t));
1298 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300 }
1301 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302
1303 KMP_MB();
1304 }
1305 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306
1307 // Perform the display affinity functionality for
1308 // serialized parallel regions
1309 if (__kmp_display_affinity) {
1310 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311 this_thr->th.th_prev_num_threads != 1) {
1312 // NULL means use the affinity-format-var ICV
1313 __kmp_aux_display_affinity(global_tid, NULL);
1314 this_thr->th.th_prev_level = serial_team->t.t_level;
1315 this_thr->th.th_prev_num_threads = 1;
1316 }
1317 }
1318
1319 if (__kmp_env_consistency_check)
1320 __kmp_push_parallel(global_tid, NULL);
1321#if OMPT_SUPPORT
1322 serial_team->t.ompt_team_info.master_return_address = codeptr;
1323 if (ompt_enabled.enabled &&
1324 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326 OMPT_GET_FRAME_ADDRESS(0);
1327
1328 ompt_lw_taskteam_t lw_taskteam;
1329 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330 &ompt_parallel_data, codeptr);
1331
1332 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333 // don't use lw_taskteam after linking; its content was swapped
1334
1335 /* OMPT implicit task begin */
1336 if (ompt_enabled.ompt_callback_implicit_task) {
1337 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342 __kmp_tid_from_gtid(global_tid);
1343 }
1344
1345 /* OMPT state */
1346 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348 OMPT_GET_FRAME_ADDRESS(0);
1349 }
1350#endif
1351}
1352
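/* Illustrative sketch (not part of the runtime): each additional nested
   serialized level above pushes a freshly allocated dispatch_private_info_t
   onto the head of th_disp_buffer, so the buffers form a LIFO stack that is
   popped again as the nested levels end. A minimal stand-alone version of that
   stack discipline (type and helper names invented for this example): */
#if 0
#include <cstdlib>

struct example_disp_buffer {
  example_disp_buffer *next; // singly linked, newest level first
};

static void push_level(example_disp_buffer **head) {
  example_disp_buffer *b =
      static_cast<example_disp_buffer *>(std::calloc(1, sizeof(*b)));
  b->next = *head; // one buffer per nested serialized level
  *head = b;
}

static void pop_level(example_disp_buffer **head) { // at the matching end
  example_disp_buffer *b = *head;
  *head = b->next;
  std::free(b);
}
#endif // illustrative sketch
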
1353/* most of the work for a fork */
1354/* return true if we really went parallel, false if serialized */
1355int __kmp_fork_call(ident_t *loc, int gtid,
1356 enum fork_context_e call_context, // Intel, GNU, ...
1357 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358 kmp_va_list ap) {
1359 void **argv;
1360 int i;
1361 int master_tid;
1362 int master_this_cons;
1363 kmp_team_t *team;
1364 kmp_team_t *parent_team;
1365 kmp_info_t *master_th;
1366 kmp_root_t *root;
1367 int nthreads;
1368 int master_active;
1369 int master_set_numthreads;
1370 int level;
1371 int active_level;
1372 int teams_level;
1373#if KMP_NESTED_HOT_TEAMS
1374 kmp_hot_team_ptr_t **p_hot_teams;
1375#endif
1376 { // KMP_TIME_BLOCK
1377 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379
1380 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382 /* Some systems prefer the stack for the root thread(s) to start with */
1383 /* some gap from the parent stack to prevent false sharing. */
1384 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385 /* These 2 lines below are so this does not get optimized out */
1386 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387 __kmp_stkpadding += (short)((kmp_int64)dummy);
1388 }
1389
1390 /* initialize if needed */
1391 KMP_DEBUG_ASSERT(
1392 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393 if (!TCR_4(__kmp_init_parallel))
1394 __kmp_parallel_initialize();
1395 __kmp_resume_if_soft_paused();
1396
1397 /* setup current data */
1398 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399 // shutdown
1400 parent_team = master_th->th.th_team;
1401 master_tid = master_th->th.th_info.ds.ds_tid;
1402 master_this_cons = master_th->th.th_local.this_construct;
1403 root = master_th->th.th_root;
1404 master_active = root->r.r_active;
1405 master_set_numthreads = master_th->th.th_set_nproc;
1406
1407#if OMPT_SUPPORT
1408 ompt_data_t ompt_parallel_data = ompt_data_none;
1409 ompt_data_t *parent_task_data;
1410 ompt_frame_t *ompt_frame;
1411 ompt_data_t *implicit_task_data;
1412 void *return_address = NULL;
1413
1414 if (ompt_enabled.enabled) {
1415 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416 NULL, NULL);
1417 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418 }
1419#endif
1420
1421 // Assign affinity to root thread if it hasn't happened yet
1422 __kmp_assign_root_init_mask();
1423
1424 // Nested level will be an index in the nested nthreads array
1425 level = parent_team->t.t_level;
1426 // used to launch non-serial teams even if nested is not allowed
1427 active_level = parent_team->t.t_active_level;
1428 // needed to check nesting inside the teams
1429 teams_level = master_th->th.th_teams_level;
1430#if KMP_NESTED_HOT_TEAMS
1431 p_hot_teams = &master_th->th.th_hot_teams;
1432 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436 // it is either actual or not needed (when active_level > 0)
1437 (*p_hot_teams)[0].hot_team_nth = 1;
1438 }
1439#endif
1440
1441#if OMPT_SUPPORT
1442 if (ompt_enabled.enabled) {
1443 if (ompt_enabled.ompt_callback_parallel_begin) {
1444 int team_size = master_set_numthreads
1445 ? master_set_numthreads
1446 : get__nproc_2(parent_team, master_tid);
1447 int flags = OMPT_INVOKER(call_context) |
1448 ((microtask == (microtask_t)__kmp_teams_master)
1449 ? ompt_parallel_league
1450 : ompt_parallel_team);
1451 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453 return_address);
1454 }
1455 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456 }
1457#endif
1458
1459 master_th->th.th_ident = loc;
1460
1461 if (master_th->th.th_teams_microtask && ap &&
1462 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463 // AC: This is the start of a parallel region nested inside a teams construct.
1464 // The team is actual (hot); all workers are ready at the fork barrier.
1465 // No lock is needed to initialize the team a bit, then release the workers.
1466 parent_team->t.t_ident = loc;
1467 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468 parent_team->t.t_argc = argc;
1469 argv = (void **)parent_team->t.t_argv;
1470 for (i = argc - 1; i >= 0; --i)
1471 *argv++ = va_arg(kmp_va_deref(ap), void *);
1472 // Increment our nested depth levels, but not increase the serialization
1473 if (parent_team == master_th->th.th_serial_team) {
1474 // AC: we are in serialized parallel
1475 __kmpc_serialized_parallel(loc, gtid);
1476 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477
1478 if (call_context == fork_context_gnu) {
1479 // AC: need to decrement t_serialized for enquiry functions to work
1480 // correctly, will restore at join time
1481 parent_team->t.t_serialized--;
1482 return TRUE;
1483 }
1484
1485#if OMPD_SUPPORT
1486 parent_team->t.t_pkfn = microtask;
1487#endif
1488
1489#if OMPT_SUPPORT
1490 void *dummy;
1491 void **exit_frame_p;
1492
1493 ompt_lw_taskteam_t lw_taskteam;
1494
1495 if (ompt_enabled.enabled) {
1496 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497 &ompt_parallel_data, return_address);
1498 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499
1500 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501 // don't use lw_taskteam after linking; its content was swapped
1502
1503 /* OMPT implicit task begin */
1504 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505 if (ompt_enabled.ompt_callback_implicit_task) {
1506 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507 __kmp_tid_from_gtid(gtid);
1508 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510 implicit_task_data, 1,
1511 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512 }
1513
1514 /* OMPT state */
1515 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516 } else {
1517 exit_frame_p = &dummy;
1518 }
1519#endif
1520 // AC: need to decrement t_serialized for enquiry functions to work
1521 // correctly, will restore at join time
1522 parent_team->t.t_serialized--;
1523
1524 {
1525 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528#if OMPT_SUPPORT
1529 ,
1530 exit_frame_p
1531#endif
1532 );
1533 }
1534
1535#if OMPT_SUPPORT
1536 if (ompt_enabled.enabled) {
1537 *exit_frame_p = NULL;
1538 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539 if (ompt_enabled.ompt_callback_implicit_task) {
1540 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541 ompt_scope_end, NULL, implicit_task_data, 1,
1542 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543 }
1544 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545 __ompt_lw_taskteam_unlink(master_th);
1546 if (ompt_enabled.ompt_callback_parallel_end) {
1547 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549 OMPT_INVOKER(call_context) | ompt_parallel_team,
1550 return_address);
1551 }
1552 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553 }
1554#endif
1555 return TRUE;
1556 }
1557
1558 parent_team->t.t_pkfn = microtask;
1559 parent_team->t.t_invoke = invoker;
1560 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561 parent_team->t.t_active_level++;
1562 parent_team->t.t_level++;
1563 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564
1565#if OMPT_SUPPORT
1566 if (ompt_enabled.enabled) {
1567 ompt_lw_taskteam_t lw_taskteam;
1568 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569 &ompt_parallel_data, return_address);
1570 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571 }
1572#endif
1573
1574 /* Change number of threads in the team if requested */
1575 if (master_set_numthreads) { // The parallel has num_threads clause
1576 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577 // AC: only can reduce number of threads dynamically, can't increase
1578 kmp_info_t **other_threads = parent_team->t.t_threads;
1579 // NOTE: if using distributed barrier, we need to run this code block
1580 // even when the team size appears not to have changed from the max.
1581 int old_proc = master_th->th.th_teams_size.nth;
1582 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583 bp_dist_bar) {
1584 __kmp_resize_dist_barrier(parent_team, old_proc,
1585 master_set_numthreads);
1586 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587 }
1588 parent_team->t.t_nproc = master_set_numthreads;
1589 for (i = 0; i < master_set_numthreads; ++i) {
1590 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591 }
1592 }
1593 // Keep extra threads hot in the team for possible subsequent parallel regions
1594 master_th->th.th_set_nproc = 0;
1595 }
1596
1597#if USE_DEBUGGER
1598 if (__kmp_debugging) { // Let debugger override number of threads.
1599 int nth = __kmp_omp_num_threads(loc);
1600 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601 master_set_numthreads = nth;
1602 }
1603 }
1604#endif
1605
1606 // Figure out the proc_bind policy for the nested parallel within teams
1607 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608 // proc_bind_default means don't update
1609 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611 proc_bind = proc_bind_false;
1612 } else {
1613 // No proc_bind clause specified; use current proc-bind-var
1614 if (proc_bind == proc_bind_default) {
1615 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616 }
1617 /* else: The proc_bind policy was specified explicitly on the parallel
1618 construct's proc_bind clause.
1619 This overrides proc-bind-var for this parallel region, but does not
1620 change proc-bind-var itself. */
1621 // Figure out the value of proc-bind-var for the child threads.
1622 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624 master_th->th.th_current_task->td_icvs.proc_bind)) {
1625 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626 }
1627 }
1628 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629 // Need to change the bind-var ICV to correct value for each implicit task
1630 if (proc_bind_icv != proc_bind_default &&
1631 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632 kmp_info_t **other_threads = parent_team->t.t_threads;
1633 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634 other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635 proc_bind_icv;
1636 }
1637 }
1638 // Reset for next parallel region
1639 master_th->th.th_set_proc_bind = proc_bind_default;
1640
1641#if USE_ITT_BUILD && USE_ITT_NOTIFY
1642 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643 KMP_ITT_DEBUG) &&
1644 __kmp_forkjoin_frames_mode == 3 &&
1645 parent_team->t.t_active_level == 1 // only report frames at level 1
1646 && master_th->th.th_teams_size.nteams == 1) {
1647 kmp_uint64 tmp_time = __itt_get_timestamp();
1648 master_th->th.th_frame_time = tmp_time;
1649 parent_team->t.t_region_time = tmp_time;
1650 }
1651 if (__itt_stack_caller_create_ptr) {
1652 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653 // create new stack stitching id before entering fork barrier
1654 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655 }
1656#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657#if KMP_AFFINITY_SUPPORTED
1658 __kmp_partition_places(parent_team);
1659#endif
1660
1661 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662 "master_th=%p, gtid=%d\n",
1663 root, parent_team, master_th, gtid));
1664 __kmp_internal_fork(loc, gtid, parent_team);
1665 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666 "master_th=%p, gtid=%d\n",
1667 root, parent_team, master_th, gtid));
1668
1669 if (call_context == fork_context_gnu)
1670 return TRUE;
1671
1672 /* Invoke microtask for PRIMARY thread */
1673 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674 parent_team->t.t_id, parent_team->t.t_pkfn));
1675
1676 if (!parent_team->t.t_invoke(gtid)) {
1677 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678 }
1679 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680 parent_team->t.t_id, parent_team->t.t_pkfn));
1681 KMP_MB(); /* Flush all pending memory write invalidates. */
1682
1683 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684
1685 return TRUE;
1686 } // Parallel closely nested in teams construct
1687
1688#if KMP_DEBUG
1689 if (__kmp_tasking_mode != tskm_immediate_exec) {
1690 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691 parent_team->t.t_task_team[master_th->th.th_task_state]);
1692 }
1693#endif
1694
1695 // Need this to happen before we determine the number of threads, not while
1696 // we are allocating the team
1697 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698 int enter_teams = 0;
1699 if (parent_team->t.t_active_level >=
1700 master_th->th.th_current_task->td_icvs.max_active_levels) {
1701 nthreads = 1;
1702 } else {
1703 enter_teams = ((ap == NULL && active_level == 0) ||
1704 (ap && teams_level > 0 && teams_level == level));
1705 nthreads = master_set_numthreads
1706 ? master_set_numthreads
1707 // TODO: get nproc directly from current task
1708 : get__nproc_2(parent_team, master_tid);
1709 // Check whether we need to take the forkjoin lock (no need for a serialized
1710 // parallel outside of a teams construct). This code was moved here from
1711 // __kmp_reserve_threads() to speed up nested serialized parallels.
1712 if (nthreads > 1) {
1713 if ((get__max_active_levels(master_th) == 1 &&
1714 (root->r.r_in_parallel && !enter_teams)) ||
1715 (__kmp_library == library_serial)) {
1716 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717 " threads\n",
1718 gtid, nthreads));
1719 nthreads = 1;
1720 }
1721 }
1722 if (nthreads > 1) {
1723 /* determine how many new threads we can use */
1724 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725 /* AC: If we execute teams from a parallel region (on the host), then the
1726 teams should be created, but each can have only 1 thread if nesting is
1727 disabled. If teams is called from a serial region, then the teams and
1728 their threads should be created regardless of the nesting setting. */
1729 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730 nthreads, enter_teams);
1731 if (nthreads == 1) {
1732 // Free the lock for single-thread execution here; for multi-thread
1733 // execution it will be freed later, after the team of threads has been
1734 // created and initialized
1735 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736 }
1737 }
1738 }
1739 KMP_DEBUG_ASSERT(nthreads > 0);
1740
1741 // If we temporarily changed the set number of threads then restore it now
1742 master_th->th.th_set_nproc = 0;
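// For illustration (a simplified sketch of the decision above, not extra
// logic): for a construct such as
//   #pragma omp parallel num_threads(8)
// encountered with OMP_NUM_THREADS=4, the num_threads request of 8 takes
// precedence over the nproc ICV, and __kmp_reserve_threads() may still trim
// it; if max-active-levels is already reached, or the library runs in serial
// mode, nthreads collapses to 1 and the region is serialized below.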
1743
1744 /* create a serialized parallel region? */
1745 if (nthreads == 1) {
1746/* josh todo: hypothetical question: what do we do for OS X*? */
1747#if KMP_OS_LINUX && \
1748 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749 void *args[argc];
1750#else
1751 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753 KMP_ARCH_AARCH64) */
1754
1755 KA_TRACE(20,
1756 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757
1758 __kmpc_serialized_parallel(loc, gtid);
1759
1760#if OMPD_SUPPORT
1761 master_th->th.th_serial_team->t.t_pkfn = microtask;
1762#endif
1763
1764 if (call_context == fork_context_intel) {
1765 /* TODO this sucks, use the compiler itself to pass args! :) */
1766 master_th->th.th_serial_team->t.t_ident = loc;
1767 if (!ap) {
1768 // revert change made in __kmpc_serialized_parallel()
1769 master_th->th.th_serial_team->t.t_level--;
1770 // Get args from parent team for teams construct
1771
1772#if OMPT_SUPPORT
1773 void *dummy;
1774 void **exit_frame_p;
1775 ompt_task_info_t *task_info;
1776
1777 ompt_lw_taskteam_t lw_taskteam;
1778
1779 if (ompt_enabled.enabled) {
1780 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781 &ompt_parallel_data, return_address);
1782
1783 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784 // don't use lw_taskteam after linking; its content was swapped
1785
1786 task_info = OMPT_CUR_TASK_INFO(master_th);
1787 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788 if (ompt_enabled.ompt_callback_implicit_task) {
1789 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790 __kmp_tid_from_gtid(gtid);
1791 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793 &(task_info->task_data), 1,
1794 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795 ompt_task_implicit);
1796 }
1797
1798 /* OMPT state */
1799 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800 } else {
1801 exit_frame_p = &dummy;
1802 }
1803#endif
1804
1805 {
1806 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809 parent_team->t.t_argv
1810#if OMPT_SUPPORT
1811 ,
1812 exit_frame_p
1813#endif
1814 );
1815 }
1816
1817#if OMPT_SUPPORT
1818 if (ompt_enabled.enabled) {
1819 *exit_frame_p = NULL;
1820 if (ompt_enabled.ompt_callback_implicit_task) {
1821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822 ompt_scope_end, NULL, &(task_info->task_data), 1,
1823 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824 ompt_task_implicit);
1825 }
1826 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827 __ompt_lw_taskteam_unlink(master_th);
1828 if (ompt_enabled.ompt_callback_parallel_end) {
1829 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830 &ompt_parallel_data, parent_task_data,
1831 OMPT_INVOKER(call_context) | ompt_parallel_team,
1832 return_address);
1833 }
1834 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835 }
1836#endif
1837 } else if (microtask == (microtask_t)__kmp_teams_master) {
1838 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839 master_th->th.th_serial_team);
1840 team = master_th->th.th_team;
1841 // team->t.t_pkfn = microtask;
1842 team->t.t_invoke = invoker;
1843 __kmp_alloc_argv_entries(argc, team, TRUE);
1844 team->t.t_argc = argc;
1845 argv = (void **)team->t.t_argv;
1846 if (ap) {
1847 for (i = argc - 1; i >= 0; --i)
1848 *argv++ = va_arg(kmp_va_deref(ap), void *);
1849 } else {
1850 for (i = 0; i < argc; ++i)
1851 // Get args from parent team for teams construct
1852 argv[i] = parent_team->t.t_argv[i];
1853 }
1854 // AC: revert change made in __kmpc_serialized_parallel()
1855 // because initial code in teams should have level=0
1856 team->t.t_level--;
1857 // AC: call special invoker for outer "parallel" of teams construct
1858 invoker(gtid);
1859#if OMPT_SUPPORT
1860 if (ompt_enabled.enabled) {
1861 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862 if (ompt_enabled.ompt_callback_implicit_task) {
1863 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864 ompt_scope_end, NULL, &(task_info->task_data), 0,
1865 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866 }
1867 if (ompt_enabled.ompt_callback_parallel_end) {
1868 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869 &ompt_parallel_data, parent_task_data,
1870 OMPT_INVOKER(call_context) | ompt_parallel_league,
1871 return_address);
1872 }
1873 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874 }
1875#endif
1876 } else {
1877 argv = args;
1878 for (i = argc - 1; i >= 0; --i)
1879 *argv++ = va_arg(kmp_va_deref(ap), void *);
1880 KMP_MB();
1881
1882#if OMPT_SUPPORT
1883 void *dummy;
1884 void **exit_frame_p;
1885 ompt_task_info_t *task_info;
1886
1887 ompt_lw_taskteam_t lw_taskteam;
1888
1889 if (ompt_enabled.enabled) {
1890 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891 &ompt_parallel_data, return_address);
1892 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893 // don't use lw_taskteam after linking; its content was swapped
1894 task_info = OMPT_CUR_TASK_INFO(master_th);
1895 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896
1897 /* OMPT implicit task begin */
1898 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899 if (ompt_enabled.ompt_callback_implicit_task) {
1900 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903 ompt_task_implicit);
1904 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905 __kmp_tid_from_gtid(gtid);
1906 }
1907
1908 /* OMPT state */
1909 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910 } else {
1911 exit_frame_p = &dummy;
1912 }
1913#endif
1914
1915 {
1916 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919#if OMPT_SUPPORT
1920 ,
1921 exit_frame_p
1922#endif
1923 );
1924 }
1925
1926#if OMPT_SUPPORT
1927 if (ompt_enabled.enabled) {
1928 *exit_frame_p = NULL;
1929 if (ompt_enabled.ompt_callback_implicit_task) {
1930 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931 ompt_scope_end, NULL, &(task_info->task_data), 1,
1932 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933 ompt_task_implicit);
1934 }
1935
1936 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937 __ompt_lw_taskteam_unlink(master_th);
1938 if (ompt_enabled.ompt_callback_parallel_end) {
1939 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940 &ompt_parallel_data, parent_task_data,
1941 OMPT_INVOKER(call_context) | ompt_parallel_team,
1942 return_address);
1943 }
1944 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945 }
1946#endif
1947 }
1948 } else if (call_context == fork_context_gnu) {
1949#if OMPT_SUPPORT
1950 if (ompt_enabled.enabled) {
1951 ompt_lw_taskteam_t lwt;
1952 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1953 return_address);
1954
1955 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1956 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1957 }
1958// don't use lw_taskteam after linking; its content was swapped
1959#endif
1960
1961 // we were called from GNU native code
1962 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1963 return FALSE;
1964 } else {
1965 KMP_ASSERT2(call_context < fork_context_last,
1966 "__kmp_fork_call: unknown fork_context parameter");
1967 }
1968
1969 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1970 KMP_MB();
1971 return FALSE;
1972 } // if (nthreads == 1)
1973
1974 // GEH: only modify the executing flag in the case when not serialized;
1975 // the serialized case is handled in kmpc_serialized_parallel
1976 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1977 "curtask=%p, curtask_max_aclevel=%d\n",
1978 parent_team->t.t_active_level, master_th,
1979 master_th->th.th_current_task,
1980 master_th->th.th_current_task->td_icvs.max_active_levels));
1981 // TODO: GEH - cannot do this assertion because root thread not set up as
1982 // executing
1983 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1984 master_th->th.th_current_task->td_flags.executing = 0;
1985
1986 if (!master_th->th.th_teams_microtask || level > teams_level) {
1987 /* Increment our nested depth level */
1988 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1989 }
1990
1991 // See if we need to make a copy of the ICVs.
1992 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1993 if ((level + 1 < __kmp_nested_nth.used) &&
1994 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1995 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1996 } else {
1997 nthreads_icv = 0; // don't update
1998 }
1999
2000 // Figure out the proc_bind_policy for the new team.
2001 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2002 // proc_bind_default means don't update
2003 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2004 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2005 proc_bind = proc_bind_false;
2006 } else {
2007 // No proc_bind clause specified; use current proc-bind-var for this
2008 // parallel region
2009 if (proc_bind == proc_bind_default) {
2010 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2011 }
2012 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2013 if (master_th->th.th_teams_microtask &&
2014 microtask == (microtask_t)__kmp_teams_master) {
2015 proc_bind = __kmp_teams_proc_bind;
2016 }
2017 /* else: The proc_bind policy was specified explicitly on the parallel
2018 construct's proc_bind clause. This overrides proc-bind-var for this
2019 parallel region, but does not change proc-bind-var itself. */
2020 // Figure out the value of proc-bind-var for the child threads.
2021 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2022 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2023 master_th->th.th_current_task->td_icvs.proc_bind)) {
2024 // Do not modify the proc-bind ICV for the two teams-construct forks;
2025 // they just let the proc-bind ICV pass through
2026 if (!master_th->th.th_teams_microtask ||
2027 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2028 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2029 }
2030 }
2031
2032 // Reset for next parallel region
2033 master_th->th.th_set_proc_bind = proc_bind_default;
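// For illustration (a behavioral sketch of the resolution above): with
// OMP_PROC_BIND=spread,close an outer
//   #pragma omp parallel              // no proc_bind clause
// uses proc-bind-var (spread) for this region while the child threads'
// proc-bind-var becomes the next list element (close); an explicit
//   #pragma omp parallel proc_bind(primary)
// overrides the binding for this region only and leaves proc-bind-var
// unchanged, as the comments above describe.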
2034
2035 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2036 kmp_internal_control_t new_icvs;
2037 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2038 new_icvs.next = NULL;
2039 if (nthreads_icv > 0) {
2040 new_icvs.nproc = nthreads_icv;
2041 }
2042 if (proc_bind_icv != proc_bind_default) {
2043 new_icvs.proc_bind = proc_bind_icv;
2044 }
2045
2046 /* allocate a new parallel team */
2047 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2048 team = __kmp_allocate_team(root, nthreads, nthreads,
2049#if OMPT_SUPPORT
2050 ompt_parallel_data,
2051#endif
2052 proc_bind, &new_icvs,
2053 argc USE_NESTED_HOT_ARG(master_th));
2054 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2055 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2056 } else {
2057 /* allocate a new parallel team */
2058 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2059 team = __kmp_allocate_team(root, nthreads, nthreads,
2060#if OMPT_SUPPORT
2061 ompt_parallel_data,
2062#endif
2063 proc_bind,
2064 &master_th->th.th_current_task->td_icvs,
2065 argc USE_NESTED_HOT_ARG(master_th));
2066 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2067 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2068 &master_th->th.th_current_task->td_icvs);
2069 }
2070 KF_TRACE(
2071 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2072
2073 /* setup the new team */
2074 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2075 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2076 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2077 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2078 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2079#if OMPT_SUPPORT
2080 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2081 return_address);
2082#endif
2083 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2084 // TODO: parent_team->t.t_level == INT_MAX ???
2085 if (!master_th->th.th_teams_microtask || level > teams_level) {
2086 int new_level = parent_team->t.t_level + 1;
2087 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2088 new_level = parent_team->t.t_active_level + 1;
2089 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2090 } else {
2091 // AC: Do not increase parallel level at start of the teams construct
2092 int new_level = parent_team->t.t_level;
2093 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2094 new_level = parent_team->t.t_active_level;
2095 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2096 }
2097 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2098 // set primary thread's schedule as new run-time schedule
2099 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2100
2101 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2102 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2103
2104 // Update the floating point rounding in the team if required.
2105 propagateFPControl(team);
2106#if OMPD_SUPPORT
2107 if (ompd_state & OMPD_ENABLE_BP)
2108 ompd_bp_parallel_begin();
2109#endif
2110
2111 if (__kmp_tasking_mode != tskm_immediate_exec) {
2112 // Set the primary thread's task team to the team's task team. Unless this
2113 // is a hot team, it should be NULL.
2114 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2115 parent_team->t.t_task_team[master_th->th.th_task_state]);
2116 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2117 "%p, new task_team %p / team %p\n",
2118 __kmp_gtid_from_thread(master_th),
2119 master_th->th.th_task_team, parent_team,
2120 team->t.t_task_team[master_th->th.th_task_state], team));
2121
2122 if (active_level || master_th->th.th_task_team) {
2123 // Remember the primary thread's task_state on the memo stack
2124 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2125 if (master_th->th.th_task_state_top >=
2126 master_th->th.th_task_state_stack_sz) { // increase size
2127 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2128 kmp_uint8 *old_stack, *new_stack;
2129 kmp_uint32 i;
2130 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2131 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2132 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2133 }
2134 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2135 ++i) { // zero-init rest of stack
2136 new_stack[i] = 0;
2137 }
2138 old_stack = master_th->th.th_task_state_memo_stack;
2139 master_th->th.th_task_state_memo_stack = new_stack;
2140 master_th->th.th_task_state_stack_sz = new_size;
2141 __kmp_free(old_stack);
2142 }
2143 // Store primary thread's task_state on stack
2144 master_th->th
2145 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2146 master_th->th.th_task_state;
2147 master_th->th.th_task_state_top++;
2148#if KMP_NESTED_HOT_TEAMS
2149 if (master_th->th.th_hot_teams &&
2150 active_level < __kmp_hot_teams_max_level &&
2151 team == master_th->th.th_hot_teams[active_level].hot_team) {
2152 // Restore primary thread's nested state if nested hot team
2153 master_th->th.th_task_state =
2154 master_th->th
2155 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2156 } else {
2157#endif
2158 master_th->th.th_task_state = 0;
2159#if KMP_NESTED_HOT_TEAMS
2160 }
2161#endif
2162 }
2163#if !KMP_NESTED_HOT_TEAMS
2164 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2165 (team == root->r.r_hot_team));
2166#endif
2167 }
2168
2169 KA_TRACE(
2170 20,
2171 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2172 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2173 team->t.t_nproc));
2174 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2175 (team->t.t_master_tid == 0 &&
2176 (team->t.t_parent == root->r.r_root_team ||
2177 team->t.t_parent->t.t_serialized)));
2178 KMP_MB();
2179
2180 /* now, setup the arguments */
2181 argv = (void **)team->t.t_argv;
2182 if (ap) {
2183 for (i = argc - 1; i >= 0; --i) {
2184 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2185 KMP_CHECK_UPDATE(*argv, new_argv);
2186 argv++;
2187 }
2188 } else {
2189 for (i = 0; i < argc; ++i) {
2190 // Get args from parent team for teams construct
2191 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2192 }
2193 }
2194
2195 /* now actually fork the threads */
2196 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2197 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2198 root->r.r_active = TRUE;
2199
2200 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2201 __kmp_setup_icv_copy(team, nthreads,
2202 &master_th->th.th_current_task->td_icvs, loc);
2203
2204#if OMPT_SUPPORT
2205 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2206#endif
2207
2208 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2209
2210#if USE_ITT_BUILD
2211 if (team->t.t_active_level == 1 // only report frames at level 1
2212 && !master_th->th.th_teams_microtask) { // not in teams construct
2213#if USE_ITT_NOTIFY
2214 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2215 (__kmp_forkjoin_frames_mode == 3 ||
2216 __kmp_forkjoin_frames_mode == 1)) {
2217 kmp_uint64 tmp_time = 0;
2218 if (__itt_get_timestamp_ptr)
2219 tmp_time = __itt_get_timestamp();
2220 // Internal fork - report frame begin
2221 master_th->th.th_frame_time = tmp_time;
2222 if (__kmp_forkjoin_frames_mode == 3)
2223 team->t.t_region_time = tmp_time;
2224 } else
2225// only one notification scheme (either "submit" or "forking/joined", not both)
2226#endif /* USE_ITT_NOTIFY */
2227 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2228 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2229 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2230 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2231 }
2232 }
2233#endif /* USE_ITT_BUILD */
2234
2235 /* now go on and do the work */
2236 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2237 KMP_MB();
2238 KF_TRACE(10,
2239 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2240 root, team, master_th, gtid));
2241
2242#if USE_ITT_BUILD
2243 if (__itt_stack_caller_create_ptr) {
2244 // create new stack stitching id before entering fork barrier
2245 if (!enter_teams) {
2246 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2247 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2248 } else if (parent_team->t.t_serialized) {
2249 // keep stack stitching id in the serialized parent_team;
2250 // current team will be used for parallel inside the teams;
2251 // if parent_team is active, then it already keeps stack stitching id
2252 // for the league of teams
2253 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2254 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2255 }
2256 }
2257#endif /* USE_ITT_BUILD */
2258
2259 // AC: skip __kmp_internal_fork for the teams construct; let only the
2260 // primary threads execute
2261 if (ap) {
2262 __kmp_internal_fork(loc, gtid, team);
2263 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2264 "master_th=%p, gtid=%d\n",
2265 root, team, master_th, gtid));
2266 }
2267
2268 if (call_context == fork_context_gnu) {
2269 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2270 return TRUE;
2271 }
2272
2273 /* Invoke microtask for PRIMARY thread */
2274 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2275 team->t.t_id, team->t.t_pkfn));
2276 } // END of timer KMP_fork_call block
2277
2278#if KMP_STATS_ENABLED
2279 // If beginning a teams construct, then change thread state
2280 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2281 if (!ap) {
2282 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2283 }
2284#endif
2285
2286 if (!team->t.t_invoke(gtid)) {
2287 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2288 }
2289
2290#if KMP_STATS_ENABLED
2291 // If was beginning of a teams construct, then reset thread state
2292 if (!ap) {
2293 KMP_SET_THREAD_STATE(previous_state);
2294 }
2295#endif
2296
2297 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2298 team->t.t_id, team->t.t_pkfn));
2299 KMP_MB(); /* Flush all pending memory write invalidates. */
2300
2301 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2302#if OMPT_SUPPORT
2303 if (ompt_enabled.enabled) {
2304 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2305 }
2306#endif
2307
2308 return TRUE;
2309}
2310
2311#if OMPT_SUPPORT
2312static inline void __kmp_join_restore_state(kmp_info_t *thread,
2313 kmp_team_t *team) {
2314 // restore state outside the region
2315 thread->th.ompt_thread_info.state =
2316 ((team->t.t_serialized) ? ompt_state_work_serial
2317 : ompt_state_work_parallel);
2318}
2319
2320static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2321 kmp_team_t *team, ompt_data_t *parallel_data,
2322 int flags, void *codeptr) {
2323 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2324 if (ompt_enabled.ompt_callback_parallel_end) {
2325 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2326 parallel_data, &(task_info->task_data), flags, codeptr);
2327 }
2328
2329 task_info->frame.enter_frame = ompt_data_none;
2330 __kmp_join_restore_state(thread, team);
2331}
2332#endif
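// For illustration only -- a minimal tool-side callback that the dispatch in
// __kmp_join_ompt() would reach (a sketch; assumes the tool registered it
// with ompt_set_callback(ompt_callback_parallel_end, ...) during
// ompt_start_tool):
//   static void my_parallel_end(ompt_data_t *parallel_data,
//                               ompt_data_t *encountering_task_data,
//                               int flags, const void *codeptr_ra) {
//     // flags carries OMPT_INVOKER(...) | ompt_parallel_team or
//     // ompt_parallel_league, as passed by the callers of __kmp_join_ompt.
//     int is_league = (flags & ompt_parallel_league) != 0;
//     (void)is_league;
//   }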
2333
2334void __kmp_join_call(ident_t *loc, int gtid
2335#if OMPT_SUPPORT
2336 ,
2337 enum fork_context_e fork_context
2338#endif
2339 ,
2340 int exit_teams) {
2341 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2342 kmp_team_t *team;
2343 kmp_team_t *parent_team;
2344 kmp_info_t *master_th;
2345 kmp_root_t *root;
2346 int master_active;
2347
2348 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2349
2350 /* setup current data */
2351 master_th = __kmp_threads[gtid];
2352 root = master_th->th.th_root;
2353 team = master_th->th.th_team;
2354 parent_team = team->t.t_parent;
2355
2356 master_th->th.th_ident = loc;
2357
2358#if OMPT_SUPPORT
2359 void *team_microtask = (void *)team->t.t_pkfn;
2360 // For the GOMP interface with a serialized parallel, we need
2361 // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2362 // end-implicit-task and end-parallel events.
2363 if (ompt_enabled.enabled &&
2364 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2365 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2366 }
2367#endif
2368
2369#if KMP_DEBUG
2370 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2371 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2372 "th_task_team = %p\n",
2373 __kmp_gtid_from_thread(master_th), team,
2374 team->t.t_task_team[master_th->th.th_task_state],
2375 master_th->th.th_task_team));
2376 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2377 team->t.t_task_team[master_th->th.th_task_state]);
2378 }
2379#endif
2380
2381 if (team->t.t_serialized) {
2382 if (master_th->th.th_teams_microtask) {
2383 // We are in teams construct
2384 int level = team->t.t_level;
2385 int tlevel = master_th->th.th_teams_level;
2386 if (level == tlevel) {
2387 // AC: we haven't incremented it earlier, at the start of the teams
2388 // construct, so do it here, at the end of the teams construct
2389 team->t.t_level++;
2390 } else if (level == tlevel + 1) {
2391 // AC: we are exiting parallel inside teams, need to increment
2392 // serialization in order to restore it in the next call to
2393 // __kmpc_end_serialized_parallel
2394 team->t.t_serialized++;
2395 }
2396 }
2398
2399#if OMPT_SUPPORT
2400 if (ompt_enabled.enabled) {
2401 if (fork_context == fork_context_gnu) {
2402 __ompt_lw_taskteam_unlink(master_th);
2403 }
2404 __kmp_join_restore_state(master_th, parent_team);
2405 }
2406#endif
2407
2408 return;
2409 }
2410
2411 master_active = team->t.t_master_active;
2412
2413 if (!exit_teams) {
2414 // AC: No barrier for internal teams at exit from the teams construct,
2415 // but there is a barrier for the external team (league).
2416 __kmp_internal_join(loc, gtid, team);
2417#if USE_ITT_BUILD
2418 if (__itt_stack_caller_create_ptr) {
2419 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2420 // destroy the stack stitching id after join barrier
2421 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2422 team->t.t_stack_id = NULL;
2423 }
2424#endif
2425 } else {
2426 master_th->th.th_task_state =
2427 0; // AC: no tasking in teams (out of any parallel)
2428#if USE_ITT_BUILD
2429 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2430 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2431 // destroy the stack stitching id on exit from the teams construct
2432 // if parent_team is active, then the id will be destroyed later on
2433 // by master of the league of teams
2434 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2435 parent_team->t.t_stack_id = NULL;
2436 }
2437#endif
2438
2439 if (team->t.t_nproc > 1 &&
2440 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2441 team->t.b->update_num_threads(team->t.t_nproc);
2442 __kmp_add_threads_to_team(team, team->t.t_nproc);
2443 }
2444 }
2445
2446 KMP_MB();
2447
2448#if OMPT_SUPPORT
2449 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2450 void *codeptr = team->t.ompt_team_info.master_return_address;
2451#endif
2452
2453#if USE_ITT_BUILD
2454 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2455 if (team->t.t_active_level == 1 &&
2456 (!master_th->th.th_teams_microtask || /* not in teams construct */
2457 master_th->th.th_teams_size.nteams == 1)) {
2458 master_th->th.th_ident = loc;
2459 // only one notification scheme (either "submit" or "forking/joined", not
2460 // both)
2461 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2462 __kmp_forkjoin_frames_mode == 3)
2463 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2464 master_th->th.th_frame_time, 0, loc,
2465 master_th->th.th_team_nproc, 1);
2466 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2467 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2468 __kmp_itt_region_joined(gtid);
2469 } // active_level == 1
2470#endif /* USE_ITT_BUILD */
2471
2472#if KMP_AFFINITY_SUPPORTED
2473 if (!exit_teams) {
2474 // Restore master thread's partition.
2475 master_th->th.th_first_place = team->t.t_first_place;
2476 master_th->th.th_last_place = team->t.t_last_place;
2477 }
2478#endif // KMP_AFFINITY_SUPPORTED
2479
2480 if (master_th->th.th_teams_microtask && !exit_teams &&
2481 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2482 team->t.t_level == master_th->th.th_teams_level + 1) {
2483// AC: We need to leave the team structure intact at the end of a parallel
2484// inside the teams construct, so that the same (hot) team is reused at the
2485// next parallel; only adjust the nesting levels
2486#if OMPT_SUPPORT
2487 ompt_data_t ompt_parallel_data = ompt_data_none;
2488 if (ompt_enabled.enabled) {
2489 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2490 if (ompt_enabled.ompt_callback_implicit_task) {
2491 int ompt_team_size = team->t.t_nproc;
2492 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2493 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2494 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2495 }
2496 task_info->frame.exit_frame = ompt_data_none;
2497 task_info->task_data = ompt_data_none;
2498 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2499 __ompt_lw_taskteam_unlink(master_th);
2500 }
2501#endif
2502 /* Decrement our nested depth level */
2503 team->t.t_level--;
2504 team->t.t_active_level--;
2505 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2506
2507 // Restore number of threads in the team if needed. This code relies on
2508 // the proper adjustment of th_teams_size.nth after the fork in
2509 // __kmp_teams_master on each teams primary thread in the case that
2510 // __kmp_reserve_threads reduced it.
2511 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2512 int old_num = master_th->th.th_team_nproc;
2513 int new_num = master_th->th.th_teams_size.nth;
2514 kmp_info_t **other_threads = team->t.t_threads;
2515 team->t.t_nproc = new_num;
2516 for (int i = 0; i < old_num; ++i) {
2517 other_threads[i]->th.th_team_nproc = new_num;
2518 }
2519 // Adjust the state of the unused threads of the team
2520 for (int i = old_num; i < new_num; ++i) {
2521 // Re-initialize thread's barrier data.
2522 KMP_DEBUG_ASSERT(other_threads[i]);
2523 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2524 for (int b = 0; b < bs_last_barrier; ++b) {
2525 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2526 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2527#if USE_DEBUGGER
2528 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2529#endif
2530 }
2531 if (__kmp_tasking_mode != tskm_immediate_exec) {
2532 // Synchronize thread's task state
2533 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2534 }
2535 }
2536 }
2537
2538#if OMPT_SUPPORT
2539 if (ompt_enabled.enabled) {
2540 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2541 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2542 }
2543#endif
2544
2545 return;
2546 }
2547
2548 /* do cleanup and restore the parent team */
2549 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2550 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2551
2552 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2553
2554 /* jc: The following lock has instructions with REL and ACQ semantics,
2555 separating the parallel user code called in this parallel region
2556 from the serial user code called after this function returns. */
2557 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2558
2559 if (!master_th->th.th_teams_microtask ||
2560 team->t.t_level > master_th->th.th_teams_level) {
2561 /* Decrement our nested depth level */
2562 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2563 }
2564 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2565
2566#if OMPT_SUPPORT
2567 if (ompt_enabled.enabled) {
2568 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2569 if (ompt_enabled.ompt_callback_implicit_task) {
2570 int flags = (team_microtask == (void *)__kmp_teams_master)
2571 ? ompt_task_initial
2572 : ompt_task_implicit;
2573 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2574 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2575 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2576 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2577 }
2578 task_info->frame.exit_frame = ompt_data_none;
2579 task_info->task_data = ompt_data_none;
2580 }
2581#endif
2582
2583 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2584 master_th, team));
2585 __kmp_pop_current_task_from_thread(master_th);
2586
2587 master_th->th.th_def_allocator = team->t.t_def_allocator;
2588
2589#if OMPD_SUPPORT
2590 if (ompd_state & OMPD_ENABLE_BP)
2591 ompd_bp_parallel_end();
2592#endif
2593 updateHWFPControl(team);
2594
2595 if (root->r.r_active != master_active)
2596 root->r.r_active = master_active;
2597
2598 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2599 master_th)); // this will free worker threads
2600
2601 /* This race was fun to find. Make sure the following is in the critical
2602 region, otherwise assertions may occasionally fail, since the old team may
2603 be reallocated and the hierarchy would appear inconsistent. It is actually
2604 safe to run and won't cause any bugs, but it will cause those assertion
2605 failures. It's only one deref & assign, so keep it in the critical region. */
2606 master_th->th.th_team = parent_team;
2607 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2608 master_th->th.th_team_master = parent_team->t.t_threads[0];
2609 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2610
2611 /* restore serialized team, if need be */
2612 if (parent_team->t.t_serialized &&
2613 parent_team != master_th->th.th_serial_team &&
2614 parent_team != root->r.r_root_team) {
2615 __kmp_free_team(root,
2616 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2617 master_th->th.th_serial_team = parent_team;
2618 }
2619
2620 if (__kmp_tasking_mode != tskm_immediate_exec) {
2621 if (master_th->th.th_task_state_top >
2622 0) { // Restore task state from memo stack
2623 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2624 // Remember primary thread's state if we re-use this nested hot team
2625 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2626 master_th->th.th_task_state;
2627 --master_th->th.th_task_state_top; // pop
2628 // Now restore state at this level
2629 master_th->th.th_task_state =
2630 master_th->th
2631 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2632 }
2633 // Copy the task team from the parent team to the primary thread
2634 master_th->th.th_task_team =
2635 parent_team->t.t_task_team[master_th->th.th_task_state];
2636 KA_TRACE(20,
2637 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2638 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2639 parent_team));
2640 }
2641
2642 // TODO: GEH - cannot do this assertion because root thread not set up as
2643 // executing
2644 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2645 master_th->th.th_current_task->td_flags.executing = 1;
2646
2647 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2648
2649#if KMP_AFFINITY_SUPPORTED
2650 if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
2651 __kmp_reset_root_init_mask(gtid);
2652 }
2653#endif
2654#if OMPT_SUPPORT
2655 int flags =
2656 OMPT_INVOKER(fork_context) |
2657 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2658 : ompt_parallel_team);
2659 if (ompt_enabled.enabled) {
2660 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2661 codeptr);
2662 }
2663#endif
2664
2665 KMP_MB();
2666 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2667}
2668
2669/* Check whether we should push an internal control record onto the
2670 serial team stack. If so, do it. */
2671void __kmp_save_internal_controls(kmp_info_t *thread) {
2672
2673 if (thread->th.th_team != thread->th.th_serial_team) {
2674 return;
2675 }
2676 if (thread->th.th_team->t.t_serialized > 1) {
2677 int push = 0;
2678
2679 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2680 push = 1;
2681 } else {
2682 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2683 thread->th.th_team->t.t_serialized) {
2684 push = 1;
2685 }
2686 }
2687 if (push) { /* push a record on the serial team's stack */
2688 kmp_internal_control_t *control =
2689 (kmp_internal_control_t *)__kmp_allocate(
2690 sizeof(kmp_internal_control_t));
2691
2692 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2693
2694 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2695
2696 control->next = thread->th.th_team->t.t_control_stack_top;
2697 thread->th.th_team->t.t_control_stack_top = control;
2698 }
2699 }
2700}
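// For illustration (a behavioral sketch, not additional logic): a record is
// pushed only while the thread runs on its own serial team with
// t_serialized > 1, i.e. inside nested serialized parallel regions, e.g.
//   #pragma omp parallel if(0)
//   #pragma omp parallel if(0)
//   { omp_set_num_threads(4); }   // ICV change pushed here
// so that the enclosing level's ICVs can be restored when the serialized
// region ends.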
2701
2702/* Changes set_nproc */
2703void __kmp_set_num_threads(int new_nth, int gtid) {
2704 kmp_info_t *thread;
2705 kmp_root_t *root;
2706
2707 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2708 KMP_DEBUG_ASSERT(__kmp_init_serial);
2709
2710 if (new_nth < 1)
2711 new_nth = 1;
2712 else if (new_nth > __kmp_max_nth)
2713 new_nth = __kmp_max_nth;
2714
2715 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2716 thread = __kmp_threads[gtid];
2717 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2718 return; // nothing to do
2719
2720 __kmp_save_internal_controls(thread);
2721
2722 set__nproc(thread, new_nth);
2723
2724 // If this omp_set_num_threads() call will cause the hot team size to be
2725 // reduced (in the absence of a num_threads clause), then reduce it now,
2726 // rather than waiting for the next parallel region.
2727 root = thread->th.th_root;
2728 if (__kmp_init_parallel && (!root->r.r_active) &&
2729 (root->r.r_hot_team->t.t_nproc > new_nth)
2730#if KMP_NESTED_HOT_TEAMS
2731 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2732#endif
2733 ) {
2734 kmp_team_t *hot_team = root->r.r_hot_team;
2735 int f;
2736
2737 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2738
2739 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2740 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2741 }
2742 // Release the extra threads we don't need any more.
2743 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2744 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2745 if (__kmp_tasking_mode != tskm_immediate_exec) {
2746 // When decreasing the team size, threads no longer in the team should
2747 // unreference the task team.
2748 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2749 }
2750 __kmp_free_thread(hot_team->t.t_threads[f]);
2751 hot_team->t.t_threads[f] = NULL;
2752 }
2753 hot_team->t.t_nproc = new_nth;
2754#if KMP_NESTED_HOT_TEAMS
2755 if (thread->th.th_hot_teams) {
2756 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2757 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2758 }
2759#endif
2760
2761 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2762 hot_team->t.b->update_num_threads(new_nth);
2763 __kmp_add_threads_to_team(hot_team, new_nth);
2764 }
2765
2766 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2767
2768 // Update the t_nproc field in the threads that are still active.
2769 for (f = 0; f < new_nth; f++) {
2770 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2771 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2772 }
2773 // Special flag to indicate an omp_set_num_threads() call
2774 hot_team->t.t_size_changed = -1;
2775 }
2776}
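// For illustration only (the public entry-point plumbing lives in
// kmp_ftn_entry.h / kmp_csupport.cpp; this is a rough sketch): a user call
//   omp_set_num_threads(4);
// is expected to reach this routine roughly as
//   __kmp_set_num_threads(4, __kmp_entry_gtid());
// and, when the inactive hot team is currently larger than 4, the hot team
// is trimmed immediately rather than at the next parallel region.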
2777
2778/* Changes max_active_levels */
2779void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2780 kmp_info_t *thread;
2781
2782 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2783 "%d = (%d)\n",
2784 gtid, max_active_levels));
2785 KMP_DEBUG_ASSERT(__kmp_init_serial);
2786
2787 // validate max_active_levels
2788 if (max_active_levels < 0) {
2789 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2790 // We ignore this call if the user has specified a negative value.
2791 // The current setting won't be changed. The last valid setting will be
2792 // used. A warning will be issued (if warnings are allowed as controlled by
2793 // the KMP_WARNINGS env var).
2794 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2795 "max_active_levels for thread %d = (%d)\n",
2796 gtid, max_active_levels));
2797 return;
2798 }
2799 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2800 // it's OK: max_active_levels is within the valid range
2801 // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT]
2802 // We allow a zero value. (implementation-defined behavior)
2803 } else {
2804 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2805 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2806 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2807 // The current upper limit is MAX_INT. (implementation-defined behavior)
2808 // If the input exceeds the upper limit, we correct the input to be the
2809 // upper limit. (implementation-defined behavior)
2810 // In practice, control should never get here while the limit is MAX_INT.
2811 }
2812 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2813 "max_active_levels for thread %d = (%d)\n",
2814 gtid, max_active_levels));
2815
2816 thread = __kmp_threads[gtid];
2817
2818 __kmp_save_internal_controls(thread);
2819
2820 set__max_active_levels(thread, max_active_levels);
2821}
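// For illustration (a sketch of the expected mapping): the user-level call
//   omp_set_max_active_levels(1);
// funnels into __kmp_set_max_active_levels(gtid, 1); negative values are
// ignored with a warning and values above KMP_MAX_ACTIVE_LEVELS_LIMIT are
// clamped before the max-active-levels-var ICV is updated.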
2822
2823/* Gets max_active_levels */
2824int __kmp_get_max_active_levels(int gtid) {
2825 kmp_info_t *thread;
2826
2827 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2828 KMP_DEBUG_ASSERT(__kmp_init_serial);
2829
2830 thread = __kmp_threads[gtid];
2831 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2832 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2833 "curtask_maxaclevel=%d\n",
2834 gtid, thread->th.th_current_task,
2835 thread->th.th_current_task->td_icvs.max_active_levels));
2836 return thread->th.th_current_task->td_icvs.max_active_levels;
2837}
2838
2839// nteams-var per-device ICV
2840void __kmp_set_num_teams(int num_teams) {
2841 if (num_teams > 0)
2842 __kmp_nteams = num_teams;
2843}
2844int __kmp_get_max_teams(void) { return __kmp_nteams; }
2845// teams-thread-limit-var per-device ICV
2846void __kmp_set_teams_thread_limit(int limit) {
2847 if (limit > 0)
2848 __kmp_teams_thread_limit = limit;
2849}
2850int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
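// For illustration (a hedged sketch of the expected OpenMP 5.1 mapping): the
// host routines omp_set_num_teams()/omp_get_max_teams() and
// omp_set_teams_thread_limit()/omp_get_teams_thread_limit() land on the four
// helpers above, e.g.
//   omp_set_num_teams(8);          // -> __kmp_set_num_teams(8)
//   int n = omp_get_max_teams();   // -> __kmp_get_max_teams()
// with non-positive arguments silently ignored, as the guards above show.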
2851
2852KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2853KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2854
2855/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2856void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2857 kmp_info_t *thread;
2858 kmp_sched_t orig_kind;
2859 // kmp_team_t *team;
2860
2861 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2862 gtid, (int)kind, chunk));
2863 KMP_DEBUG_ASSERT(__kmp_init_serial);
2864
2865 // Check if the kind parameter is valid, correct if needed.
2866 // Valid parameters should fit in one of two intervals - standard or extended:
2867 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2868 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2869 orig_kind = kind;
2870 kind = __kmp_sched_without_mods(kind);
2871
2872 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2873 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2874 // TODO: Hint needs attention in case we change the default schedule.
2875 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2876 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2877 __kmp_msg_null);
2878 kind = kmp_sched_default;
2879 chunk = 0; // ignore chunk value in case of bad kind
2880 }
2881
2882 thread = __kmp_threads[gtid];
2883
2884 __kmp_save_internal_controls(thread);
2885
2886 if (kind < kmp_sched_upper_std) {
2887 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2888 // differentiate static chunked vs. unchunked: the chunk should be invalid
2889 // to indicate an unchunked schedule (which is the default)
2890 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2891 } else {
2892 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2893 __kmp_sch_map[kind - kmp_sched_lower - 1];
2894 }
2895 } else {
2896 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2897 // kmp_sched_lower - 2 ];
2898 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2899 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2900 kmp_sched_lower - 2];
2901 }
2902 __kmp_sched_apply_mods_intkind(
2903 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2904 if (kind == kmp_sched_auto || chunk < 1) {
2905 // ignore parameter chunk for schedule auto
2906 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2907 } else {
2908 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2909 }
2910}
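// For illustration (a sketch; the public entry point is defined elsewhere in
// the library): a call such as
//   omp_set_schedule(omp_sched_dynamic, 4);
// is expected to arrive here as __kmp_set_schedule(gtid, kmp_sched_dynamic, 4)
// and set run-sched-var to dynamic with chunk 4, whereas
//   omp_set_schedule(omp_sched_auto, 100);
// stores KMP_DEFAULT_CHUNK instead of the chunk argument, as coded above.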
2911
2912/* Gets def_sched_var ICV values */
2913void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2914 kmp_info_t *thread;
2915 enum sched_type th_type;
2916
2917 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2918 KMP_DEBUG_ASSERT(__kmp_init_serial);
2919
2920 thread = __kmp_threads[gtid];
2921
2922 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2923 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2924 case kmp_sch_static:
2925 case kmp_sch_static_greedy:
2926 case kmp_sch_static_balanced:
2927 *kind = kmp_sched_static;
2928 __kmp_sched_apply_mods_stdkind(kind, th_type);
2929 *chunk = 0; // chunk was not set; indicate this with a zero value
2930 return;
2931 case kmp_sch_static_chunked:
2932 *kind = kmp_sched_static;
2933 break;
2934 case kmp_sch_dynamic_chunked:
2935 *kind = kmp_sched_dynamic;
2936 break;
2938 case kmp_sch_guided_iterative_chunked:
2939 case kmp_sch_guided_analytical_chunked:
2940 *kind = kmp_sched_guided;
2941 break;
2942 case kmp_sch_auto:
2943 *kind = kmp_sched_auto;
2944 break;
2945 case kmp_sch_trapezoidal:
2946 *kind = kmp_sched_trapezoidal;
2947 break;
2948#if KMP_STATIC_STEAL_ENABLED
2949 case kmp_sch_static_steal:
2950 *kind = kmp_sched_static_steal;
2951 break;
2952#endif
2953 default:
2954 KMP_FATAL(UnknownSchedulingType, th_type);
2955 }
2956
2957 __kmp_sched_apply_mods_stdkind(kind, th_type);
2958 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2959}
2960
2961int __kmp_get_ancestor_thread_num(int gtid, int level) {
2962
2963 int ii, dd;
2964 kmp_team_t *team;
2965 kmp_info_t *thr;
2966
2967 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2968 KMP_DEBUG_ASSERT(__kmp_init_serial);
2969
2970 // validate level
2971 if (level == 0)
2972 return 0;
2973 if (level < 0)
2974 return -1;
2975 thr = __kmp_threads[gtid];
2976 team = thr->th.th_team;
2977 ii = team->t.t_level;
2978 if (level > ii)
2979 return -1;
2980
2981 if (thr->th.th_teams_microtask) {
2982 // AC: we are in a teams region where multiple nested teams have the same level
2983 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2984 if (level <=
2985 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2986 KMP_DEBUG_ASSERT(ii >= tlevel);
2987 // AC: As we need to pass by the teams league, we need to artificially
2988 // increase ii
2989 if (ii == tlevel) {
2990 ii += 2; // three teams have same level
2991 } else {
2992 ii++; // two teams have same level
2993 }
2994 }
2995 }
2996
2997 if (ii == level)
2998 return __kmp_tid_from_gtid(gtid);
2999
3000 dd = team->t.t_serialized;
3001 level++;
3002 while (ii > level) {
3003 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3004 }
3005 if ((team->t.t_serialized) && (!dd)) {
3006 team = team->t.t_parent;
3007 continue;
3008 }
3009 if (ii > level) {
3010 team = team->t.t_parent;
3011 dd = team->t.t_serialized;
3012 ii--;
3013 }
3014 }
3015
3016 return (dd > 1) ? (0) : (team->t.t_master_tid);
3017}
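// For illustration, a usage sketch of the user-facing query that maps onto
// this routine:
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(3)
//   {
//     int outer = omp_get_ancestor_thread_num(1); // tid within level-1 team
//     int inner = omp_get_ancestor_thread_num(2); // == omp_get_thread_num()
//   }
// Level 0 always yields 0 (the initial thread); negative or too-deep levels
// yield -1, matching the validation at the top of this function.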
3018
3019int __kmp_get_team_size(int gtid, int level) {
3020
3021 int ii, dd;
3022 kmp_team_t *team;
3023 kmp_info_t *thr;
3024
3025 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3026 KMP_DEBUG_ASSERT(__kmp_init_serial);
3027
3028 // validate level
3029 if (level == 0)
3030 return 1;
3031 if (level < 0)
3032 return -1;
3033 thr = __kmp_threads[gtid];
3034 team = thr->th.th_team;
3035 ii = team->t.t_level;
3036 if (level > ii)
3037 return -1;
3038
3039 if (thr->th.th_teams_microtask) {
3040 // AC: we are in a teams region where multiple nested teams have the same level
3041 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3042 if (level <=
3043 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3044 KMP_DEBUG_ASSERT(ii >= tlevel);
3045 // AC: As we need to pass by the teams league, we need to artificially
3046 // increase ii
3047 if (ii == tlevel) {
3048 ii += 2; // three teams have same level
3049 } else {
3050 ii++; // two teams have same level
3051 }
3052 }
3053 }
3054
3055 while (ii > level) {
3056 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3057 }
3058 if (team->t.t_serialized && (!dd)) {
3059 team = team->t.t_parent;
3060 continue;
3061 }
3062 if (ii > level) {
3063 team = team->t.t_parent;
3064 ii--;
3065 }
3066 }
3067
3068 return team->t.t_nproc;
3069}
3070
3071kmp_r_sched_t __kmp_get_schedule_global() {
3072 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3073 // (__kmp_static, __kmp_guided) may be changed independently by
3074 // kmp_set_defaults, so one can get the updated schedule here.
3075
3076 kmp_r_sched_t r_sched;
3077
3078 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3079 // __kmp_guided. __kmp_sched should keep its original value, so that the user
3080 // can set KMP_SCHEDULE multiple times and thus have different run-time
3081 // schedules in different roots (even in OMP 2.5)
3082 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3083 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3084 if (s == kmp_sch_static) {
3085 // replace STATIC with more detailed schedule (balanced or greedy)
3086 r_sched.r_sched_type = __kmp_static;
3087 } else if (s == kmp_sch_guided_chunked) {
3088 // replace GUIDED with more detailed schedule (iterative or analytical)
3089 r_sched.r_sched_type = __kmp_guided;
3090 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3091 r_sched.r_sched_type = __kmp_sched;
3092 }
3093 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3094
3095 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3096 // __kmp_chunk may be wrong here (if it was never set)
3097 r_sched.chunk = KMP_DEFAULT_CHUNK;
3098 } else {
3099 r_sched.chunk = __kmp_chunk;
3100 }
3101
3102 return r_sched;
3103}
3104
3105 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3106 at least argc *t_argv entries for the requested team. */
3107static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3108
3109 KMP_DEBUG_ASSERT(team);
3110 if (!realloc || argc > team->t.t_max_argc) {
3111
3112 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3113 "current entries=%d\n",
3114 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3115 /* if previously allocated heap space for args, free them */
3116 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3117 __kmp_free((void *)team->t.t_argv);
3118
3119 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3120 /* use unused space in the cache line for arguments */
3121 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3122 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3123 "argv entries\n",
3124 team->t.t_id, team->t.t_max_argc));
3125 team->t.t_argv = &team->t.t_inline_argv[0];
3126 if (__kmp_storage_map) {
3127 __kmp_print_storage_map_gtid(
3128 -1, &team->t.t_inline_argv[0],
3129 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3130 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3131 team->t.t_id);
3132 }
3133 } else {
3134 /* allocate space for arguments in the heap */
3135 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3136 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3137 : 2 * argc;
3138 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3139 "argv entries\n",
3140 team->t.t_id, team->t.t_max_argc));
3141 team->t.t_argv =
3142 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3143 if (__kmp_storage_map) {
3144 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3145 &team->t.t_argv[team->t.t_max_argc],
3146 sizeof(void *) * team->t.t_max_argc,
3147 "team_%d.t_argv", team->t.t_id);
3148 }
3149 }
3150 }
3151}
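// A minimal standalone sketch of the sizing policy above (constants assumed,
// not the real KMP_* values): small argument lists reuse the inline buffer,
// larger ones get a heap block of at least the minimum malloc size, or
// 2 * argc when that is bigger, so growth does not trigger a reallocation for
// every extra argument.
//
//   #include <cstdio>
//
//   enum { INLINE_ENTRIES = 4, MIN_MALLOC_ENTRIES = 100 }; // assumed values
//
//   static int argv_capacity(int argc) {
//     if (argc <= INLINE_ENTRIES)
//       return INLINE_ENTRIES; // keep using the inline cache-line storage
//     return (argc <= (MIN_MALLOC_ENTRIES >> 1)) ? MIN_MALLOC_ENTRIES
//                                                : 2 * argc; // heap sizing
//   }
//
//   int main() {
//     int tests[] = {2, 8, 60, 200};
//     for (int argc : tests)
//       std::printf("argc=%d -> capacity=%d\n", argc, argv_capacity(argc));
//   }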
3152
3153static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3154 int i;
3155 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3156 team->t.t_threads =
3157 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3158 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3159 sizeof(dispatch_shared_info_t) * num_disp_buff);
3160 team->t.t_dispatch =
3161 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3162 team->t.t_implicit_task_taskdata =
3163 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3164 team->t.t_max_nproc = max_nth;
3165
3166 /* setup dispatch buffers */
3167 for (i = 0; i < num_disp_buff; ++i) {
3168 team->t.t_disp_buffer[i].buffer_index = i;
3169 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3170 }
3171}
3172
3173static void __kmp_free_team_arrays(kmp_team_t *team) {
3174 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3175 int i;
3176 for (i = 0; i < team->t.t_max_nproc; ++i) {
3177 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3178 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3179 team->t.t_dispatch[i].th_disp_buffer = NULL;
3180 }
3181 }
3182#if KMP_USE_HIER_SCHED
3183 __kmp_dispatch_free_hierarchies(team);
3184#endif
3185 __kmp_free(team->t.t_threads);
3186 __kmp_free(team->t.t_disp_buffer);
3187 __kmp_free(team->t.t_dispatch);
3188 __kmp_free(team->t.t_implicit_task_taskdata);
3189 team->t.t_threads = NULL;
3190 team->t.t_disp_buffer = NULL;
3191 team->t.t_dispatch = NULL;
3192 team->t.t_implicit_task_taskdata = 0;
3193}
3194
3195static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3196 kmp_info_t **oldThreads = team->t.t_threads;
3197
3198 __kmp_free(team->t.t_disp_buffer);
3199 __kmp_free(team->t.t_dispatch);
3200 __kmp_free(team->t.t_implicit_task_taskdata);
3201 __kmp_allocate_team_arrays(team, max_nth);
3202
3203 KMP_MEMCPY(team->t.t_threads, oldThreads,
3204 team->t.t_nproc * sizeof(kmp_info_t *));
3205
3206 __kmp_free(oldThreads);
3207}
3208
3209static kmp_internal_control_t __kmp_get_global_icvs(void) {
3210
3211 kmp_r_sched_t r_sched =
3212 __kmp_get_schedule_global(); // get current state of scheduling globals
3213
3214 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3215
3216 kmp_internal_control_t g_icvs = {
3217 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3218 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3219 // adjustment of threads (per thread)
3220 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3221 // whether blocktime is explicitly set
3222 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3223#if KMP_USE_MONITOR
3224 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3225// intervals
3226#endif
3227 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3228 // next parallel region (per thread)
3229 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3230 __kmp_cg_max_nth, // int thread_limit;
3231 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3232 // for max_active_levels
3233 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3234 // {sched,chunk} pair
3235 __kmp_nested_proc_bind.bind_types[0],
3236 __kmp_default_device,
3237 NULL // struct kmp_internal_control *next;
3238 };
3239
3240 return g_icvs;
3241}
3242
3243static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3244
3245 kmp_internal_control_t gx_icvs;
3246 gx_icvs.serial_nesting_level =
3247 0; // probably =team->t.t_serial like in save_inter_controls
3248 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3249 gx_icvs.next = NULL;
3250
3251 return gx_icvs;
3252}
3253
3254static void __kmp_initialize_root(kmp_root_t *root) {
3255 int f;
3256 kmp_team_t *root_team;
3257 kmp_team_t *hot_team;
3258 int hot_team_max_nth;
3259 kmp_r_sched_t r_sched =
3260 __kmp_get_schedule_global(); // get current state of scheduling globals
3261 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3262 KMP_DEBUG_ASSERT(root);
3263 KMP_ASSERT(!root->r.r_begin);
3264
3265 /* setup the root state structure */
3266 __kmp_init_lock(&root->r.r_begin_lock);
3267 root->r.r_begin = FALSE;
3268 root->r.r_active = FALSE;
3269 root->r.r_in_parallel = 0;
3270 root->r.r_blocktime = __kmp_dflt_blocktime;
3271#if KMP_AFFINITY_SUPPORTED
3272 root->r.r_affinity_assigned = FALSE;
3273#endif
3274
3275 /* setup the root team for this task */
3276 /* allocate the root team structure */
3277 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3278
3279 root_team =
3280 __kmp_allocate_team(root,
3281 1, // new_nproc
3282 1, // max_nproc
3283#if OMPT_SUPPORT
3284 ompt_data_none, // root parallel id
3285#endif
3286 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3287 0 // argc
3288 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3289 );
3290#if USE_DEBUGGER
3291 // Non-NULL value should be assigned to make the debugger display the root
3292 // team.
3293 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3294#endif
3295
3296 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3297
3298 root->r.r_root_team = root_team;
3299 root_team->t.t_control_stack_top = NULL;
3300
3301 /* initialize root team */
3302 root_team->t.t_threads[0] = NULL;
3303 root_team->t.t_nproc = 1;
3304 root_team->t.t_serialized = 1;
3305 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3306 root_team->t.t_sched.sched = r_sched.sched;
3307 KA_TRACE(
3308 20,
3309 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3310 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3311
3312 /* setup the hot team for this task */
3313 /* allocate the hot team structure */
3314 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3315
3316 hot_team =
3317 __kmp_allocate_team(root,
3318 1, // new_nproc
3319 __kmp_dflt_team_nth_ub * 2, // max_nproc
3320#if OMPT_SUPPORT
3321 ompt_data_none, // root parallel id
3322#endif
3323 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3324 0 // argc
3325 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3326 );
3327 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3328
3329 root->r.r_hot_team = hot_team;
3330 root_team->t.t_control_stack_top = NULL;
3331
3332 /* first-time initialization */
3333 hot_team->t.t_parent = root_team;
3334
3335 /* initialize hot team */
3336 hot_team_max_nth = hot_team->t.t_max_nproc;
3337 for (f = 0; f < hot_team_max_nth; ++f) {
3338 hot_team->t.t_threads[f] = NULL;
3339 }
3340 hot_team->t.t_nproc = 1;
3341 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3342 hot_team->t.t_sched.sched = r_sched.sched;
3343 hot_team->t.t_size_changed = 0;
3344}
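// A simplified, hypothetical model of what __kmp_initialize_root establishes:
// each root owns a serialized root team of one thread plus a pre-sized hot
// team that is reused, rather than re-created, by later parallel regions.
// The field names below are illustrative only, not the real kmp_root_t /
// kmp_team_t layout.
//
//   struct toy_team { int nproc; int max_nproc; int serialized; };
//   struct toy_root { toy_team root_team; toy_team hot_team; };
//
//   static void toy_init_root(toy_root *r, int dflt_team_nth_ub) {
//     r->root_team = {/*nproc=*/1, /*max_nproc=*/1, /*serialized=*/1};
//     // The hot team starts with one thread but reserves extra slots so that
//     // later parallel regions can grow it in place.
//     r->hot_team = {/*nproc=*/1, /*max_nproc=*/dflt_team_nth_ub * 2,
//                    /*serialized=*/0};
//   }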
3345
3346#ifdef KMP_DEBUG
3347
3348typedef struct kmp_team_list_item {
3349 kmp_team_p const *entry;
3350 struct kmp_team_list_item *next;
3351} kmp_team_list_item_t;
3352typedef kmp_team_list_item_t *kmp_team_list_t;
3353
3354static void __kmp_print_structure_team_accum( // Add team to list of teams.
3355 kmp_team_list_t list, // List of teams.
3356 kmp_team_p const *team // Team to add.
3357) {
3358
3359 // List must terminate with item where both entry and next are NULL.
3360 // Team is added to the list only once.
3361 // List is sorted in ascending order by team id.
3362 // Team id is *not* a key.
3363
3364 kmp_team_list_t l;
3365
3366 KMP_DEBUG_ASSERT(list != NULL);
3367 if (team == NULL) {
3368 return;
3369 }
3370
3371 __kmp_print_structure_team_accum(list, team->t.t_parent);
3372 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3373
3374 // Search list for the team.
3375 l = list;
3376 while (l->next != NULL && l->entry != team) {
3377 l = l->next;
3378 }
3379 if (l->next != NULL) {
3380 return; // Team has been added before, exit.
3381 }
3382
3383 // Team is not found. Search list again for insertion point.
3384 l = list;
3385 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3386 l = l->next;
3387 }
3388
3389 // Insert team.
3390 {
3391 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3392 sizeof(kmp_team_list_item_t));
3393 *item = *l;
3394 l->entry = team;
3395 l->next = item;
3396 }
3397}
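// The insertion above relies on a sentinel-tail trick: rather than tracking a
// "previous" pointer, it copies the current node into the freshly allocated
// one and then rewrites the current node in place, which inserts the new
// entry before it. A standalone sketch of the same idea with plain ints:
//
//   #include <cstdlib>
//
//   // The list ends with a sentinel node (next == NULL), like the team list.
//   struct node { int value; node *next; };
//
//   // 'pos' is the first node whose value exceeds the new one, or the
//   // sentinel itself. Insert before it without a previous pointer.
//   static void insert_before(node *pos, int value) {
//     node *copy = (node *)std::malloc(sizeof(node));
//     *copy = *pos;       // displace the current node's contents
//     pos->value = value; // rewrite the current node with the new payload
//     pos->next = copy;   // and link it to the displaced copy
//   }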
3398
3399static void __kmp_print_structure_team(char const *title,
3400                                       kmp_team_p const *team) {
3402 __kmp_printf("%s", title);
3403 if (team != NULL) {
3404 __kmp_printf("%2x %p\n", team->t.t_id, team);
3405 } else {
3406 __kmp_printf(" - (nil)\n");
3407 }
3408}
3409
3410static void __kmp_print_structure_thread(char const *title,
3411 kmp_info_p const *thread) {
3412 __kmp_printf("%s", title);
3413 if (thread != NULL) {
3414 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3415 } else {
3416 __kmp_printf(" - (nil)\n");
3417 }
3418}
3419
3420void __kmp_print_structure(void) {
3421
3422 kmp_team_list_t list;
3423
3424 // Initialize list of teams.
3425 list =
3426 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3427 list->entry = NULL;
3428 list->next = NULL;
3429
3430 __kmp_printf("\n------------------------------\nGlobal Thread "
3431 "Table\n------------------------------\n");
3432 {
3433 int gtid;
3434 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3435 __kmp_printf("%2d", gtid);
3436 if (__kmp_threads != NULL) {
3437 __kmp_printf(" %p", __kmp_threads[gtid]);
3438 }
3439 if (__kmp_root != NULL) {
3440 __kmp_printf(" %p", __kmp_root[gtid]);
3441 }
3442 __kmp_printf("\n");
3443 }
3444 }
3445
3446 // Print out __kmp_threads array.
3447 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3448 "----------\n");
3449 if (__kmp_threads != NULL) {
3450 int gtid;
3451 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3452 kmp_info_t const *thread = __kmp_threads[gtid];
3453 if (thread != NULL) {
3454 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3455 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3456 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3457 __kmp_print_structure_team(" Serial Team: ",
3458 thread->th.th_serial_team);
3459 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3460 __kmp_print_structure_thread(" Primary: ",
3461 thread->th.th_team_master);
3462 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3463 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3464 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3465 __kmp_print_structure_thread(" Next in pool: ",
3466 thread->th.th_next_pool);
3467 __kmp_printf("\n");
3468 __kmp_print_structure_team_accum(list, thread->th.th_team);
3469 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3470 }
3471 }
3472 } else {
3473 __kmp_printf("Threads array is not allocated.\n");
3474 }
3475
3476 // Print out __kmp_root array.
3477 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3478 "--------\n");
3479 if (__kmp_root != NULL) {
3480 int gtid;
3481 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3482 kmp_root_t const *root = __kmp_root[gtid];
3483 if (root != NULL) {
3484 __kmp_printf("GTID %2d %p:\n", gtid, root);
3485 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3486 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3487 __kmp_print_structure_thread(" Uber Thread: ",
3488 root->r.r_uber_thread);
3489 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3490 __kmp_printf(" In Parallel: %2d\n",
3491 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3492 __kmp_printf("\n");
3493 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3494 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3495 }
3496 }
3497 } else {
3498 __kmp_printf("Ubers array is not allocated.\n");
3499 }
3500
3501 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3502 "--------\n");
3503 while (list->next != NULL) {
3504 kmp_team_p const *team = list->entry;
3505 int i;
3506 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3507 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3508 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3509 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3510 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3511 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3512 for (i = 0; i < team->t.t_nproc; ++i) {
3513 __kmp_printf(" Thread %2d: ", i);
3514 __kmp_print_structure_thread("", team->t.t_threads[i]);
3515 }
3516 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3517 __kmp_printf("\n");
3518 list = list->next;
3519 }
3520
3521 // Print out __kmp_thread_pool and __kmp_team_pool.
3522 __kmp_printf("\n------------------------------\nPools\n----------------------"
3523 "--------\n");
3524 __kmp_print_structure_thread("Thread pool: ",
3525 CCAST(kmp_info_t *, __kmp_thread_pool));
3526 __kmp_print_structure_team("Team pool: ",
3527 CCAST(kmp_team_t *, __kmp_team_pool));
3528 __kmp_printf("\n");
3529
3530 // Free team list.
3531 while (list != NULL) {
3532 kmp_team_list_item_t *item = list;
3533 list = list->next;
3534 KMP_INTERNAL_FREE(item);
3535 }
3536}
3537
3538#endif
3539
3540//---------------------------------------------------------------------------
3541// Stuff for per-thread fast random number generator
3542// Table of primes
3543static const unsigned __kmp_primes[] = {
3544 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3545 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3546 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3547 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3548 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3549 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3550 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3551 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3552 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3553 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3554 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3555
3556//---------------------------------------------------------------------------
3557// __kmp_get_random: Get a random number using a linear congruential method.
3558unsigned short __kmp_get_random(kmp_info_t *thread) {
3559 unsigned x = thread->th.th_x;
3560 unsigned short r = (unsigned short)(x >> 16);
3561
3562 thread->th.th_x = x * thread->th.th_a + 1;
3563
3564 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3565 thread->th.th_info.ds.ds_tid, r));
3566
3567 return r;
3568}
3569//--------------------------------------------------------
3570// __kmp_init_random: Initialize a random number generator
3571void __kmp_init_random(kmp_info_t *thread) {
3572 unsigned seed = thread->th.th_info.ds.ds_tid;
3573
3574 thread->th.th_a =
3575 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3576 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3577 KA_TRACE(30,
3578 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3579}
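// A self-contained sketch of the per-thread generator above: the multiplier
// is picked from a small prime table keyed by the thread id, the state
// follows the linear congruential recurrence x = a * x + 1 (mod 2^32 via
// unsigned wraparound), and only the high 16 bits are returned, since the low
// bits of an LCG are the weakest.
//
//   #include <cstdio>
//
//   struct toy_rng { unsigned a, x; };
//
//   static const unsigned toy_primes[] = {0x9e3779b1u, 0xffe6cc59u,
//                                         0x2109f6ddu};
//
//   static void toy_rng_init(toy_rng *r, unsigned tid) {
//     r->a = toy_primes[tid % (sizeof(toy_primes) / sizeof(toy_primes[0]))];
//     r->x = (tid + 1) * r->a + 1;
//   }
//
//   static unsigned short toy_rng_next(toy_rng *r) {
//     unsigned short out = (unsigned short)(r->x >> 16); // high bits only
//     r->x = r->x * r->a + 1; // unsigned overflow acts as mod 2^32
//     return out;
//   }
//
//   int main() {
//     toy_rng r;
//     toy_rng_init(&r, 0);
//     for (int i = 0; i < 4; ++i)
//       std::printf("%u\n", (unsigned)toy_rng_next(&r));
//   }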
3580
3581#if KMP_OS_WINDOWS
3582/* reclaim array entries for root threads that are already dead, returns number
3583 * reclaimed */
3584static int __kmp_reclaim_dead_roots(void) {
3585 int i, r = 0;
3586
3587 for (i = 0; i < __kmp_threads_capacity; ++i) {
3588 if (KMP_UBER_GTID(i) &&
3589 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3590 !__kmp_root[i]
3591 ->r.r_active) { // AC: reclaim only roots died in non-active state
3592 r += __kmp_unregister_root_other_thread(i);
3593 }
3594 }
3595 return r;
3596}
3597#endif
3598
3599/* This function attempts to create free entries in __kmp_threads and
3600 __kmp_root, and returns the number of free entries generated.
3601
3602 For Windows* OS static library, the first mechanism used is to reclaim array
3603 entries for root threads that are already dead.
3604
3605   On all platforms, expansion is attempted on the arrays __kmp_threads and
3606 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3607 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3608 threadprivate cache array has been created. Synchronization with
3609 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3610
3611 After any dead root reclamation, if the clipping value allows array expansion
3612 to result in the generation of a total of nNeed free slots, the function does
3613 that expansion. If not, nothing is done beyond the possible initial root
3614 thread reclamation.
3615
3616   If nNeed is negative, the behavior is undefined. */
3617static int __kmp_expand_threads(int nNeed) {
3618 int added = 0;
3619 int minimumRequiredCapacity;
3620 int newCapacity;
3621 kmp_info_t **newThreads;
3622 kmp_root_t **newRoot;
3623
3624 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3625 // resizing __kmp_threads does not need additional protection if foreign
3626 // threads are present
3627
3628#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3629 /* only for Windows static library */
3630 /* reclaim array entries for root threads that are already dead */
3631 added = __kmp_reclaim_dead_roots();
3632
3633 if (nNeed) {
3634 nNeed -= added;
3635 if (nNeed < 0)
3636 nNeed = 0;
3637 }
3638#endif
3639 if (nNeed <= 0)
3640 return added;
3641
3642 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3643 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3644 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3645 // > __kmp_max_nth in one of two ways:
3646 //
3647 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3648 // may not be reused by another thread, so we may need to increase
3649 // __kmp_threads_capacity to __kmp_max_nth + 1.
3650 //
3651 // 2) New foreign root(s) are encountered. We always register new foreign
3652 // roots. This may cause a smaller # of threads to be allocated at
3653 // subsequent parallel regions, but the worker threads hang around (and
3654 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3655 //
3656 // Anyway, that is the reason for moving the check to see if
3657 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3658 // instead of having it performed here. -BB
3659
3660 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3661
3662 /* compute expansion headroom to check if we can expand */
3663 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3664 /* possible expansion too small -- give up */
3665 return added;
3666 }
3667 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3668
3669 newCapacity = __kmp_threads_capacity;
3670 do {
3671 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3672 : __kmp_sys_max_nth;
3673 } while (newCapacity < minimumRequiredCapacity);
3674 newThreads = (kmp_info_t **)__kmp_allocate(
3675 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3676 newRoot =
3677 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3678 KMP_MEMCPY(newThreads, __kmp_threads,
3679 __kmp_threads_capacity * sizeof(kmp_info_t *));
3680 KMP_MEMCPY(newRoot, __kmp_root,
3681 __kmp_threads_capacity * sizeof(kmp_root_t *));
3682 // Put old __kmp_threads array on a list. Any ongoing references to the old
3683 // list will be valid. This list is cleaned up at library shutdown.
3684 kmp_old_threads_list_t *node =
3685 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3686 node->threads = __kmp_threads;
3687 node->next = __kmp_old_threads_list;
3688 __kmp_old_threads_list = node;
3689
3690 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3691 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3692 added += newCapacity - __kmp_threads_capacity;
3693 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3694
3695 if (newCapacity > __kmp_tp_capacity) {
3696 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3697 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3698 __kmp_threadprivate_resize_cache(newCapacity);
3699 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3700 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3701 }
3702 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3703 }
3704
3705 return added;
3706}
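// A standalone sketch (limits assumed by the caller) of the capacity
// computation used above: keep doubling, but clip to the system-wide maximum
// so the loop terminates even when the requested headroom is close to that
// maximum. __kmp_expand_threads has already verified the request fits before
// it reaches this computation.
//
//   static int grow_capacity(int current, int needed, int sys_max) {
//     int required = current + needed; // caller ensures required <= sys_max
//     int cap = current;
//     do {
//       cap = cap <= (sys_max >> 1) ? (cap << 1) : sys_max;
//     } while (cap < required);
//     return cap;
//   }
//
//   // e.g. grow_capacity(64, 70, 32768) == 256, while
//   //      grow_capacity(20000, 10000, 32768) == 32768 (clipped, not doubled).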
3707
3708/* Register the current thread as a root thread and obtain our gtid. We must
3709   have the __kmp_initz_lock held at this point. The argument is TRUE only if
3710   this is the thread calling from __kmp_do_serial_initialize(). */
3711int __kmp_register_root(int initial_thread) {
3712 kmp_info_t *root_thread;
3713 kmp_root_t *root;
3714 int gtid;
3715 int capacity;
3716 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3717 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3718 KMP_MB();
3719
3720 /* 2007-03-02:
3721     If the initial thread did not invoke the OpenMP RTL yet, and this thread
3722     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3723     condition does not work as expected -- it may return false (meaning there
3724     is at least one empty slot in the __kmp_threads array), but it is possible
3725     that the only free slot is #0, which is reserved for the initial thread
3726     and so cannot be used for this one. The following code works around this bug.
3727
3728     However, the right solution seems to be not reserving slot #0 for the
3729     initial thread because:
3730     (1) there is no magic in slot #0,
3731     (2) we cannot detect the initial thread reliably (the first thread that
3732     does serial initialization may not be the real initial thread).
3733 */
3734 capacity = __kmp_threads_capacity;
3735 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3736 --capacity;
3737 }
3738
3739 // If it is not for initializing the hidden helper team, we need to take
3740 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3741 // in __kmp_threads_capacity.
3742 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3743 capacity -= __kmp_hidden_helper_threads_num;
3744 }
3745
3746 /* see if there are too many threads */
3747 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3748 if (__kmp_tp_cached) {
3749 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3750 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3751 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3752 } else {
3753 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3754 __kmp_msg_null);
3755 }
3756 }
3757
3758 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3759 // 0: initial thread, also a regular OpenMP thread.
3760 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3761 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3762 // regular OpenMP threads.
3763 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3764 // Find an available thread slot for hidden helper thread. Slots for hidden
3765 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3766 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3767 gtid <= __kmp_hidden_helper_threads_num;
3768 gtid++)
3769 ;
3770 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3771 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3772 "hidden helper thread: T#%d\n",
3773 gtid));
3774 } else {
3775 /* find an available thread slot */
3776 // Don't reassign the zero slot since we need that to only be used by
3777 // initial thread. Slots for hidden helper threads should also be skipped.
3778 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3779 gtid = 0;
3780 } else {
3781 for (gtid = __kmp_hidden_helper_threads_num + 1;
3782 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3783 ;
3784 }
3785 KA_TRACE(
3786 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3787 KMP_ASSERT(gtid < __kmp_threads_capacity);
3788 }
3789
3790 /* update global accounting */
3791 __kmp_all_nth++;
3792 TCW_4(__kmp_nth, __kmp_nth + 1);
3793
3794 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3795 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3796 if (__kmp_adjust_gtid_mode) {
3797 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3798 if (TCR_4(__kmp_gtid_mode) != 2) {
3799 TCW_4(__kmp_gtid_mode, 2);
3800 }
3801 } else {
3802 if (TCR_4(__kmp_gtid_mode) != 1) {
3803 TCW_4(__kmp_gtid_mode, 1);
3804 }
3805 }
3806 }
3807
3808#ifdef KMP_ADJUST_BLOCKTIME
3809 /* Adjust blocktime to zero if necessary */
3810 /* Middle initialization might not have occurred yet */
3811 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3812 if (__kmp_nth > __kmp_avail_proc) {
3813 __kmp_zero_bt = TRUE;
3814 }
3815 }
3816#endif /* KMP_ADJUST_BLOCKTIME */
3817
3818 /* setup this new hierarchy */
3819 if (!(root = __kmp_root[gtid])) {
3820 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3821 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3822 }
3823
3824#if KMP_STATS_ENABLED
3825 // Initialize stats as soon as possible (right after gtid assignment).
3826 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3827 __kmp_stats_thread_ptr->startLife();
3828 KMP_SET_THREAD_STATE(SERIAL_REGION);
3829 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3830#endif
3831 __kmp_initialize_root(root);
3832
3833 /* setup new root thread structure */
3834 if (root->r.r_uber_thread) {
3835 root_thread = root->r.r_uber_thread;
3836 } else {
3837 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3838 if (__kmp_storage_map) {
3839 __kmp_print_thread_storage_map(root_thread, gtid);
3840 }
3841 root_thread->th.th_info.ds.ds_gtid = gtid;
3842#if OMPT_SUPPORT
3843 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3844#endif
3845 root_thread->th.th_root = root;
3846 if (__kmp_env_consistency_check) {
3847 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3848 }
3849#if USE_FAST_MEMORY
3850 __kmp_initialize_fast_memory(root_thread);
3851#endif /* USE_FAST_MEMORY */
3852
3853#if KMP_USE_BGET
3854 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3855 __kmp_initialize_bget(root_thread);
3856#endif
3857 __kmp_init_random(root_thread); // Initialize random number generator
3858 }
3859
3860 /* setup the serial team held in reserve by the root thread */
3861 if (!root_thread->th.th_serial_team) {
3862 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3863 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3864 root_thread->th.th_serial_team = __kmp_allocate_team(
3865 root, 1, 1,
3866#if OMPT_SUPPORT
3867 ompt_data_none, // root parallel id
3868#endif
3869 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3870 }
3871 KMP_ASSERT(root_thread->th.th_serial_team);
3872 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3873 root_thread->th.th_serial_team));
3874
3875 /* drop root_thread into place */
3876 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3877
3878 root->r.r_root_team->t.t_threads[0] = root_thread;
3879 root->r.r_hot_team->t.t_threads[0] = root_thread;
3880 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3881 // AC: the team created in reserve, not for execution (it is unused for now).
3882 root_thread->th.th_serial_team->t.t_serialized = 0;
3883 root->r.r_uber_thread = root_thread;
3884
3885 /* initialize the thread, get it ready to go */
3886 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3887 TCW_4(__kmp_init_gtid, TRUE);
3888
3889 /* prepare the primary thread for get_gtid() */
3890 __kmp_gtid_set_specific(gtid);
3891
3892#if USE_ITT_BUILD
3893 __kmp_itt_thread_name(gtid);
3894#endif /* USE_ITT_BUILD */
3895
3896#ifdef KMP_TDATA_GTID
3897 __kmp_gtid = gtid;
3898#endif
3899 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3900 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3901
3902 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3903 "plain=%u\n",
3904 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3905 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3906 KMP_INIT_BARRIER_STATE));
3907 { // Initialize barrier data.
3908 int b;
3909 for (b = 0; b < bs_last_barrier; ++b) {
3910 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3911#if USE_DEBUGGER
3912 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3913#endif
3914 }
3915 }
3916 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3917 KMP_INIT_BARRIER_STATE);
3918
3919#if KMP_AFFINITY_SUPPORTED
3920 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3921 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3922 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3923 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3924#endif /* KMP_AFFINITY_SUPPORTED */
3925 root_thread->th.th_def_allocator = __kmp_def_allocator;
3926 root_thread->th.th_prev_level = 0;
3927 root_thread->th.th_prev_num_threads = 1;
3928
3929 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3930 tmp->cg_root = root_thread;
3931 tmp->cg_thread_limit = __kmp_cg_max_nth;
3932 tmp->cg_nthreads = 1;
3933 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3934 " cg_nthreads init to 1\n",
3935 root_thread, tmp));
3936 tmp->up = NULL;
3937 root_thread->th.th_cg_roots = tmp;
3938
3939 __kmp_root_counter++;
3940
3941#if OMPT_SUPPORT
3942 if (!initial_thread && ompt_enabled.enabled) {
3943
3944 kmp_info_t *root_thread = ompt_get_thread();
3945
3946 ompt_set_thread_state(root_thread, ompt_state_overhead);
3947
3948 if (ompt_enabled.ompt_callback_thread_begin) {
3949 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3950 ompt_thread_initial, __ompt_get_thread_data_internal());
3951 }
3952 ompt_data_t *task_data;
3953 ompt_data_t *parallel_data;
3954 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3955 NULL);
3956 if (ompt_enabled.ompt_callback_implicit_task) {
3957 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3958 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3959 }
3960
3961 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3962 }
3963#endif
3964#if OMPD_SUPPORT
3965 if (ompd_state & OMPD_ENABLE_BP)
3966 ompd_bp_thread_begin();
3967#endif
3968
3969 KMP_MB();
3970 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3971
3972 return gtid;
3973}
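// A minimal sketch of the slot-search policy described in __kmp_register_root
// when hidden helper threads are enabled: slot 0 is reserved for the initial
// thread, slots [1, helper_num] for hidden helpers, and regular roots probe
// only from helper_num + 1 upward. 'slots' is a hypothetical stand-in for the
// real __kmp_threads array.
//
//   static int find_root_slot(void **slots, int capacity, int helper_num,
//                             bool registering_helper, bool initial_thread) {
//     if (registering_helper) {
//       for (int g = 1; g <= helper_num; ++g) // helper slots only
//         if (slots[g] == nullptr)
//           return g;
//       return -1; // no helper slot free
//     }
//     if (initial_thread && slots[0] == nullptr)
//       return 0; // the initial thread owns slot 0
//     for (int g = helper_num + 1; g < capacity; ++g)
//       if (slots[g] == nullptr)
//         return g; // first free regular slot
//     return -1;
//   }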
3974
3975#if KMP_NESTED_HOT_TEAMS
3976static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3977 const int max_level) {
3978 int i, n, nth;
3979 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3980 if (!hot_teams || !hot_teams[level].hot_team) {
3981 return 0;
3982 }
3983 KMP_DEBUG_ASSERT(level < max_level);
3984 kmp_team_t *team = hot_teams[level].hot_team;
3985 nth = hot_teams[level].hot_team_nth;
3986 n = nth - 1; // primary thread is not freed
3987 if (level < max_level - 1) {
3988 for (i = 0; i < nth; ++i) {
3989 kmp_info_t *th = team->t.t_threads[i];
3990 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3991 if (i > 0 && th->th.th_hot_teams) {
3992 __kmp_free(th->th.th_hot_teams);
3993 th->th.th_hot_teams = NULL;
3994 }
3995 }
3996 }
3997 __kmp_free_team(root, team, NULL);
3998 return n;
3999}
4000#endif
4001
4002// Resets a root thread and clears its root and hot teams.
4003// Returns the number of __kmp_threads entries directly and indirectly freed.
4004static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4005 kmp_team_t *root_team = root->r.r_root_team;
4006 kmp_team_t *hot_team = root->r.r_hot_team;
4007 int n = hot_team->t.t_nproc;
4008 int i;
4009
4010 KMP_DEBUG_ASSERT(!root->r.r_active);
4011
4012 root->r.r_root_team = NULL;
4013 root->r.r_hot_team = NULL;
4014 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4015 // before call to __kmp_free_team().
4016 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4017#if KMP_NESTED_HOT_TEAMS
4018 if (__kmp_hot_teams_max_level >
4019 0) { // need to free nested hot teams and their threads if any
4020 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4021 kmp_info_t *th = hot_team->t.t_threads[i];
4022 if (__kmp_hot_teams_max_level > 1) {
4023 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4024 }
4025 if (th->th.th_hot_teams) {
4026 __kmp_free(th->th.th_hot_teams);
4027 th->th.th_hot_teams = NULL;
4028 }
4029 }
4030 }
4031#endif
4032 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4033
4034 // Before we can reap the thread, we need to make certain that all other
4035 // threads in the teams that had this root as ancestor have stopped trying to
4036 // steal tasks.
4037 if (__kmp_tasking_mode != tskm_immediate_exec) {
4038 __kmp_wait_to_unref_task_teams();
4039 }
4040
4041#if KMP_OS_WINDOWS
4042 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4043 KA_TRACE(
4044 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4045 "\n",
4046 (LPVOID) & (root->r.r_uber_thread->th),
4047 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4048 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4049#endif /* KMP_OS_WINDOWS */
4050
4051#if OMPD_SUPPORT
4052 if (ompd_state & OMPD_ENABLE_BP)
4053 ompd_bp_thread_end();
4054#endif
4055
4056#if OMPT_SUPPORT
4057 ompt_data_t *task_data;
4058 ompt_data_t *parallel_data;
4059 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4060 NULL);
4061 if (ompt_enabled.ompt_callback_implicit_task) {
4062 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4063 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4064 }
4065 if (ompt_enabled.ompt_callback_thread_end) {
4066 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4067 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4068 }
4069#endif
4070
4071 TCW_4(__kmp_nth,
4072 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4073 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4074 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4075 " to %d\n",
4076 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4077 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4078 if (i == 1) {
4079 // need to free contention group structure
4080 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4081 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4082 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4083 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4084 root->r.r_uber_thread->th.th_cg_roots = NULL;
4085 }
4086 __kmp_reap_thread(root->r.r_uber_thread, 1);
4087
4088  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4089  // it instead of freeing it.
4090 root->r.r_uber_thread = NULL;
4091 /* mark root as no longer in use */
4092 root->r.r_begin = FALSE;
4093
4094 return n;
4095}
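// The contention-group teardown above uses a small reference-count idiom:
// post-decrement yields the *previous* count, so comparing that value to 1
// tells the caller it just dropped the last reference and must free the node.
// A standalone sketch of the same pattern (single-threaded here; the runtime
// performs it while holding the fork/join lock):
//
//   #include <cstdlib>
//
//   struct cg_node { int nthreads; }; // assume allocated with std::malloc
//
//   static void cg_release(cg_node *cg) {
//     int prev = cg->nthreads--; // old value, like cg_nthreads-- above
//     if (prev == 1)             // we were the last user of this node
//       std::free(cg);
//   }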
4096
4097void __kmp_unregister_root_current_thread(int gtid) {
4098 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4099  /* This lock should be OK, since unregister_root_current_thread is never
4100     called during an abort, only during a normal close. Furthermore, if you
4101     have the forkjoin lock, you should never try to get the initz lock. */
4102 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4103 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4104 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4105 "exiting T#%d\n",
4106 gtid));
4107 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4108 return;
4109 }
4110 kmp_root_t *root = __kmp_root[gtid];
4111
4112 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4113 KMP_ASSERT(KMP_UBER_GTID(gtid));
4114 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4115 KMP_ASSERT(root->r.r_active == FALSE);
4116
4117 KMP_MB();
4118
4119 kmp_info_t *thread = __kmp_threads[gtid];
4120 kmp_team_t *team = thread->th.th_team;
4121 kmp_task_team_t *task_team = thread->th.th_task_team;
4122
4123 // we need to wait for the proxy tasks before finishing the thread
4124 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4125 task_team->tt.tt_hidden_helper_task_encountered)) {
4126#if OMPT_SUPPORT
4127 // the runtime is shutting down so we won't report any events
4128 thread->th.ompt_thread_info.state = ompt_state_undefined;
4129#endif
4130 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4131 }
4132
4133 __kmp_reset_root(gtid, root);
4134
4135 KMP_MB();
4136 KC_TRACE(10,
4137 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4138
4139 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4140}
4141
4142#if KMP_OS_WINDOWS
4143/* __kmp_forkjoin_lock must be already held
4144 Unregisters a root thread that is not the current thread. Returns the number
4145 of __kmp_threads entries freed as a result. */
4146static int __kmp_unregister_root_other_thread(int gtid) {
4147 kmp_root_t *root = __kmp_root[gtid];
4148 int r;
4149
4150 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4151 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4152 KMP_ASSERT(KMP_UBER_GTID(gtid));
4153 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4154 KMP_ASSERT(root->r.r_active == FALSE);
4155
4156 r = __kmp_reset_root(gtid, root);
4157 KC_TRACE(10,
4158 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4159 return r;
4160}
4161#endif
4162
4163#if KMP_DEBUG
4164void __kmp_task_info() {
4165
4166 kmp_int32 gtid = __kmp_entry_gtid();
4167 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4168 kmp_info_t *this_thr = __kmp_threads[gtid];
4169 kmp_team_t *steam = this_thr->th.th_serial_team;
4170 kmp_team_t *team = this_thr->th.th_team;
4171
4172 __kmp_printf(
4173 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4174 "ptask=%p\n",
4175 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4176 team->t.t_implicit_task_taskdata[tid].td_parent);
4177}
4178#endif // KMP_DEBUG
4179
4180/* TODO optimize with one big memclr, take out what isn't needed, split
4181 responsibility to workers as much as possible, and delay initialization of
4182 features as much as possible */
4183static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4184 int tid, int gtid) {
4185 /* this_thr->th.th_info.ds.ds_gtid is setup in
4186 kmp_allocate_thread/create_worker.
4187 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4188 KMP_DEBUG_ASSERT(this_thr != NULL);
4189 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4190 KMP_DEBUG_ASSERT(team);
4191 KMP_DEBUG_ASSERT(team->t.t_threads);
4192 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4193 kmp_info_t *master = team->t.t_threads[0];
4194 KMP_DEBUG_ASSERT(master);
4195 KMP_DEBUG_ASSERT(master->th.th_root);
4196
4197 KMP_MB();
4198
4199 TCW_SYNC_PTR(this_thr->th.th_team, team);
4200
4201 this_thr->th.th_info.ds.ds_tid = tid;
4202 this_thr->th.th_set_nproc = 0;
4203 if (__kmp_tasking_mode != tskm_immediate_exec)
4204 // When tasking is possible, threads are not safe to reap until they are
4205 // done tasking; this will be set when tasking code is exited in wait
4206 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4207 else // no tasking --> always safe to reap
4208 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4209 this_thr->th.th_set_proc_bind = proc_bind_default;
4210#if KMP_AFFINITY_SUPPORTED
4211 this_thr->th.th_new_place = this_thr->th.th_current_place;
4212#endif
4213 this_thr->th.th_root = master->th.th_root;
4214
4215 /* setup the thread's cache of the team structure */
4216 this_thr->th.th_team_nproc = team->t.t_nproc;
4217 this_thr->th.th_team_master = master;
4218 this_thr->th.th_team_serialized = team->t.t_serialized;
4219
4220 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4221
4222 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4223 tid, gtid, this_thr, this_thr->th.th_current_task));
4224
4225 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4226 team, tid, TRUE);
4227
4228 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4229 tid, gtid, this_thr, this_thr->th.th_current_task));
4230 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4231 // __kmp_initialize_team()?
4232
4233 /* TODO no worksharing in speculative threads */
4234 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4235
4236 this_thr->th.th_local.this_construct = 0;
4237
4238 if (!this_thr->th.th_pri_common) {
4239 this_thr->th.th_pri_common =
4240 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4241 if (__kmp_storage_map) {
4242 __kmp_print_storage_map_gtid(
4243 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4244 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4245 }
4246 this_thr->th.th_pri_head = NULL;
4247 }
4248
4249 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4250 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4251 // Make new thread's CG root same as primary thread's
4252 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4253 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4254 if (tmp) {
4255 // worker changes CG, need to check if old CG should be freed
4256 int i = tmp->cg_nthreads--;
4257 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4258 " on node %p of thread %p to %d\n",
4259 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4260 if (i == 1) {
4261 __kmp_free(tmp); // last thread left CG --> free it
4262 }
4263 }
4264 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4265 // Increment new thread's CG root's counter to add the new thread
4266 this_thr->th.th_cg_roots->cg_nthreads++;
4267 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4268 " node %p of thread %p to %d\n",
4269 this_thr, this_thr->th.th_cg_roots,
4270 this_thr->th.th_cg_roots->cg_root,
4271 this_thr->th.th_cg_roots->cg_nthreads));
4272 this_thr->th.th_current_task->td_icvs.thread_limit =
4273 this_thr->th.th_cg_roots->cg_thread_limit;
4274 }
4275
4276 /* Initialize dynamic dispatch */
4277 {
4278 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4279 // Use team max_nproc since this will never change for the team.
4280 size_t disp_size =
4281 sizeof(dispatch_private_info_t) *
4282 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4283 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4284 team->t.t_max_nproc));
4285 KMP_ASSERT(dispatch);
4286 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4287 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4288
4289 dispatch->th_disp_index = 0;
4290 dispatch->th_doacross_buf_idx = 0;
4291 if (!dispatch->th_disp_buffer) {
4292 dispatch->th_disp_buffer =
4293 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4294
4295 if (__kmp_storage_map) {
4296 __kmp_print_storage_map_gtid(
4297 gtid, &dispatch->th_disp_buffer[0],
4298 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4299 ? 1
4300 : __kmp_dispatch_num_buffers],
4301 disp_size,
4302 "th_%d.th_dispatch.th_disp_buffer "
4303 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4304 gtid, team->t.t_id, gtid);
4305 }
4306 } else {
4307 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4308 }
4309
4310 dispatch->th_dispatch_pr_current = 0;
4311 dispatch->th_dispatch_sh_current = 0;
4312
4313 dispatch->th_deo_fcn = 0; /* ORDERED */
4314 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4315 }
4316
4317 this_thr->th.th_next_pool = NULL;
4318
4319 if (!this_thr->th.th_task_state_memo_stack) {
4320 size_t i;
4321 this_thr->th.th_task_state_memo_stack =
4322 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4323 this_thr->th.th_task_state_top = 0;
4324 this_thr->th.th_task_state_stack_sz = 4;
4325 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4326 ++i) // zero init the stack
4327 this_thr->th.th_task_state_memo_stack[i] = 0;
4328 }
4329
4330 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4331 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4332
4333 KMP_MB();
4334}
4335
4336/* Allocate a new thread for the requesting team. This is only called from
4337   within a forkjoin critical section. We first try to get an available thread
4338   from the thread pool; if none is available, we fork a new one, assuming we
4339   are able to create one. This should be assured, as the caller should have
4340   checked on this first. */
4341kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4342 int new_tid) {
4343 kmp_team_t *serial_team;
4344 kmp_info_t *new_thr;
4345 int new_gtid;
4346
4347 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4348 KMP_DEBUG_ASSERT(root && team);
4349#if !KMP_NESTED_HOT_TEAMS
4350 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4351#endif
4352 KMP_MB();
4353
4354 /* first, try to get one from the thread pool */
4355 if (__kmp_thread_pool) {
4356 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4357 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4358 if (new_thr == __kmp_thread_pool_insert_pt) {
4359 __kmp_thread_pool_insert_pt = NULL;
4360 }
4361 TCW_4(new_thr->th.th_in_pool, FALSE);
4362 __kmp_suspend_initialize_thread(new_thr);
4363 __kmp_lock_suspend_mx(new_thr);
4364 if (new_thr->th.th_active_in_pool == TRUE) {
4365 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4366 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4367 new_thr->th.th_active_in_pool = FALSE;
4368 }
4369 __kmp_unlock_suspend_mx(new_thr);
4370
4371 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4372 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4373 KMP_ASSERT(!new_thr->th.th_team);
4374 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4375
4376 /* setup the thread structure */
4377 __kmp_initialize_info(new_thr, team, new_tid,
4378 new_thr->th.th_info.ds.ds_gtid);
4379 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4380
4381 TCW_4(__kmp_nth, __kmp_nth + 1);
4382
4383 new_thr->th.th_task_state = 0;
4384 new_thr->th.th_task_state_top = 0;
4385 new_thr->th.th_task_state_stack_sz = 4;
4386
4387 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4388 // Make sure pool thread has transitioned to waiting on own thread struct
4389 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4390 // Thread activated in __kmp_allocate_team when increasing team size
4391 }
4392
4393#ifdef KMP_ADJUST_BLOCKTIME
4394 /* Adjust blocktime back to zero if necessary */
4395 /* Middle initialization might not have occurred yet */
4396 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4397 if (__kmp_nth > __kmp_avail_proc) {
4398 __kmp_zero_bt = TRUE;
4399 }
4400 }
4401#endif /* KMP_ADJUST_BLOCKTIME */
4402
4403#if KMP_DEBUG
4404    // If the thread entered the pool via __kmp_free_thread, wait_flag should
4405    // not be KMP_BARRIER_PARENT_FLAG.
4406 int b;
4407 kmp_balign_t *balign = new_thr->th.th_bar;
4408 for (b = 0; b < bs_last_barrier; ++b)
4409 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4410#endif
4411
4412 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4413 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4414
4415 KMP_MB();
4416 return new_thr;
4417 }
4418
4419  /* no, we'll fork a new one */
4420 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4421 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4422
4423#if KMP_USE_MONITOR
4424 // If this is the first worker thread the RTL is creating, then also
4425 // launch the monitor thread. We try to do this as early as possible.
4426 if (!TCR_4(__kmp_init_monitor)) {
4427 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4428 if (!TCR_4(__kmp_init_monitor)) {
4429 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4430 TCW_4(__kmp_init_monitor, 1);
4431 __kmp_create_monitor(&__kmp_monitor);
4432 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4433#if KMP_OS_WINDOWS
4434 // AC: wait until monitor has started. This is a fix for CQ232808.
4435      // The reason is that if the library is loaded/unloaded in a loop with
4436      // small (parallel) work in between, then there is a high probability that
4437      // the monitor thread starts after the library has shut down. At shutdown
4438      // it is too late to cope with the problem, because when the primary thread
4439      // is in DllMain (process detach) the monitor has no chance to start (it is
4440      // blocked), and the primary thread has no means to inform the monitor that
4441      // the library has gone, because all the memory which the monitor can
4442      // access is going to be released/reset.
4443 while (TCR_4(__kmp_init_monitor) < 2) {
4444 KMP_YIELD(TRUE);
4445 }
4446 KF_TRACE(10, ("after monitor thread has started\n"));
4447#endif
4448 }
4449 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4450 }
4451#endif
4452
4453 KMP_MB();
4454
4455 {
4456 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4457 ? 1
4458 : __kmp_hidden_helper_threads_num + 1;
4459
4460 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4461 ++new_gtid) {
4462 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4463 }
4464
4465 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4466 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4467 }
4468 }
4469
4470 /* allocate space for it. */
4471 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4472
4473 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4474
4475#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4476 // suppress race conditions detection on synchronization flags in debug mode
4477 // this helps to analyze library internals eliminating false positives
4478 __itt_suppress_mark_range(
4479 __itt_suppress_range, __itt_suppress_threading_errors,
4480 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4481 __itt_suppress_mark_range(
4482 __itt_suppress_range, __itt_suppress_threading_errors,
4483 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4484#if KMP_OS_WINDOWS
4485 __itt_suppress_mark_range(
4486 __itt_suppress_range, __itt_suppress_threading_errors,
4487 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4488#else
4489 __itt_suppress_mark_range(__itt_suppress_range,
4490 __itt_suppress_threading_errors,
4491 &new_thr->th.th_suspend_init_count,
4492 sizeof(new_thr->th.th_suspend_init_count));
4493#endif
4494 // TODO: check if we need to also suppress b_arrived flags
4495 __itt_suppress_mark_range(__itt_suppress_range,
4496 __itt_suppress_threading_errors,
4497 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4498 sizeof(new_thr->th.th_bar[0].bb.b_go));
4499 __itt_suppress_mark_range(__itt_suppress_range,
4500 __itt_suppress_threading_errors,
4501 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4502 sizeof(new_thr->th.th_bar[1].bb.b_go));
4503 __itt_suppress_mark_range(__itt_suppress_range,
4504 __itt_suppress_threading_errors,
4505 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4506 sizeof(new_thr->th.th_bar[2].bb.b_go));
4507#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4508 if (__kmp_storage_map) {
4509 __kmp_print_thread_storage_map(new_thr, new_gtid);
4510 }
4511
4512 // add the reserve serialized team, initialized from the team's primary thread
4513 {
4514 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4515 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4516 new_thr->th.th_serial_team = serial_team =
4517 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4518#if OMPT_SUPPORT
4519 ompt_data_none, // root parallel id
4520#endif
4521 proc_bind_default, &r_icvs,
4522 0 USE_NESTED_HOT_ARG(NULL));
4523 }
4524 KMP_ASSERT(serial_team);
4525 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4526 // execution (it is unused for now).
4527 serial_team->t.t_threads[0] = new_thr;
4528 KF_TRACE(10,
4529 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4530 new_thr));
4531
4532 /* setup the thread structures */
4533 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4534
4535#if USE_FAST_MEMORY
4536 __kmp_initialize_fast_memory(new_thr);
4537#endif /* USE_FAST_MEMORY */
4538
4539#if KMP_USE_BGET
4540 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4541 __kmp_initialize_bget(new_thr);
4542#endif
4543
4544 __kmp_init_random(new_thr); // Initialize random number generator
4545
4546 /* Initialize these only once when thread is grabbed for a team allocation */
4547 KA_TRACE(20,
4548 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4549 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4550
4551 int b;
4552 kmp_balign_t *balign = new_thr->th.th_bar;
4553 for (b = 0; b < bs_last_barrier; ++b) {
4554 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4555 balign[b].bb.team = NULL;
4556 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4557 balign[b].bb.use_oncore_barrier = 0;
4558 }
4559
4560 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4561 new_thr->th.th_sleep_loc_type = flag_unset;
4562
4563 new_thr->th.th_spin_here = FALSE;
4564 new_thr->th.th_next_waiting = 0;
4565#if KMP_OS_UNIX
4566 new_thr->th.th_blocking = false;
4567#endif
4568
4569#if KMP_AFFINITY_SUPPORTED
4570 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4571 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4572 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4573 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4574#endif
4575 new_thr->th.th_def_allocator = __kmp_def_allocator;
4576 new_thr->th.th_prev_level = 0;
4577 new_thr->th.th_prev_num_threads = 1;
4578
4579 TCW_4(new_thr->th.th_in_pool, FALSE);
4580 new_thr->th.th_active_in_pool = FALSE;
4581 TCW_4(new_thr->th.th_active, TRUE);
4582
4583 /* adjust the global counters */
4584 __kmp_all_nth++;
4585 __kmp_nth++;
4586
4587 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4588 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4589 if (__kmp_adjust_gtid_mode) {
4590 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4591 if (TCR_4(__kmp_gtid_mode) != 2) {
4592 TCW_4(__kmp_gtid_mode, 2);
4593 }
4594 } else {
4595 if (TCR_4(__kmp_gtid_mode) != 1) {
4596 TCW_4(__kmp_gtid_mode, 1);
4597 }
4598 }
4599 }
4600
4601#ifdef KMP_ADJUST_BLOCKTIME
4602 /* Adjust blocktime back to zero if necessary */
4603 /* Middle initialization might not have occurred yet */
4604 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4605 if (__kmp_nth > __kmp_avail_proc) {
4606 __kmp_zero_bt = TRUE;
4607 }
4608 }
4609#endif /* KMP_ADJUST_BLOCKTIME */
4610
4611 /* actually fork it and create the new worker thread */
4612 KF_TRACE(
4613 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4614 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4615 KF_TRACE(10,
4616 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4617
4618 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4619 new_gtid));
4620 KMP_MB();
4621 return new_thr;
4622}
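// Thread reuse above is an intrusive singly linked free list: th_next_pool
// chains idle threads, allocation pops the head when the pool is non-empty
// and only forks an OS thread otherwise. A simplified sketch with a
// hypothetical worker type (the real pool is additionally kept ordered by
// gtid via __kmp_thread_pool_insert_pt, which this sketch ignores):
//
//   struct worker { worker *next_pool; /* ...thread state... */ };
//
//   static worker *pool_head = nullptr;
//
//   static worker *allocate_worker() {
//     if (pool_head) {          // fast path: recycle an idle worker
//       worker *w = pool_head;
//       pool_head = w->next_pool;
//       w->next_pool = nullptr;
//       return w;
//     }
//     return new worker{};      // slow path: create a fresh one
//   }
//
//   static void release_worker(worker *w) { // push back for later reuse
//     w->next_pool = pool_head;
//     pool_head = w;
//   }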
4623
4624/* Reinitialize team for reuse.
4625   The hot team code calls this routine at every fork barrier, so EPCC barrier
4626   tests are extremely sensitive to changes in it, esp. writes to the team
4627   struct, which cause a cache invalidation in all threads.
4628 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4629static void __kmp_reinitialize_team(kmp_team_t *team,
4630 kmp_internal_control_t *new_icvs,
4631 ident_t *loc) {
4632 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4633 team->t.t_threads[0], team));
4634 KMP_DEBUG_ASSERT(team && new_icvs);
4635 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4636 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4637
4638 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4639 // Copy ICVs to the primary thread's implicit taskdata
4640 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4641 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4642
4643 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4644 team->t.t_threads[0], team));
4645}
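
/* Illustrative sketch (not part of the runtime): the check-before-write idiom
   used above via KMP_CHECK_UPDATE avoids dirtying a shared cache line when the
   stored value is already current, which is what keeps this path cheap for the
   EPCC-style barrier benchmarks mentioned in the comment above. */
#if 0
#define EXAMPLE_CHECK_UPDATE(dst, val)                                         \
  do {                                                                         \
    if ((dst) != (val)) /* write only when the value would actually change */  \
      (dst) = (val);                                                           \
  } while (0)
#endif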
4646
4647/* Initialize the team data structure.
4648 This assumes the t_threads and t_max_nproc are already set.
4649 Also, we don't touch the arguments */
4650static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4651 kmp_internal_control_t *new_icvs,
4652 ident_t *loc) {
4653 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4654
4655 /* verify */
4656 KMP_DEBUG_ASSERT(team);
4657 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4658 KMP_DEBUG_ASSERT(team->t.t_threads);
4659 KMP_MB();
4660
4661 team->t.t_master_tid = 0; /* not needed */
4662 /* team->t.t_master_bar; not needed */
4663 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4664 team->t.t_nproc = new_nproc;
4665
4666 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4667 team->t.t_next_pool = NULL;
4668 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4669 * up hot team */
4670
4671 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4672 team->t.t_invoke = NULL; /* not needed */
4673
4674 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4675 team->t.t_sched.sched = new_icvs->sched.sched;
4676
4677#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4678 team->t.t_fp_control_saved = FALSE; /* not needed */
4679 team->t.t_x87_fpu_control_word = 0; /* not needed */
4680 team->t.t_mxcsr = 0; /* not needed */
4681#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4682
4683 team->t.t_construct = 0;
4684
4685 team->t.t_ordered.dt.t_value = 0;
4686 team->t.t_master_active = FALSE;
4687
4688#ifdef KMP_DEBUG
4689 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4690#endif
4691#if KMP_OS_WINDOWS
4692 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4693#endif
4694
4695 team->t.t_control_stack_top = NULL;
4696
4697 __kmp_reinitialize_team(team, new_icvs, loc);
4698
4699 KMP_MB();
4700 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4701}
4702
4703#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4704/* Sets full mask for thread and returns old mask, no changes to structures. */
4705static void
4706__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4707 if (KMP_AFFINITY_CAPABLE()) {
4708 int status;
4709 if (old_mask != NULL) {
4710 status = __kmp_get_system_affinity(old_mask, TRUE);
4711 int error = errno;
4712 if (status != 0) {
4713 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4714 __kmp_msg_null);
4715 }
4716 }
4717 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4718 }
4719}
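
/* Illustrative sketch (not the runtime's code path): the save/widen/restore
   pattern implemented above, expressed directly with the Linux affinity calls.
   example_run_with_full_mask is a hypothetical helper. */
#if 0
#define _GNU_SOURCE
#include <sched.h>

static int example_run_with_full_mask(void (*fn)(void)) {
  cpu_set_t old_mask, full_mask;
  if (sched_getaffinity(0, sizeof(old_mask), &old_mask) != 0)
    return -1; // could not save the current mask
  CPU_ZERO(&full_mask);
  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu)
    CPU_SET(cpu, &full_mask); // request every CPU the kernel will allow
  sched_setaffinity(0, sizeof(full_mask), &full_mask); // widen temporarily
  fn(); // e.g. create worker threads, which inherit the wide mask
  return sched_setaffinity(0, sizeof(old_mask), &old_mask); // restore
}
#endif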
4720#endif
4721
4722#if KMP_AFFINITY_SUPPORTED
4723
4724// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4725// It calculates the worker + primary thread's partition based upon the parent
4726// thread's partition, and binds each worker to a thread in their partition.
4727// The primary thread's partition should already include its current binding.
4728static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4729 // Do not partition places for the hidden helper team
4730 if (KMP_HIDDEN_HELPER_TEAM(team))
4731 return;
4732 // Copy the primary thread's place partition to the team struct
4733 kmp_info_t *master_th = team->t.t_threads[0];
4734 KMP_DEBUG_ASSERT(master_th != NULL);
4735 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4736 int first_place = master_th->th.th_first_place;
4737 int last_place = master_th->th.th_last_place;
4738 int masters_place = master_th->th.th_current_place;
4739 team->t.t_first_place = first_place;
4740 team->t.t_last_place = last_place;
4741
4742 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4743 "bound to place %d partition = [%d,%d]\n",
4744 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4745 team->t.t_id, masters_place, first_place, last_place));
4746
4747 switch (proc_bind) {
4748
4749 case proc_bind_default:
4750 // Serial teams might have the proc_bind policy set to proc_bind_default.
4751 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4752 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4753 break;
4754
4755 case proc_bind_primary: {
4756 int f;
4757 int n_th = team->t.t_nproc;
4758 for (f = 1; f < n_th; f++) {
4759 kmp_info_t *th = team->t.t_threads[f];
4760 KMP_DEBUG_ASSERT(th != NULL);
4761 th->th.th_first_place = first_place;
4762 th->th.th_last_place = last_place;
4763 th->th.th_new_place = masters_place;
4764 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4765 team->t.t_display_affinity != 1) {
4766 team->t.t_display_affinity = 1;
4767 }
4768
4769 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4770 "partition = [%d,%d]\n",
4771 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4772 f, masters_place, first_place, last_place));
4773 }
4774 } break;
4775
4776 case proc_bind_close: {
4777 int f;
4778 int n_th = team->t.t_nproc;
4779 int n_places;
4780 if (first_place <= last_place) {
4781 n_places = last_place - first_place + 1;
4782 } else {
4783 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4784 }
4785 if (n_th <= n_places) {
4786 int place = masters_place;
4787 for (f = 1; f < n_th; f++) {
4788 kmp_info_t *th = team->t.t_threads[f];
4789 KMP_DEBUG_ASSERT(th != NULL);
4790
4791 if (place == last_place) {
4792 place = first_place;
4793 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4794 place = 0;
4795 } else {
4796 place++;
4797 }
4798 th->th.th_first_place = first_place;
4799 th->th.th_last_place = last_place;
4800 th->th.th_new_place = place;
4801 if (__kmp_display_affinity && place != th->th.th_current_place &&
4802 team->t.t_display_affinity != 1) {
4803 team->t.t_display_affinity = 1;
4804 }
4805
4806 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4807 "partition = [%d,%d]\n",
4808 __kmp_gtid_from_thread(team->t.t_threads[f]),
4809 team->t.t_id, f, place, first_place, last_place));
4810 }
4811 } else {
4812 int S, rem, gap, s_count;
4813 S = n_th / n_places;
4814 s_count = 0;
4815 rem = n_th - (S * n_places);
4816 gap = rem > 0 ? n_places / rem : n_places;
4817 int place = masters_place;
4818 int gap_ct = gap;
4819 for (f = 0; f < n_th; f++) {
4820 kmp_info_t *th = team->t.t_threads[f];
4821 KMP_DEBUG_ASSERT(th != NULL);
4822
4823 th->th.th_first_place = first_place;
4824 th->th.th_last_place = last_place;
4825 th->th.th_new_place = place;
4826 if (__kmp_display_affinity && place != th->th.th_current_place &&
4827 team->t.t_display_affinity != 1) {
4828 team->t.t_display_affinity = 1;
4829 }
4830 s_count++;
4831
4832 if ((s_count == S) && rem && (gap_ct == gap)) {
4833 // do nothing; add an extra thread to this place on the next iteration
4834 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4835 // we added an extra thread to this place; move to next place
4836 if (place == last_place) {
4837 place = first_place;
4838 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4839 place = 0;
4840 } else {
4841 place++;
4842 }
4843 s_count = 0;
4844 gap_ct = 1;
4845 rem--;
4846 } else if (s_count == S) { // place full; don't add extra
4847 if (place == last_place) {
4848 place = first_place;
4849 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4850 place = 0;
4851 } else {
4852 place++;
4853 }
4854 gap_ct++;
4855 s_count = 0;
4856 }
4857
4858 KA_TRACE(100,
4859 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4860 "partition = [%d,%d]\n",
4861 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4862 th->th.th_new_place, first_place, last_place));
4863 }
4864 KMP_DEBUG_ASSERT(place == masters_place);
4865 }
4866 } break;
4867
4868 case proc_bind_spread: {
4869 int f;
4870 int n_th = team->t.t_nproc;
4871 int n_places;
4872 int thidx;
4873 if (first_place <= last_place) {
4874 n_places = last_place - first_place + 1;
4875 } else {
4876 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4877 }
4878 if (n_th <= n_places) {
4879 int place = -1;
4880
4881 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4882 int S = n_places / n_th;
4883 int s_count, rem, gap, gap_ct;
4884
4885 place = masters_place;
4886 rem = n_places - n_th * S;
4887 gap = rem ? n_th / rem : 1;
4888 gap_ct = gap;
4889 thidx = n_th;
4890 if (update_master_only == 1)
4891 thidx = 1;
4892 for (f = 0; f < thidx; f++) {
4893 kmp_info_t *th = team->t.t_threads[f];
4894 KMP_DEBUG_ASSERT(th != NULL);
4895
4896 th->th.th_first_place = place;
4897 th->th.th_new_place = place;
4898 if (__kmp_display_affinity && place != th->th.th_current_place &&
4899 team->t.t_display_affinity != 1) {
4900 team->t.t_display_affinity = 1;
4901 }
4902 s_count = 1;
4903 while (s_count < S) {
4904 if (place == last_place) {
4905 place = first_place;
4906 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4907 place = 0;
4908 } else {
4909 place++;
4910 }
4911 s_count++;
4912 }
4913 if (rem && (gap_ct == gap)) {
4914 if (place == last_place) {
4915 place = first_place;
4916 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4917 place = 0;
4918 } else {
4919 place++;
4920 }
4921 rem--;
4922 gap_ct = 0;
4923 }
4924 th->th.th_last_place = place;
4925 gap_ct++;
4926
4927 if (place == last_place) {
4928 place = first_place;
4929 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4930 place = 0;
4931 } else {
4932 place++;
4933 }
4934
4935 KA_TRACE(100,
4936 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4937 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4938 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4939 f, th->th.th_new_place, th->th.th_first_place,
4940 th->th.th_last_place, __kmp_affinity_num_masks));
4941 }
4942 } else {
4943 /* With a uniform space of available computation places, we can create
4944 T partitions of roughly round(P/T) places each and put threads into
4945 the first place of each partition. */
4946 double current = static_cast<double>(masters_place);
4947 double spacing =
4948 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4949 int first, last;
4950 kmp_info_t *th;
4951
4952 thidx = n_th + 1;
4953 if (update_master_only == 1)
4954 thidx = 1;
4955 for (f = 0; f < thidx; f++) {
4956 first = static_cast<int>(current);
4957 last = static_cast<int>(current + spacing) - 1;
4958 KMP_DEBUG_ASSERT(last >= first);
4959 if (first >= n_places) {
4960 if (masters_place) {
4961 first -= n_places;
4962 last -= n_places;
4963 if (first == (masters_place + 1)) {
4964 KMP_DEBUG_ASSERT(f == n_th);
4965 first--;
4966 }
4967 if (last == masters_place) {
4968 KMP_DEBUG_ASSERT(f == (n_th - 1));
4969 last--;
4970 }
4971 } else {
4972 KMP_DEBUG_ASSERT(f == n_th);
4973 first = 0;
4974 last = 0;
4975 }
4976 }
4977 if (last >= n_places) {
4978 last = (n_places - 1);
4979 }
4980 place = first;
4981 current += spacing;
4982 if (f < n_th) {
4983 KMP_DEBUG_ASSERT(0 <= first);
4984 KMP_DEBUG_ASSERT(n_places > first);
4985 KMP_DEBUG_ASSERT(0 <= last);
4986 KMP_DEBUG_ASSERT(n_places > last);
4987 KMP_DEBUG_ASSERT(last_place >= first_place);
4988 th = team->t.t_threads[f];
4989 KMP_DEBUG_ASSERT(th);
4990 th->th.th_first_place = first;
4991 th->th.th_new_place = place;
4992 th->th.th_last_place = last;
4993 if (__kmp_display_affinity && place != th->th.th_current_place &&
4994 team->t.t_display_affinity != 1) {
4995 team->t.t_display_affinity = 1;
4996 }
4997 KA_TRACE(100,
4998 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4999 "partition = [%d,%d], spacing = %.4f\n",
5000 __kmp_gtid_from_thread(team->t.t_threads[f]),
5001 team->t.t_id, f, th->th.th_new_place,
5002 th->th.th_first_place, th->th.th_last_place, spacing));
5003 }
5004 }
5005 }
5006 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5007 } else {
5008 int S, rem, gap, s_count;
5009 S = n_th / n_places;
5010 s_count = 0;
5011 rem = n_th - (S * n_places);
5012 gap = rem > 0 ? n_places / rem : n_places;
5013 int place = masters_place;
5014 int gap_ct = gap;
5015 thidx = n_th;
5016 if (update_master_only == 1)
5017 thidx = 1;
5018 for (f = 0; f < thidx; f++) {
5019 kmp_info_t *th = team->t.t_threads[f];
5020 KMP_DEBUG_ASSERT(th != NULL);
5021
5022 th->th.th_first_place = place;
5023 th->th.th_last_place = place;
5024 th->th.th_new_place = place;
5025 if (__kmp_display_affinity && place != th->th.th_current_place &&
5026 team->t.t_display_affinity != 1) {
5027 team->t.t_display_affinity = 1;
5028 }
5029 s_count++;
5030
5031 if ((s_count == S) && rem && (gap_ct == gap)) {
5032 // do nothing; add an extra thread to this place on the next iteration
5033 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5034 // we added an extra thread to this place; move on to next place
5035 if (place == last_place) {
5036 place = first_place;
5037 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5038 place = 0;
5039 } else {
5040 place++;
5041 }
5042 s_count = 0;
5043 gap_ct = 1;
5044 rem--;
5045 } else if (s_count == S) { // place is full; don't add extra thread
5046 if (place == last_place) {
5047 place = first_place;
5048 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5049 place = 0;
5050 } else {
5051 place++;
5052 }
5053 gap_ct++;
5054 s_count = 0;
5055 }
5056
5057 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5058 "partition = [%d,%d]\n",
5059 __kmp_gtid_from_thread(team->t.t_threads[f]),
5060 team->t.t_id, f, th->th.th_new_place,
5061 th->th.th_first_place, th->th.th_last_place));
5062 }
5063 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5064 }
5065 } break;
5066
5067 default:
5068 break;
5069 }
5070
5071 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5072}
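
/* Illustrative sketch (user-level code, not part of the runtime): how the
   primary/close/spread policies handled above are typically exercised. Assumes
   an OpenMP 4.5+ compiler and that places were defined in the environment,
   e.g. OMP_PLACES=cores. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  // Each thread reports the place it was bound to within its partition.
  #pragma omp parallel proc_bind(spread) num_threads(4)
  {
    printf("thread %d of %d runs on place %d of %d\n",
           omp_get_thread_num(), omp_get_num_threads(),
           omp_get_place_num(), omp_get_num_places());
  }
  return 0;
}
#endif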
5073
5074#endif // KMP_AFFINITY_SUPPORTED
5075
5076/* allocate a new team data structure to use. take one off of the free pool if
5077 available */
5078kmp_team_t *
5079__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5080#if OMPT_SUPPORT
5081 ompt_data_t ompt_parallel_data,
5082#endif
5083 kmp_proc_bind_t new_proc_bind,
5084 kmp_internal_control_t *new_icvs,
5085 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5086 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5087 int f;
5088 kmp_team_t *team;
5089 int use_hot_team = !root->r.r_active;
5090 int level = 0;
5091 int do_place_partition = 1;
5092
5093 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5094 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5095 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5096 KMP_MB();
5097
5098#if KMP_NESTED_HOT_TEAMS
5099 kmp_hot_team_ptr_t *hot_teams;
5100 if (master) {
5101 team = master->th.th_team;
5102 level = team->t.t_active_level;
5103 if (master->th.th_teams_microtask) { // in teams construct?
5104 if (master->th.th_teams_size.nteams > 1 &&
5105 ( // #teams > 1
5106 team->t.t_pkfn ==
5107 (microtask_t)__kmp_teams_master || // inner fork of the teams
5108 master->th.th_teams_level <
5109 team->t.t_level)) { // or nested parallel inside the teams
5110 ++level; // level is not incremented if #teams==1 or for the outer fork of
5111 // the teams construct; it is incremented otherwise
5112 }
5113 // Do not perform the place partition for the inner fork of the teams
5114 // construct; wait until a nested parallel region is encountered inside it
5115 if ((master->th.th_teams_size.nteams == 1 &&
5116 master->th.th_teams_level >= team->t.t_level) ||
5117 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5118 do_place_partition = 0;
5119 }
5120 hot_teams = master->th.th_hot_teams;
5121 if (level < __kmp_hot_teams_max_level && hot_teams &&
5122 hot_teams[level].hot_team) {
5123 // hot team has already been allocated for given level
5124 use_hot_team = 1;
5125 } else {
5126 use_hot_team = 0;
5127 }
5128 } else {
5129 // check we won't access uninitialized hot_teams, just in case
5130 KMP_DEBUG_ASSERT(new_nproc == 1);
5131 }
5132#endif
5133 // Optimization to use a "hot" team
5134 if (use_hot_team && new_nproc > 1) {
5135 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5136#if KMP_NESTED_HOT_TEAMS
5137 team = hot_teams[level].hot_team;
5138#else
5139 team = root->r.r_hot_team;
5140#endif
5141#if KMP_DEBUG
5142 if (__kmp_tasking_mode != tskm_immediate_exec) {
5143 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5144 "task_team[1] = %p before reinit\n",
5145 team->t.t_task_team[0], team->t.t_task_team[1]));
5146 }
5147#endif
5148
5149 if (team->t.t_nproc != new_nproc &&
5150 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5151 // Distributed barrier may need a resize
5152 int old_nthr = team->t.t_nproc;
5153 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5154 }
5155
5156 // If not doing the place partition, then reset the team's proc bind
5157 // to indicate that partitioning of all threads still needs to take place
5158 if (do_place_partition == 0)
5159 team->t.t_proc_bind = proc_bind_default;
5160 // Has the number of threads changed?
5161 /* Let's assume the most common case is that the number of threads is
5162 unchanged, and put that case first. */
5163 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5164 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5165 // This case can mean that omp_set_num_threads() was called and the hot
5166 // team size was already reduced, so we check the special flag
5167 if (team->t.t_size_changed == -1) {
5168 team->t.t_size_changed = 1;
5169 } else {
5170 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5171 }
5172
5173 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5174 kmp_r_sched_t new_sched = new_icvs->sched;
5175 // set primary thread's schedule as new run-time schedule
5176 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5177
5178 __kmp_reinitialize_team(team, new_icvs,
5179 root->r.r_uber_thread->th.th_ident);
5180
5181 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5182 team->t.t_threads[0], team));
5183 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5184
5185#if KMP_AFFINITY_SUPPORTED
5186 if ((team->t.t_size_changed == 0) &&
5187 (team->t.t_proc_bind == new_proc_bind)) {
5188 if (new_proc_bind == proc_bind_spread) {
5189 if (do_place_partition) {
5190 // add flag to update only master for spread
5191 __kmp_partition_places(team, 1);
5192 }
5193 }
5194 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5195 "proc_bind = %d, partition = [%d,%d]\n",
5196 team->t.t_id, new_proc_bind, team->t.t_first_place,
5197 team->t.t_last_place));
5198 } else {
5199 if (do_place_partition) {
5200 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5201 __kmp_partition_places(team);
5202 }
5203 }
5204#else
5205 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5206#endif /* KMP_AFFINITY_SUPPORTED */
5207 } else if (team->t.t_nproc > new_nproc) {
5208 KA_TRACE(20,
5209 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5210 new_nproc));
5211
5212 team->t.t_size_changed = 1;
5213 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5214 // Barrier size already reduced earlier in this function
5215 // Activate team threads via th_used_in_team
5216 __kmp_add_threads_to_team(team, new_nproc);
5217 }
5218#if KMP_NESTED_HOT_TEAMS
5219 if (__kmp_hot_teams_mode == 0) {
5220 // AC: saved number of threads should correspond to team's value in this
5221 // mode, can be bigger in mode 1, when hot team has threads in reserve
5222 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5223 hot_teams[level].hot_team_nth = new_nproc;
5224#endif // KMP_NESTED_HOT_TEAMS
5225 /* release the extra threads we don't need any more */
5226 for (f = new_nproc; f < team->t.t_nproc; f++) {
5227 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5228 if (__kmp_tasking_mode != tskm_immediate_exec) {
5229 // When decreasing team size, threads no longer in the team should
5230 // unref task team.
5231 team->t.t_threads[f]->th.th_task_team = NULL;
5232 }
5233 __kmp_free_thread(team->t.t_threads[f]);
5234 team->t.t_threads[f] = NULL;
5235 }
5236#if KMP_NESTED_HOT_TEAMS
5237 } // (__kmp_hot_teams_mode == 0)
5238 else {
5239 // When keeping extra threads in team, switch threads to wait on own
5240 // b_go flag
5241 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5242 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5243 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5244 for (int b = 0; b < bs_last_barrier; ++b) {
5245 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5246 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5247 }
5248 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5249 }
5250 }
5251 }
5252#endif // KMP_NESTED_HOT_TEAMS
5253 team->t.t_nproc = new_nproc;
5254 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5255 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5256 __kmp_reinitialize_team(team, new_icvs,
5257 root->r.r_uber_thread->th.th_ident);
5258
5259 // Update remaining threads
5260 for (f = 0; f < new_nproc; ++f) {
5261 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5262 }
5263
5264 // restore the current task state of the primary thread: should be the
5265 // implicit task
5266 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5267 team->t.t_threads[0], team));
5268
5269 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5270
5271#ifdef KMP_DEBUG
5272 for (f = 0; f < team->t.t_nproc; f++) {
5273 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5274 team->t.t_threads[f]->th.th_team_nproc ==
5275 team->t.t_nproc);
5276 }
5277#endif
5278
5279 if (do_place_partition) {
5280 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5281#if KMP_AFFINITY_SUPPORTED
5282 __kmp_partition_places(team);
5283#endif
5284 }
5285 } else { // team->t.t_nproc < new_nproc
5286#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5287 kmp_affin_mask_t *old_mask;
5288 if (KMP_AFFINITY_CAPABLE()) {
5289 KMP_CPU_ALLOC(old_mask);
5290 }
5291#endif
5292
5293 KA_TRACE(20,
5294 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5295 new_nproc));
5296 int old_nproc = team->t.t_nproc; // save old value; used to update only the new threads below
5297 team->t.t_size_changed = 1;
5298
5299#if KMP_NESTED_HOT_TEAMS
5300 int avail_threads = hot_teams[level].hot_team_nth;
5301 if (new_nproc < avail_threads)
5302 avail_threads = new_nproc;
5303 kmp_info_t **other_threads = team->t.t_threads;
5304 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5305 // Adjust barrier data of reserved threads (if any) of the team
5306 // Other data will be set in __kmp_initialize_info() below.
5307 int b;
5308 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5309 for (b = 0; b < bs_last_barrier; ++b) {
5310 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5311 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5312#if USE_DEBUGGER
5313 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5314#endif
5315 }
5316 }
5317 if (hot_teams[level].hot_team_nth >= new_nproc) {
5318 // we have all needed threads in reserve, no need to allocate any
5319 // this is only possible in mode 1; there cannot be reserved threads in mode 0
5320 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5321 team->t.t_nproc = new_nproc; // just get reserved threads involved
5322 } else {
5323 // We may have some threads in reserve, but not enough;
5324 // get reserved threads involved if any.
5325 team->t.t_nproc = hot_teams[level].hot_team_nth;
5326 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5327#endif // KMP_NESTED_HOT_TEAMS
5328 if (team->t.t_max_nproc < new_nproc) {
5329 /* reallocate larger arrays */
5330 __kmp_reallocate_team_arrays(team, new_nproc);
5331 __kmp_reinitialize_team(team, new_icvs, NULL);
5332 }
5333
5334#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5335 /* Temporarily set full mask for primary thread before creation of
5336 workers. The reason is that workers inherit the affinity from the
5337 primary thread, so if a lot of workers are created on the single
5338 core quickly, they don't get a chance to set their own affinity for
5339 a long time. */
5340 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5341#endif
5342
5343 /* allocate new threads for the hot team */
5344 for (f = team->t.t_nproc; f < new_nproc; f++) {
5345 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5346 KMP_DEBUG_ASSERT(new_worker);
5347 team->t.t_threads[f] = new_worker;
5348
5349 KA_TRACE(20,
5350 ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5351 "join=%llu, plain=%llu\n",
5352 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5353 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5354 team->t.t_bar[bs_plain_barrier].b_arrived));
5355
5356 { // Initialize barrier data for new threads.
5357 int b;
5358 kmp_balign_t *balign = new_worker->th.th_bar;
5359 for (b = 0; b < bs_last_barrier; ++b) {
5360 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5361 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5362 KMP_BARRIER_PARENT_FLAG);
5363#if USE_DEBUGGER
5364 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5365#endif
5366 }
5367 }
5368 }
5369
5370#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5371 if (KMP_AFFINITY_CAPABLE()) {
5372 /* Restore initial primary thread's affinity mask */
5373 __kmp_set_system_affinity(old_mask, TRUE);
5374 KMP_CPU_FREE(old_mask);
5375 }
5376#endif
5377#if KMP_NESTED_HOT_TEAMS
5378 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5379#endif // KMP_NESTED_HOT_TEAMS
5380 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5381 // Barrier size already increased earlier in this function
5382 // Activate team threads via th_used_in_team
5383 __kmp_add_threads_to_team(team, new_nproc);
5384 }
5385 /* make sure everyone is synchronized */
5386 // new threads below
5387 __kmp_initialize_team(team, new_nproc, new_icvs,
5388 root->r.r_uber_thread->th.th_ident);
5389
5390 /* reinitialize the threads */
5391 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5392 for (f = 0; f < team->t.t_nproc; ++f)
5393 __kmp_initialize_info(team->t.t_threads[f], team, f,
5394 __kmp_gtid_from_tid(f, team));
5395
5396 if (level) { // set th_task_state for new threads in nested hot team
5397 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5398 // only need to set the th_task_state for the new threads. th_task_state
5399 // for primary thread will not be accurate until after this in
5400 // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5401 // get the correct value.
5402 for (f = old_nproc; f < team->t.t_nproc; ++f)
5403 team->t.t_threads[f]->th.th_task_state =
5404 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5405 } else { // set th_task_state for new threads in non-nested hot team
5406 // copy primary thread's state
5407 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5408 for (f = old_nproc; f < team->t.t_nproc; ++f)
5409 team->t.t_threads[f]->th.th_task_state = old_state;
5410 }
5411
5412#ifdef KMP_DEBUG
5413 for (f = 0; f < team->t.t_nproc; ++f) {
5414 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5415 team->t.t_threads[f]->th.th_team_nproc ==
5416 team->t.t_nproc);
5417 }
5418#endif
5419
5420 if (do_place_partition) {
5421 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5422#if KMP_AFFINITY_SUPPORTED
5423 __kmp_partition_places(team);
5424#endif
5425 }
5426 } // Check changes in number of threads
5427
5428 kmp_info_t *master = team->t.t_threads[0];
5429 if (master->th.th_teams_microtask) {
5430 for (f = 1; f < new_nproc; ++f) {
5431 // propagate teams construct specific info to workers
5432 kmp_info_t *thr = team->t.t_threads[f];
5433 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5434 thr->th.th_teams_level = master->th.th_teams_level;
5435 thr->th.th_teams_size = master->th.th_teams_size;
5436 }
5437 }
5438#if KMP_NESTED_HOT_TEAMS
5439 if (level) {
5440 // Sync barrier state for nested hot teams, not needed for outermost hot
5441 // team.
5442 for (f = 1; f < new_nproc; ++f) {
5443 kmp_info_t *thr = team->t.t_threads[f];
5444 int b;
5445 kmp_balign_t *balign = thr->th.th_bar;
5446 for (b = 0; b < bs_last_barrier; ++b) {
5447 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5448 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5449#if USE_DEBUGGER
5450 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5451#endif
5452 }
5453 }
5454 }
5455#endif // KMP_NESTED_HOT_TEAMS
5456
5457 /* reallocate space for arguments if necessary */
5458 __kmp_alloc_argv_entries(argc, team, TRUE);
5459 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5460 // The hot team re-uses the previous task team,
5461 // if untouched during the previous release->gather phase.
5462
5463 KF_TRACE(10, (" hot_team = %p\n", team));
5464
5465#if KMP_DEBUG
5466 if (__kmp_tasking_mode != tskm_immediate_exec) {
5467 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5468 "task_team[1] = %p after reinit\n",
5469 team->t.t_task_team[0], team->t.t_task_team[1]));
5470 }
5471#endif
5472
5473#if OMPT_SUPPORT
5474 __ompt_team_assign_id(team, ompt_parallel_data);
5475#endif
5476
5477 KMP_MB();
5478
5479 return team;
5480 }
5481
5482 /* next, let's try to take one from the team pool */
5483 KMP_MB();
5484 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5485 /* TODO: consider resizing undersized teams instead of reaping them, now
5486 that we have a resizing mechanism */
5487 if (team->t.t_max_nproc >= max_nproc) {
5488 /* take this team from the team pool */
5489 __kmp_team_pool = team->t.t_next_pool;
5490
5491 if (max_nproc > 1 &&
5492 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5493 if (!team->t.b) { // Allocate barrier structure
5494 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5495 }
5496 }
5497
5498 /* setup the team for fresh use */
5499 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5500
5501 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5502 "task_team[1] %p to NULL\n",
5503 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5504 team->t.t_task_team[0] = NULL;
5505 team->t.t_task_team[1] = NULL;
5506
5507 /* reallocate space for arguments if necessary */
5508 __kmp_alloc_argv_entries(argc, team, TRUE);
5509 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5510
5511 KA_TRACE(
5512 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5513 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5514 { // Initialize barrier data.
5515 int b;
5516 for (b = 0; b < bs_last_barrier; ++b) {
5517 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5518#if USE_DEBUGGER
5519 team->t.t_bar[b].b_master_arrived = 0;
5520 team->t.t_bar[b].b_team_arrived = 0;
5521#endif
5522 }
5523 }
5524
5525 team->t.t_proc_bind = new_proc_bind;
5526
5527 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5528 team->t.t_id));
5529
5530#if OMPT_SUPPORT
5531 __ompt_team_assign_id(team, ompt_parallel_data);
5532#endif
5533
5534 KMP_MB();
5535
5536 return team;
5537 }
5538
5539 /* reap team if it is too small, then loop back and check the next one */
5540 // not sure if this is wise, but it will be redone during the hot-teams
5541 // rewrite.
5542 /* TODO: Use technique to find the right size hot-team, don't reap them */
5543 team = __kmp_reap_team(team);
5544 __kmp_team_pool = team;
5545 }
5546
5547 /* nothing available in the pool, no matter, make a new team! */
5548 KMP_MB();
5549 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5550
5551 /* and set it up */
5552 team->t.t_max_nproc = max_nproc;
5553 if (max_nproc > 1 &&
5554 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5555 // Allocate barrier structure
5556 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5557 }
5558
5559 /* NOTE well, for some reason allocating one big buffer and dividing it up
5560 seems to really hurt performance a lot on the P4, so let's not use this */
5561 __kmp_allocate_team_arrays(team, max_nproc);
5562
5563 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5564 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5565
5566 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5567 "%p to NULL\n",
5568 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5569 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5570 // memory, no need to duplicate
5571 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5572 // memory, no need to duplicate
5573
5574 if (__kmp_storage_map) {
5575 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5576 }
5577
5578 /* allocate space for arguments */
5579 __kmp_alloc_argv_entries(argc, team, FALSE);
5580 team->t.t_argc = argc;
5581
5582 KA_TRACE(20,
5583 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5584 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5585 { // Initialize barrier data.
5586 int b;
5587 for (b = 0; b < bs_last_barrier; ++b) {
5588 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5589#if USE_DEBUGGER
5590 team->t.t_bar[b].b_master_arrived = 0;
5591 team->t.t_bar[b].b_team_arrived = 0;
5592#endif
5593 }
5594 }
5595
5596 team->t.t_proc_bind = new_proc_bind;
5597
5598#if OMPT_SUPPORT
5599 __ompt_team_assign_id(team, ompt_parallel_data);
5600 team->t.ompt_serialized_team_info = NULL;
5601#endif
5602
5603 KMP_MB();
5604
5605 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5606 team->t.t_id));
5607
5608 return team;
5609}
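
/* Illustrative sketch (user-level code, not part of the runtime): the hot-team
   paths above -- same size, shrink, grow -- correspond to what happens when an
   application re-enters parallel regions with an unchanged or changed thread
   count. */
#if 0
#include <omp.h>

void example_hot_team_reuse(void) {
  omp_set_num_threads(8);
  #pragma omp parallel
  { /* first region: a hot team of 8 threads is created */ }

  #pragma omp parallel
  { /* same size: the hot team is reused without re-forking OS threads */ }

  omp_set_num_threads(4);
  #pragma omp parallel
  { /* smaller: extra hot-team threads are released or kept in reserve */ }

  omp_set_num_threads(8);
  #pragma omp parallel
  { /* larger again: threads are re-allocated or taken back from reserve */ }
}
#endif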
5610
5611/* TODO implement hot-teams at all levels */
5612/* TODO implement lazy thread release on demand (disband request) */
5613
5614/* free the team. return it to the team pool. release all the threads
5615 * associated with it */
5616void __kmp_free_team(kmp_root_t *root,
5617 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5618 int f;
5619 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5620 team->t.t_id));
5621
5622 /* verify state */
5623 KMP_DEBUG_ASSERT(root);
5624 KMP_DEBUG_ASSERT(team);
5625 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5626 KMP_DEBUG_ASSERT(team->t.t_threads);
5627
5628 int use_hot_team = team == root->r.r_hot_team;
5629#if KMP_NESTED_HOT_TEAMS
5630 int level;
5631 if (master) {
5632 level = team->t.t_active_level - 1;
5633 if (master->th.th_teams_microtask) { // in teams construct?
5634 if (master->th.th_teams_size.nteams > 1) {
5635 ++level; // level was not increased in teams construct for
5636 // team_of_masters
5637 }
5638 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5639 master->th.th_teams_level == team->t.t_level) {
5640 ++level; // level was not increased in teams construct for
5641 // team_of_workers before the parallel
5642 } // team->t.t_level will be increased inside parallel
5643 }
5644#if KMP_DEBUG
5645 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5646#endif
5647 if (level < __kmp_hot_teams_max_level) {
5648 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5649 use_hot_team = 1;
5650 }
5651 }
5652#endif // KMP_NESTED_HOT_TEAMS
5653
5654 /* team is done working */
5655 TCW_SYNC_PTR(team->t.t_pkfn,
5656 NULL); // Important for Debugging Support Library.
5657#if KMP_OS_WINDOWS
5658 team->t.t_copyin_counter = 0; // init counter for possible reuse
5659#endif
5660 // Do not reset pointer to parent team to NULL for hot teams.
5661
5662 /* if we are non-hot team, release our threads */
5663 if (!use_hot_team) {
5664 if (__kmp_tasking_mode != tskm_immediate_exec) {
5665 // Wait for threads to reach reapable state
5666 for (f = 1; f < team->t.t_nproc; ++f) {
5667 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5668 kmp_info_t *th = team->t.t_threads[f];
5669 volatile kmp_uint32 *state = &th->th.th_reap_state;
5670 while (*state != KMP_SAFE_TO_REAP) {
5671#if KMP_OS_WINDOWS
5672 // On Windows a thread can be killed at any time, check this
5673 DWORD ecode;
5674 if (!__kmp_is_thread_alive(th, &ecode)) {
5675 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5676 break;
5677 }
5678#endif
5679 // first check if thread is sleeping
5680 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5681 if (fl.is_sleeping())
5682 fl.resume(__kmp_gtid_from_thread(th));
5683 KMP_CPU_PAUSE();
5684 }
5685 }
5686
5687 // Delete task teams
5688 int tt_idx;
5689 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5690 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5691 if (task_team != NULL) {
5692 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5693 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5694 team->t.t_threads[f]->th.th_task_team = NULL;
5695 }
5696 KA_TRACE(
5697 20,
5698 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5699 __kmp_get_gtid(), task_team, team->t.t_id));
5700#if KMP_NESTED_HOT_TEAMS
5701 __kmp_free_task_team(master, task_team);
5702#endif
5703 team->t.t_task_team[tt_idx] = NULL;
5704 }
5705 }
5706 }
5707
5708 // Reset pointer to parent team only for non-hot teams.
5709 team->t.t_parent = NULL;
5710 team->t.t_level = 0;
5711 team->t.t_active_level = 0;
5712
5713 /* free the worker threads */
5714 for (f = 1; f < team->t.t_nproc; ++f) {
5715 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5716 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5717 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5718 1, 2);
5719 }
5720 __kmp_free_thread(team->t.t_threads[f]);
5721 }
5722
5723 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5724 if (team->t.b) {
5725 // wake up thread at old location
5726 team->t.b->go_release();
5727 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5728 for (f = 1; f < team->t.t_nproc; ++f) {
5729 if (team->t.b->sleep[f].sleep) {
5730 __kmp_atomic_resume_64(
5731 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5732 (kmp_atomic_flag_64<> *)NULL);
5733 }
5734 }
5735 }
5736 // Wait for threads to be removed from team
5737 for (int f = 1; f < team->t.t_nproc; ++f) {
5738 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5739 KMP_CPU_PAUSE();
5740 }
5741 }
5742 }
5743
5744 for (f = 1; f < team->t.t_nproc; ++f) {
5745 team->t.t_threads[f] = NULL;
5746 }
5747
5748 if (team->t.t_max_nproc > 1 &&
5749 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5750 distributedBarrier::deallocate(team->t.b);
5751 team->t.b = NULL;
5752 }
5753 /* put the team back in the team pool */
5754 /* TODO limit size of team pool, call reap_team if pool too large */
5755 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5756 __kmp_team_pool = (volatile kmp_team_t *)team;
5757 } else { // Check if team was created for primary threads in teams construct
5758 // See if first worker is a CG root
5759 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5760 team->t.t_threads[1]->th.th_cg_roots);
5761 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5762 // Clean up the CG root nodes on workers so that this team can be re-used
5763 for (f = 1; f < team->t.t_nproc; ++f) {
5764 kmp_info_t *thr = team->t.t_threads[f];
5765 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5766 thr->th.th_cg_roots->cg_root == thr);
5767 // Pop current CG root off list
5768 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5769 thr->th.th_cg_roots = tmp->up;
5770 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5771 " up to node %p. cg_nthreads was %d\n",
5772 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5773 int i = tmp->cg_nthreads--;
5774 if (i == 1) {
5775 __kmp_free(tmp); // free CG if we are the last thread in it
5776 }
5777 // Restore current task's thread_limit from CG root
5778 if (thr->th.th_cg_roots)
5779 thr->th.th_current_task->td_icvs.thread_limit =
5780 thr->th.th_cg_roots->cg_thread_limit;
5781 }
5782 }
5783 }
5784
5785 KMP_MB();
5786}
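
/* Illustrative sketch (user-level code, not part of the runtime): the
   contention-group (CG root) bookkeeping cleaned up above originates from the
   teams construct, where each team's initial thread becomes a CG root carrying
   its own thread_limit. Assumes an OpenMP 5.x compiler that accepts teams
   outside of a target region. */
#if 0
#include <omp.h>
#include <stdio.h>

void example_teams_contention_groups(void) {
  #pragma omp teams num_teams(2) thread_limit(4)
  {
    // Each team's primary thread is the root of its own contention group.
    #pragma omp parallel
    {
      if (omp_get_thread_num() == 0)
        printf("team %d: thread_limit allows up to %d threads\n",
               omp_get_team_num(), omp_get_thread_limit());
    }
  }
}
#endif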
5787
5788/* reap the team. destroy it, reclaim all its resources and free its memory */
5789kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5790 kmp_team_t *next_pool = team->t.t_next_pool;
5791
5792 KMP_DEBUG_ASSERT(team);
5793 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5794 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5795 KMP_DEBUG_ASSERT(team->t.t_threads);
5796 KMP_DEBUG_ASSERT(team->t.t_argv);
5797
5798 /* TODO clean the threads that are a part of this? */
5799
5800 /* free stuff */
5801 __kmp_free_team_arrays(team);
5802 if (team->t.t_argv != &team->t.t_inline_argv[0])
5803 __kmp_free((void *)team->t.t_argv);
5804 __kmp_free(team);
5805
5806 KMP_MB();
5807 return next_pool;
5808}
5809
5810// Free the thread. Don't reap it, just place it on the pool of available
5811// threads.
5812//
5813// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5814// binding for the affinity mechanism to be useful.
5815//
5816// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5817// However, we want to avoid a potential performance problem by always
5818// scanning through the list to find the correct point at which to insert
5819// the thread (potential N**2 behavior). To do this we keep track of the
5820// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5821// With single-level parallelism, threads will always be added to the tail
5822// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5823// parallelism, all bets are off and we may need to scan through the entire
5824// free list.
5825//
5826// This change also has a potentially large performance benefit, for some
5827// applications. Previously, as threads were freed from the hot team, they
5828// would be placed back on the free list in inverse order. If the hot team
5829// grew back to its original size, then the freed threads would be placed
5830// back on the hot team in reverse order. This could cause bad cache
5831// locality problems on programs where the size of the hot team regularly
5832// grew and shrunk.
5833//
5834// Now, for single-level parallelism, the OMP tid is always == gtid.
5835void __kmp_free_thread(kmp_info_t *this_th) {
5836 int gtid;
5837 kmp_info_t **scan;
5838
5839 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5840 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5841
5842 KMP_DEBUG_ASSERT(this_th);
5843
5844 // When moving a thread to the pool, switch it to wait on its own b_go flag
5845 // and clear its team pointer (NULL team).
5846 int b;
5847 kmp_balign_t *balign = this_th->th.th_bar;
5848 for (b = 0; b < bs_last_barrier; ++b) {
5849 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5850 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5851 balign[b].bb.team = NULL;
5852 balign[b].bb.leaf_kids = 0;
5853 }
5854 this_th->th.th_task_state = 0;
5855 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5856
5857 /* put thread back on the free pool */
5858 TCW_PTR(this_th->th.th_team, NULL);
5859 TCW_PTR(this_th->th.th_root, NULL);
5860 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5861
5862 while (this_th->th.th_cg_roots) {
5863 this_th->th.th_cg_roots->cg_nthreads--;
5864 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5865 " %p of thread %p to %d\n",
5866 this_th, this_th->th.th_cg_roots,
5867 this_th->th.th_cg_roots->cg_root,
5868 this_th->th.th_cg_roots->cg_nthreads));
5869 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5870 if (tmp->cg_root == this_th) { // Thread is a cg_root
5871 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5872 KA_TRACE(
5873 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5874 this_th->th.th_cg_roots = tmp->up;
5875 __kmp_free(tmp);
5876 } else { // Worker thread
5877 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5878 __kmp_free(tmp);
5879 }
5880 this_th->th.th_cg_roots = NULL;
5881 break;
5882 }
5883 }
5884
5885 /* If the implicit task assigned to this thread can be used by other threads,
5886 * multiple threads may share the data and try to free the task at
5887 * __kmp_reap_thread at exit. This duplicate use of the task data is more
5888 * likely when the hot team is disabled, but it can occur even when the hot
5889 * team is enabled */
5890 __kmp_free_implicit_task(this_th);
5891 this_th->th.th_current_task = NULL;
5892
5893 // If the __kmp_thread_pool_insert_pt is already past the new insert
5894 // point, then we need to re-scan the entire list.
5895 gtid = this_th->th.th_info.ds.ds_gtid;
5896 if (__kmp_thread_pool_insert_pt != NULL) {
5897 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5898 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5899 __kmp_thread_pool_insert_pt = NULL;
5900 }
5901 }
5902
5903 // Scan down the list to find the place to insert the thread.
5904 // scan is the address of a link in the list, possibly the address of
5905 // __kmp_thread_pool itself.
5906 //
5907 // In the absence of nested parallelism, the for loop will have 0 iterations.
5908 if (__kmp_thread_pool_insert_pt != NULL) {
5909 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5910 } else {
5911 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5912 }
5913 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5914 scan = &((*scan)->th.th_next_pool))
5915 ;
5916
5917 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5918 // to its address.
5919 TCW_PTR(this_th->th.th_next_pool, *scan);
5920 __kmp_thread_pool_insert_pt = *scan = this_th;
5921 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5922 (this_th->th.th_info.ds.ds_gtid <
5923 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5924 TCW_4(this_th->th.th_in_pool, TRUE);
5925 __kmp_suspend_initialize_thread(this_th);
5926 __kmp_lock_suspend_mx(this_th);
5927 if (this_th->th.th_active == TRUE) {
5928 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5929 this_th->th.th_active_in_pool = TRUE;
5930 }
5931#if KMP_DEBUG
5932 else {
5933 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5934 }
5935#endif
5936 __kmp_unlock_suspend_mx(this_th);
5937
5938 TCW_4(__kmp_nth, __kmp_nth - 1);
5939
5940#ifdef KMP_ADJUST_BLOCKTIME
5941 /* Adjust blocktime back to user setting or default if necessary */
5942 /* Middle initialization might never have occurred */
5943 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5944 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5945 if (__kmp_nth <= __kmp_avail_proc) {
5946 __kmp_zero_bt = FALSE;
5947 }
5948 }
5949#endif /* KMP_ADJUST_BLOCKTIME */
5950
5951 KMP_MB();
5952}
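
/* Illustrative sketch (not part of the runtime): the sorted-insert-with-cached-
   insert-point technique described above, reduced to a plain singly linked
   list. The example_* names are hypothetical. */
#if 0
typedef struct example_node {
  int key; // plays the role of the gtid
  struct example_node *next;
} example_node_t;

static example_node_t *example_pool = NULL;      // list kept sorted by key
static example_node_t *example_insert_pt = NULL; // last insertion point

static void example_sorted_insert(example_node_t *n) {
  example_node_t **scan;
  // If the cached insert point is already past the new key, rescan from head.
  if (example_insert_pt && example_insert_pt->key > n->key)
    example_insert_pt = NULL;
  scan = example_insert_pt ? &example_insert_pt->next : &example_pool;
  while (*scan && (*scan)->key < n->key)
    scan = &(*scan)->next;
  n->next = *scan; // splice in, keeping the list sorted
  *scan = n;
  example_insert_pt = n; // remember where we inserted for next time
}
#endif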
5953
5954/* ------------------------------------------------------------------------ */
5955
5956void *__kmp_launch_thread(kmp_info_t *this_thr) {
5957#if OMP_PROFILING_SUPPORT
5958 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5959 // TODO: add a configuration option for time granularity
5960 if (ProfileTraceFile)
5961 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5962#endif
5963
5964 int gtid = this_thr->th.th_info.ds.ds_gtid;
5965 /* void *stack_data;*/
5966 kmp_team_t **volatile pteam;
5967
5968 KMP_MB();
5969 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5970
5971 if (__kmp_env_consistency_check) {
5972 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5973 }
5974
5975#if OMPD_SUPPORT
5976 if (ompd_state & OMPD_ENABLE_BP)
5977 ompd_bp_thread_begin();
5978#endif
5979
5980#if OMPT_SUPPORT
5981 ompt_data_t *thread_data = nullptr;
5982 if (ompt_enabled.enabled) {
5983 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5984 *thread_data = ompt_data_none;
5985
5986 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5987 this_thr->th.ompt_thread_info.wait_id = 0;
5988 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5989 this_thr->th.ompt_thread_info.parallel_flags = 0;
5990 if (ompt_enabled.ompt_callback_thread_begin) {
5991 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5992 ompt_thread_worker, thread_data);
5993 }
5994 this_thr->th.ompt_thread_info.state = ompt_state_idle;
5995 }
5996#endif
5997
5998 /* This is the place where threads wait for work */
5999 while (!TCR_4(__kmp_global.g.g_done)) {
6000 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6001 KMP_MB();
6002
6003 /* wait for work to do */
6004 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6005
6006 /* No tid yet since not part of a team */
6007 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6008
6009#if OMPT_SUPPORT
6010 if (ompt_enabled.enabled) {
6011 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6012 }
6013#endif
6014
6015 pteam = &this_thr->th.th_team;
6016
6017 /* have we been allocated? */
6018 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6019 /* we were just woken up, so run our new task */
6020 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6021 int rc;
6022 KA_TRACE(20,
6023 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6024 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6025 (*pteam)->t.t_pkfn));
6026
6027 updateHWFPControl(*pteam);
6028
6029#if OMPT_SUPPORT
6030 if (ompt_enabled.enabled) {
6031 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6032 }
6033#endif
6034
6035 rc = (*pteam)->t.t_invoke(gtid);
6036 KMP_ASSERT(rc);
6037
6038 KMP_MB();
6039 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6040 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6041 (*pteam)->t.t_pkfn));
6042 }
6043#if OMPT_SUPPORT
6044 if (ompt_enabled.enabled) {
6045 /* no frame set while outside task */
6046 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6047
6048 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6049 }
6050#endif
6051 /* join barrier after parallel region */
6052 __kmp_join_barrier(gtid);
6053 }
6054 }
6055 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6056
6057#if OMPD_SUPPORT
6058 if (ompd_state & OMPD_ENABLE_BP)
6059 ompd_bp_thread_end();
6060#endif
6061
6062#if OMPT_SUPPORT
6063 if (ompt_enabled.ompt_callback_thread_end) {
6064 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6065 }
6066#endif
6067
6068 this_thr->th.th_task_team = NULL;
6069 /* run the destructors for the threadprivate data for this thread */
6070 __kmp_common_destroy_gtid(gtid);
6071
6072 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6073 KMP_MB();
6074
6075#if OMP_PROFILING_SUPPORT
6076 llvm::timeTraceProfilerFinishThread();
6077#endif
6078 return this_thr;
6079}
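
/* Illustrative sketch (not part of the runtime): a minimal OMPT tool that
   observes the thread_begin/thread_end events dispatched from the loop above.
   This follows the standard OMPT interface in <omp-tools.h>; such a tool would
   typically be loaded via OMP_TOOL_LIBRARIES. The example_* names are
   hypothetical. */
#if 0
#include <omp-tools.h>
#include <stdio.h>

static void example_on_thread_begin(ompt_thread_t type, ompt_data_t *data) {
  data->value = 0; // per-thread tool data slot provided by the runtime
  printf("OMPT: thread begin (type=%d)\n", (int)type);
}

static void example_on_thread_end(ompt_data_t *data) {
  printf("OMPT: thread end\n");
}

static int example_tool_init(ompt_function_lookup_t lookup,
                             int initial_device_num, ompt_data_t *tool_data) {
  ompt_set_callback_t set_cb = (ompt_set_callback_t)lookup("ompt_set_callback");
  set_cb(ompt_callback_thread_begin, (ompt_callback_t)example_on_thread_begin);
  set_cb(ompt_callback_thread_end, (ompt_callback_t)example_on_thread_end);
  return 1; // nonzero: keep the tool active
}

static void example_tool_fini(ompt_data_t *tool_data) {}

ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {
      &example_tool_init, &example_tool_fini, {0}};
  return &result;
}
#endif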
6080
6081/* ------------------------------------------------------------------------ */
6082
6083void __kmp_internal_end_dest(void *specific_gtid) {
6084 // Make sure no significant bits are lost
6085 int gtid;
6086 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6087
6088 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6089 /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
6090 * this is because 0 is reserved for the nothing-stored case */
6091
6092 __kmp_internal_end_thread(gtid);
6093}
6094
6095#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6096
6097__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6098 __kmp_internal_end_atexit();
6099}
6100
6101#endif
6102
6103/* [Windows] josh: when the atexit handler is called, there may still be more
6104 than one thread alive */
6105void __kmp_internal_end_atexit(void) {
6106 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6107 /* [Windows]
6108 josh: ideally, we want to completely shutdown the library in this atexit
6109 handler, but stat code that depends on thread specific data for gtid fails
6110 because that data becomes unavailable at some point during the shutdown, so
6111 we call __kmp_internal_end_thread instead. We should eventually remove the
6112 dependency on __kmp_get_specific_gtid in the stat code and use
6113 __kmp_internal_end_library to cleanly shutdown the library.
6114
6115 // TODO: Can some of this comment about GVS be removed?
6116 I suspect that the offending stat code is executed when the calling thread
6117 tries to clean up a dead root thread's data structures, resulting in GVS
6118 code trying to close the GVS structures for that thread, but since the stat
6119 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6120 the calling thread is cleaning up itself instead of another thread, it gets
6121 confused. This happens because allowing a thread to unregister and clean up
6122 another thread is a recent modification for addressing an issue.
6123 Based on the current design (20050722), a thread may end up
6124 trying to unregister another thread only if thread death does not trigger
6125 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6126 thread specific data destructor function to detect thread death. For
6127 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6128 is nothing. Thus, the workaround is applicable only for Windows static
6129 stat library. */
6130 __kmp_internal_end_library(-1);
6131#if KMP_OS_WINDOWS
6132 __kmp_close_console();
6133#endif
6134}
6135
6136static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6137 // It is assumed __kmp_forkjoin_lock is acquired.
6138
6139 int gtid;
6140
6141 KMP_DEBUG_ASSERT(thread != NULL);
6142
6143 gtid = thread->th.th_info.ds.ds_gtid;
6144
6145 if (!is_root) {
6146 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6147 /* Assume the threads are at the fork barrier here */
6148 KA_TRACE(
6149 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6150 gtid));
6151 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6152 while (
6153 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6154 KMP_CPU_PAUSE();
6155 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6156 } else {
6157 /* Need release fence here to prevent seg faults for tree forkjoin
6158 barrier (GEH) */
6159 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6160 thread);
6161 __kmp_release_64(&flag);
6162 }
6163 }
6164
6165 // Terminate OS thread.
6166 __kmp_reap_worker(thread);
6167
6168 // The thread was killed asynchronously. If it was actively
6169 // spinning in the thread pool, decrement the global count.
6170 //
6171 // There is a small timing hole here - if the worker thread was just waking
6172 // up after sleeping in the pool, had reset its th_active_in_pool flag but
6173 // had not yet decremented the global counter __kmp_thread_pool_active_nth,
6174 // then the global counter might not get updated.
6175 //
6176 // Currently, this can only happen as the library is unloaded,
6177 // so there are no harmful side effects.
6178 if (thread->th.th_active_in_pool) {
6179 thread->th.th_active_in_pool = FALSE;
6180 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6181 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6182 }
6183 }
6184
6185 __kmp_free_implicit_task(thread);
6186
6187// Free the fast memory for tasking
6188#if USE_FAST_MEMORY
6189 __kmp_free_fast_memory(thread);
6190#endif /* USE_FAST_MEMORY */
6191
6192 __kmp_suspend_uninitialize_thread(thread);
6193
6194 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6195 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6196
6197 --__kmp_all_nth;
6198 // __kmp_nth was decremented when the thread was added to the pool.
6199
6200#ifdef KMP_ADJUST_BLOCKTIME
6201 /* Adjust blocktime back to user setting or default if necessary */
6202 /* Middle initialization might never have occurred */
6203 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6204 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6205 if (__kmp_nth <= __kmp_avail_proc) {
6206 __kmp_zero_bt = FALSE;
6207 }
6208 }
6209#endif /* KMP_ADJUST_BLOCKTIME */
6210
6211 /* free the memory being used */
6212 if (__kmp_env_consistency_check) {
6213 if (thread->th.th_cons) {
6214 __kmp_free_cons_stack(thread->th.th_cons);
6215 thread->th.th_cons = NULL;
6216 }
6217 }
6218
6219 if (thread->th.th_pri_common != NULL) {
6220 __kmp_free(thread->th.th_pri_common);
6221 thread->th.th_pri_common = NULL;
6222 }
6223
6224 if (thread->th.th_task_state_memo_stack != NULL) {
6225 __kmp_free(thread->th.th_task_state_memo_stack);
6226 thread->th.th_task_state_memo_stack = NULL;
6227 }
6228
6229#if KMP_USE_BGET
6230 if (thread->th.th_local.bget_data != NULL) {
6231 __kmp_finalize_bget(thread);
6232 }
6233#endif
6234
6235#if KMP_AFFINITY_SUPPORTED
6236 if (thread->th.th_affin_mask != NULL) {
6237 KMP_CPU_FREE(thread->th.th_affin_mask);
6238 thread->th.th_affin_mask = NULL;
6239 }
6240#endif /* KMP_AFFINITY_SUPPORTED */
6241
6242#if KMP_USE_HIER_SCHED
6243 if (thread->th.th_hier_bar_data != NULL) {
6244 __kmp_free(thread->th.th_hier_bar_data);
6245 thread->th.th_hier_bar_data = NULL;
6246 }
6247#endif
6248
6249 __kmp_reap_team(thread->th.th_serial_team);
6250 thread->th.th_serial_team = NULL;
6251 __kmp_free(thread);
6252
6253 KMP_MB();
6254
6255} // __kmp_reap_thread
6256
6257static void __kmp_itthash_clean(kmp_info_t *th) {
6258#if USE_ITT_NOTIFY
6259 if (__kmp_itt_region_domains.count > 0) {
6260 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6261 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6262 while (bucket) {
6263 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6264 __kmp_thread_free(th, bucket);
6265 bucket = next;
6266 }
6267 }
6268 }
6269 if (__kmp_itt_barrier_domains.count > 0) {
6270 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6271 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6272 while (bucket) {
6273 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6274 __kmp_thread_free(th, bucket);
6275 bucket = next;
6276 }
6277 }
6278 }
6279#endif
6280}
6281
6282static void __kmp_internal_end(void) {
6283 int i;
6284
6285 /* First, unregister the library */
6286 __kmp_unregister_library();
6287
6288#if KMP_OS_WINDOWS
6289 /* In Win static library, we can't tell when a root actually dies, so we
6290 reclaim the data structures for any root threads that have died but not
6291 unregistered themselves, in order to shut down cleanly.
6292 In Win dynamic library we also can't tell when a thread dies. */
6293 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6294// dead roots
6295#endif
6296
6297 for (i = 0; i < __kmp_threads_capacity; i++)
6298 if (__kmp_root[i])
6299 if (__kmp_root[i]->r.r_active)
6300 break;
6301 KMP_MB(); /* Flush all pending memory write invalidates. */
6302 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6303
6304 if (i < __kmp_threads_capacity) {
6305#if KMP_USE_MONITOR
6306 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6307 KMP_MB(); /* Flush all pending memory write invalidates. */
6308
6309 // Need to check that monitor was initialized before reaping it. If we are
6310 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6311 // __kmp_monitor will appear to contain valid data, but it is only valid in
6312 // the parent process, not the child.
6313 // New behavior (201008): instead of keying off of the flag
6314 // __kmp_init_parallel, the monitor thread creation is keyed off
6315 // of the new flag __kmp_init_monitor.
6316 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6317 if (TCR_4(__kmp_init_monitor)) {
6318 __kmp_reap_monitor(&__kmp_monitor);
6319 TCW_4(__kmp_init_monitor, 0);
6320 }
6321 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6322 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6323#endif // KMP_USE_MONITOR
6324 } else {
6325/* TODO move this to cleanup code */
6326#ifdef KMP_DEBUG
6327 /* make sure that everything has properly ended */
6328 for (i = 0; i < __kmp_threads_capacity; i++) {
6329 if (__kmp_root[i]) {
6330 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6331 // there can be uber threads alive here
6332 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6333 }
6334 }
6335#endif
6336
6337 KMP_MB();
6338
6339 // Reap the worker threads.
6340 // This is valid for now, but be careful if threads are reaped sooner.
6341 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6342 // Get the next thread from the pool.
6343 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6344 __kmp_thread_pool = thread->th.th_next_pool;
6345 // Reap it.
6346 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6347 thread->th.th_next_pool = NULL;
6348 thread->th.th_in_pool = FALSE;
6349 __kmp_reap_thread(thread, 0);
6350 }
6351 __kmp_thread_pool_insert_pt = NULL;
6352
6353 // Reap teams.
6354 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6355 // Get the next team from the pool.
6356 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6357 __kmp_team_pool = team->t.t_next_pool;
6358 // Reap it.
6359 team->t.t_next_pool = NULL;
6360 __kmp_reap_team(team);
6361 }
6362
6363 __kmp_reap_task_teams();
6364
6365#if KMP_OS_UNIX
6366 // Threads that are not reaped should not access any resources since they
6367 // are going to be deallocated soon, so the shutdown sequence should wait
6368 // until all threads either exit the final spin-waiting loop or begin
6369 // sleeping after the given blocktime.
6370 for (i = 0; i < __kmp_threads_capacity; i++) {
6371 kmp_info_t *thr = __kmp_threads[i];
6372 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6373 KMP_CPU_PAUSE();
6374 }
6375#endif
6376
6377 for (i = 0; i < __kmp_threads_capacity; ++i) {
6378 // TBD: Add some checking...
6379 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6380 }
6381
6382 /* Make sure all threadprivate destructors get run by joining with all
6383 worker threads before resetting this flag */
6384 TCW_SYNC_4(__kmp_init_common, FALSE);
6385
6386 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6387 KMP_MB();
6388
6389#if KMP_USE_MONITOR
6390 // See note above: One of the possible fixes for CQ138434 / CQ140126
6391 //
6392 // FIXME: push both code fragments down and CSE them?
6393 // push them into __kmp_cleanup() ?
6394 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6395 if (TCR_4(__kmp_init_monitor)) {
6396 __kmp_reap_monitor(&__kmp_monitor);
6397 TCW_4(__kmp_init_monitor, 0);
6398 }
6399 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6400 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6401#endif
6402 } /* else !__kmp_global.t_active */
6403 TCW_4(__kmp_init_gtid, FALSE);
6404 KMP_MB(); /* Flush all pending memory write invalidates. */
6405
6406 __kmp_cleanup();
6407#if OMPT_SUPPORT
6408 ompt_fini();
6409#endif
6410}
6411
6412void __kmp_internal_end_library(int gtid_req) {
6413 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6414 /* this shouldn't be a race condition because __kmp_internal_end() is the
6415 only place to clear __kmp_init_serial */
6416 /* we'll check this later too, after we get the lock */
6417 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6418 // redundant, because the next check will work in any case.
6419 if (__kmp_global.g.g_abort) {
6420 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6421 /* TODO abort? */
6422 return;
6423 }
6424 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6425 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6426 return;
6427 }
6428
6429 // If hidden helper team has been initialized, we need to deinit it
6430 if (TCR_4(__kmp_init_hidden_helper) &&
6431 !TCR_4(__kmp_hidden_helper_team_done)) {
6432 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6433 // First release the main thread to let it continue its work
6434 __kmp_hidden_helper_main_thread_release();
6435 // Wait until the hidden helper team has been destroyed
6436 __kmp_hidden_helper_threads_deinitz_wait();
6437 }
6438
6439 KMP_MB(); /* Flush all pending memory write invalidates. */
6440 /* find out who we are and what we should do */
6441 {
6442 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6443 KA_TRACE(
6444 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6445 if (gtid == KMP_GTID_SHUTDOWN) {
6446 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6447 "already shutdown\n"));
6448 return;
6449 } else if (gtid == KMP_GTID_MONITOR) {
6450 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6451 "registered, or system shutdown\n"));
6452 return;
6453 } else if (gtid == KMP_GTID_DNE) {
6454 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6455 "shutdown\n"));
6456 /* we don't know who we are, but we may still shut down the library */
6457 } else if (KMP_UBER_GTID(gtid)) {
6458 /* unregister ourselves as an uber thread. gtid is no longer valid */
6459 if (__kmp_root[gtid]->r.r_active) {
6460 __kmp_global.g.g_abort = -1;
6461 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6462 __kmp_unregister_library();
6463 KA_TRACE(10,
6464 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6465 gtid));
6466 return;
6467 } else {
6468 __kmp_itthash_clean(__kmp_threads[gtid]);
6469 KA_TRACE(
6470 10,
6471 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6472 __kmp_unregister_root_current_thread(gtid);
6473 }
6474 } else {
6475/* worker threads may call this function through the atexit handler, if they
6476 * call exit() */
6477/* For now, skip the usual subsequent processing and just dump the debug buffer.
6478 TODO: do a thorough shutdown instead */
6479#ifdef DUMP_DEBUG_ON_EXIT
6480 if (__kmp_debug_buf)
6481 __kmp_dump_debug_buffer();
6482#endif
6483 // The unregister-library call was added here when we switched to shared
6484 // memory on Linux; without it, lots of stale files are left in /dev/shm.
6485 // Clean up the shared memory file before exiting.
6486 __kmp_unregister_library();
6487 return;
6488 }
6489 }
6490 /* synchronize the termination process */
6491 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6492
6493 /* have we already finished */
6494 if (__kmp_global.g.g_abort) {
6495 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6496 /* TODO abort? */
6497 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6498 return;
6499 }
6500 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6501 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6502 return;
6503 }
6504
6505 /* We need this lock to enforce mutex between this reading of
6506 __kmp_threads_capacity and the writing by __kmp_register_root.
6507 Alternatively, we can use a counter of roots that is atomically updated by
6508 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6509 __kmp_internal_end_*. */
6510 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6511
6512 /* now we can safely conduct the actual termination */
6513 __kmp_internal_end();
6514
6515 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6516 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6517
6518 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6519
6520#ifdef DUMP_DEBUG_ON_EXIT
6521 if (__kmp_debug_buf)
6522 __kmp_dump_debug_buffer();
6523#endif
6524
6525#if KMP_OS_WINDOWS
6526 __kmp_close_console();
6527#endif
6528
6529 __kmp_fini_allocator();
6530
6531} // __kmp_internal_end_library
6532
6533void __kmp_internal_end_thread(int gtid_req) {
6534 int i;
6535
6536 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6537 /* this shouldn't be a race condition because __kmp_internal_end() is the
6538 * only place to clear __kmp_init_serial */
6539 /* we'll check this later too, after we get the lock */
6540 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6541 // redundant, because the next check will work in any case.
6542 if (__kmp_global.g.g_abort) {
6543 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6544 /* TODO abort? */
6545 return;
6546 }
6547 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6548 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6549 return;
6550 }
6551
6552 // If hidden helper team has been initialized, we need to deinit it
6553 if (TCR_4(__kmp_init_hidden_helper) &&
6554 !TCR_4(__kmp_hidden_helper_team_done)) {
6555 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6556 // First release the main thread to let it continue its work
6557 __kmp_hidden_helper_main_thread_release();
6558 // Wait until the hidden helper team has been destroyed
6559 __kmp_hidden_helper_threads_deinitz_wait();
6560 }
6561
6562 KMP_MB(); /* Flush all pending memory write invalidates. */
6563
6564 /* find out who we are and what we should do */
6565 {
6566 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6567 KA_TRACE(10,
6568 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6569 if (gtid == KMP_GTID_SHUTDOWN) {
6570 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6571 "already shutdown\n"));
6572 return;
6573 } else if (gtid == KMP_GTID_MONITOR) {
6574 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6575 "registered, or system shutdown\n"));
6576 return;
6577 } else if (gtid == KMP_GTID_DNE) {
6578 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6579 "shutdown\n"));
6580 return;
6581 /* we don't know who we are */
6582 } else if (KMP_UBER_GTID(gtid)) {
6583 /* unregister ourselves as an uber thread. gtid is no longer valid */
6584 if (__kmp_root[gtid]->r.r_active) {
6585 __kmp_global.g.g_abort = -1;
6586 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6587 KA_TRACE(10,
6588 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6589 gtid));
6590 return;
6591 } else {
6592 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6593 gtid));
6594 __kmp_unregister_root_current_thread(gtid);
6595 }
6596 } else {
6597 /* just a worker thread, let's leave */
6598 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6599
6600 if (gtid >= 0) {
6601 __kmp_threads[gtid]->th.th_task_team = NULL;
6602 }
6603
6604 KA_TRACE(10,
6605 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6606 gtid));
6607 return;
6608 }
6609 }
6610#if KMP_DYNAMIC_LIB
6611 if (__kmp_pause_status != kmp_hard_paused)
6612 // AC: let's not shut down the dynamic library at the exit of an uber thread;
6613 // it is better to shut down later, in the library destructor.
6614 {
6615 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6616 return;
6617 }
6618#endif
6619 /* synchronize the termination process */
6620 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6621
6622 /* have we already finished */
6623 if (__kmp_global.g.g_abort) {
6624 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6625 /* TODO abort? */
6626 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6627 return;
6628 }
6629 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6630 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6631 return;
6632 }
6633
6634 /* We need this lock to enforce mutex between this reading of
6635 __kmp_threads_capacity and the writing by __kmp_register_root.
6636 Alternatively, we can use a counter of roots that is atomically updated by
6637 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6638 __kmp_internal_end_*. */
6639
6640 /* should we finish the run-time? are all siblings done? */
6641 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6642
6643 for (i = 0; i < __kmp_threads_capacity; ++i) {
6644 if (KMP_UBER_GTID(i)) {
6645 KA_TRACE(
6646 10,
6647 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6648 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6649 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6650 return;
6651 }
6652 }
6653
6654 /* now we can safely conduct the actual termination */
6655
6656 __kmp_internal_end();
6657
6658 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6659 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6660
6661 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6662
6663#ifdef DUMP_DEBUG_ON_EXIT
6664 if (__kmp_debug_buf)
6665 __kmp_dump_debug_buffer();
6666#endif
6667} // __kmp_internal_end_thread
6668
6669// -----------------------------------------------------------------------------
6670// Library registration stuff.
6671
6672static long __kmp_registration_flag = 0;
6673// Random value used to indicate library initialization.
6674static char *__kmp_registration_str = NULL;
6675// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6676
6677static inline char *__kmp_reg_status_name() {
6678 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6679 each thread. If registration and unregistration happen in different threads
6680 (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6681 env var cannot be found, because its name will contain a different pid. */
6682 // macOS* complains that the name is too long if getuid() is also appended
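// The resulting name looks like "__KMP_REGISTERED_LIB_<pid>" (with a "_<uid>"
// suffix on Unix dynamic builds), e.g. "__KMP_REGISTERED_LIB_12345_1000"
// (illustrative values).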
6683#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6684 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6685 (int)getuid());
6686#else
6687 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6688#endif
6689} // __kmp_reg_status_name
6690
6691void __kmp_register_library_startup(void) {
6692
6693 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6694 int done = 0;
6695 union {
6696 double dtime;
6697 long ltime;
6698 } time;
6699#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6700 __kmp_initialize_system_tick();
6701#endif
6702 __kmp_read_system_time(&time.dtime);
6703 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6704 __kmp_registration_str =
6705 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6706 __kmp_registration_flag, KMP_LIBRARY_FILE);
6707
6708 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6709 __kmp_registration_str));
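 // The value has the form "<flag address>-<flag value>-<library file>", e.g.
 // "0x7f12a4c01234-cafe5678-libomp.so" (illustrative values only); it is split
 // back apart on '-' when checking for an already-registered copy below.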
6710
6711 while (!done) {
6712
6713 char *value = NULL; // Actual value of the environment variable.
6714
6715#if defined(KMP_USE_SHM)
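 // Shared-memory registration: the first process to shm_open() the file with
 // O_CREAT | O_EXCL sizes it and writes its registration string into it; later
 // openers map the existing file and read that string to detect the copy that
 // is already registered.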
6716 char *shm_name = __kmp_str_format("/%s", name);
6717 int shm_preexist = 0;
6718 char *data1;
6719 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6720 if ((fd1 == -1) && (errno == EEXIST)) {
6721 // file didn't open because it already exists.
6722 // try opening existing file
6723 fd1 = shm_open(shm_name, O_RDWR, 0666);
6724 if (fd1 == -1) { // file didn't open
6725 // error out here
6726 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6727 __kmp_msg_null);
6728 } else {
6729 // able to open existing file
6730 shm_preexist = 1;
6731 }
6732 } else if (fd1 == -1) { // SHM didn't open due to an error other than
6733 // EEXIST (already exists);
6734 // error out here.
6735 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6736 __kmp_msg_null);
6737 }
6738 if (shm_preexist == 0) {
6739 // We created the SHM; now set its size.
6740 if (ftruncate(fd1, SHM_SIZE) == -1) {
6741 // an error occurred setting the size;
6742 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6743 KMP_ERR(errno), __kmp_msg_null);
6744 }
6745 }
6746 data1 =
6747 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6748 if (data1 == MAP_FAILED) {
6749 // failed to map shared memory
6750 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6751 __kmp_msg_null);
6752 }
6753 if (shm_preexist == 0) { // We created the SHM; write the value into it.
6754 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6755 }
6756 // Read value from either what we just wrote or existing file.
6757 value = __kmp_str_format("%s", data1); // read value from SHM
6758 munmap(data1, SHM_SIZE);
6759 close(fd1);
6760#else // Windows and unix with static library
6761 // Set the environment variable, but do not overwrite it if it already exists.
6762 __kmp_env_set(name, __kmp_registration_str, 0);
6763 // read value to see if it got set
6764 value = __kmp_env_get(name);
6765#endif
6766
6767 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6768 done = 1; // Ok, environment variable set successfully, exit the loop.
6769 } else {
6770 // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6771 // Check whether it is alive or dead.
6772 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6773 char *tail = value;
6774 char *flag_addr_str = NULL;
6775 char *flag_val_str = NULL;
6776 char const *file_name = NULL;
6777 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6778 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6779 file_name = tail;
6780 if (tail != NULL) {
6781 unsigned long *flag_addr = 0;
6782 unsigned long flag_val = 0;
6783 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6784 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6785 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6786 // First, check whether environment-encoded address is mapped into
6787 // addr space.
6788 // If so, dereference it to see if it still has the right value.
6789 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6790 neighbor = 1;
6791 } else {
6792 // If not, then we know the other copy of the library is no longer
6793 // running.
6794 neighbor = 2;
6795 }
6796 }
6797 }
6798 switch (neighbor) {
6799 case 0: // Cannot parse environment variable -- neighbor status unknown.
6800 // Assume it is the incompatible format of a future version of the
6801 // library, and assume the other library is alive.
6802 // WARN( ... ); // TODO: Issue a warning.
6803 file_name = "unknown library";
6804 KMP_FALLTHROUGH();
6805 // Attention! Falling through to the next case. That's intentional.
6806 case 1: { // Neighbor is alive.
6807 // Check whether duplicate libraries are allowed.
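 // Setting the env variable KMP_DUPLICATE_LIB_OK to a true value (e.g.
 // KMP_DUPLICATE_LIB_OK=TRUE) allows execution to continue with both copies
 // loaded instead of issuing the fatal DuplicateLibrary error below.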
6808 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6809 if (!__kmp_str_match_true(duplicate_ok)) {
6810 // That's not allowed. Issue fatal error.
6811 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6812 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6813 }
6814 KMP_INTERNAL_FREE(duplicate_ok);
6815 __kmp_duplicate_library_ok = 1;
6816 done = 1; // Exit the loop.
6817 } break;
6818 case 2: { // Neighbor is dead.
6819
6820#if defined(KMP_USE_SHM)
6821 // Remove the dead neighbor's shared memory file.
6822 shm_unlink(shm_name); // this removes file in /dev/shm
6823#else
6824 // Clear the variable and try to register library again.
6825 __kmp_env_unset(name);
6826#endif
6827 } break;
6828 default: {
6829 KMP_DEBUG_ASSERT(0);
6830 } break;
6831 }
6832 }
6833 KMP_INTERNAL_FREE((void *)value);
6834#if defined(KMP_USE_SHM)
6835 KMP_INTERNAL_FREE((void *)shm_name);
6836#endif
6837 } // while
6838 KMP_INTERNAL_FREE((void *)name);
6839
6840} // func __kmp_register_library_startup
6841
6842void __kmp_unregister_library(void) {
6843
6844 char *name = __kmp_reg_status_name();
6845 char *value = NULL;
6846
6847#if defined(KMP_USE_SHM)
6848 char *shm_name = __kmp_str_format("/%s", name);
6849 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6850 if (fd1 == -1) {
6851 // file did not open. return.
6852 return;
6853 }
6854 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6855 if (data1 != MAP_FAILED) {
6856 value = __kmp_str_format("%s", data1); // read value from SHM
6857 munmap(data1, SHM_SIZE);
6858 }
6859 close(fd1);
6860#else
6861 value = __kmp_env_get(name);
6862#endif
6863
6864 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6865 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6866 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6867// Ok, this is our variable. Delete it.
6868#if defined(KMP_USE_SHM)
6869 shm_unlink(shm_name); // this removes file in /dev/shm
6870#else
6871 __kmp_env_unset(name);
6872#endif
6873 }
6874
6875#if defined(KMP_USE_SHM)
6876 KMP_INTERNAL_FREE(shm_name);
6877#endif
6878
6879 KMP_INTERNAL_FREE(__kmp_registration_str);
6880 KMP_INTERNAL_FREE(value);
6881 KMP_INTERNAL_FREE(name);
6882
6883 __kmp_registration_flag = 0;
6884 __kmp_registration_str = NULL;
6885
6886} // __kmp_unregister_library
6887
6888// End of Library registration stuff.
6889// -----------------------------------------------------------------------------
6890
6891#if KMP_MIC_SUPPORTED
6892
6893static void __kmp_check_mic_type() {
6894 kmp_cpuid_t cpuid_state = {0};
6895 kmp_cpuid_t *cs_p = &cpuid_state;
6896 __kmp_x86_cpuid(1, 0, cs_p);
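 // CPUID leaf 1 EAX encodes stepping[3:0], model[7:4], family[11:8] and
 // extended model[19:16]; the masks below match the family/model signatures
 // of KNC (mic2) and KNL (mic3).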
6897 // We don't support mic1 at the moment
6898 if ((cs_p->eax & 0xff0) == 0xB10) {
6899 __kmp_mic_type = mic2;
6900 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6901 __kmp_mic_type = mic3;
6902 } else {
6903 __kmp_mic_type = non_mic;
6904 }
6905}
6906
6907#endif /* KMP_MIC_SUPPORTED */
6908
6909#if KMP_HAVE_UMWAIT
6910static void __kmp_user_level_mwait_init() {
6911 struct kmp_cpuid buf;
6912 __kmp_x86_cpuid(7, 0, &buf);
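 // CPUID.(EAX=07H, ECX=0):ECX bit 5 is the WAITPKG feature flag
 // (umonitor/umwait/tpause).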
6913 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6914 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6915 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6916 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6917 __kmp_umwait_enabled));
6918}
6919#elif KMP_HAVE_MWAIT
6920#ifndef AT_INTELPHIUSERMWAIT
6921// Spurious, non-existent value that should always fail to return anything.
6922// Will be replaced with the correct value when we know that.
6923#define AT_INTELPHIUSERMWAIT 10000
6924#endif
6925// getauxval() function is available in RHEL7 and SLES12. If a system with an
6926// earlier OS is used to build the RTL, we'll use the following internal
6927// function when the entry is not found.
6928unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6929unsigned long getauxval(unsigned long) { return 0; }
6930
6931static void __kmp_user_level_mwait_init() {
6932 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6933 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6934 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6935 // KMP_USER_LEVEL_MWAIT was set to TRUE.
6936 if (__kmp_mic_type == mic3) {
6937 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6938 if ((res & 0x1) || __kmp_user_level_mwait) {
6939 __kmp_mwait_enabled = TRUE;
6940 if (__kmp_user_level_mwait) {
6941 KMP_INFORM(EnvMwaitWarn);
6942 }
6943 } else {
6944 __kmp_mwait_enabled = FALSE;
6945 }
6946 }
6947 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6948 "__kmp_mwait_enabled = %d\n",
6949 __kmp_mic_type, __kmp_mwait_enabled));
6950}
6951#endif /* KMP_HAVE_UMWAIT */
6952
6953static void __kmp_do_serial_initialize(void) {
6954 int i, gtid;
6955 size_t size;
6956
6957 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6958
6959 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6960 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6961 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6962 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6963 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6964
6965#if OMPT_SUPPORT
6966 ompt_pre_init();
6967#endif
6968#if OMPD_SUPPORT
6969 __kmp_env_dump();
6970 ompd_init();
6971#endif
6972
6973 __kmp_validate_locks();
6974
6975 /* Initialize internal memory allocator */
6976 __kmp_init_allocator();
6977
6978 /* Register the library startup via an environment variable or via mapped
6979 shared memory file and check to see whether another copy of the library is
6980 already registered. Since a forked child process is often terminated, we
6981 postpone the registration until middle initialization in the child. */
6982 if (__kmp_need_register_serial)
6983 __kmp_register_library_startup();
6984
6985 /* TODO reinitialization of library */
6986 if (TCR_4(__kmp_global.g.g_done)) {
6987 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6988 }
6989
6990 __kmp_global.g.g_abort = 0;
6991 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6992
6993/* initialize the locks */
6994#if KMP_USE_ADAPTIVE_LOCKS
6995#if KMP_DEBUG_ADAPTIVE_LOCKS
6996 __kmp_init_speculative_stats();
6997#endif
6998#endif
6999#if KMP_STATS_ENABLED
7000 __kmp_stats_init();
7001#endif
7002 __kmp_init_lock(&__kmp_global_lock);
7003 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7004 __kmp_init_lock(&__kmp_debug_lock);
7005 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7006 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7007 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7008 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7009 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7010 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7011 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7012 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7013 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7014 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7015 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7016 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7017 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7018 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7019 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7020#if KMP_USE_MONITOR
7021 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7022#endif
7023 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7024
7025 /* conduct initialization and initial setup of configuration */
7026
7027 __kmp_runtime_initialize();
7028
7029#if KMP_MIC_SUPPORTED
7030 __kmp_check_mic_type();
7031#endif
7032
7033// Some global variable initialization moved here from kmp_env_initialize()
7034#ifdef KMP_DEBUG
7035 kmp_diag = 0;
7036#endif
7037 __kmp_abort_delay = 0;
7038
7039 // From __kmp_init_dflt_team_nth()
7040 /* assume the entire machine will be used */
7041 __kmp_dflt_team_nth_ub = __kmp_xproc;
7042 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7043 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7044 }
7045 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7046 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7047 }
7048 __kmp_max_nth = __kmp_sys_max_nth;
7049 __kmp_cg_max_nth = __kmp_sys_max_nth;
7050 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7051 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7052 __kmp_teams_max_nth = __kmp_sys_max_nth;
7053 }
7054
7055 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7056 // part
7057 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7058#if KMP_USE_MONITOR
7059 __kmp_monitor_wakeups =
7060 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7061 __kmp_bt_intervals =
7062 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7063#endif
7064 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7065 __kmp_library = library_throughput;
7066 // From KMP_SCHEDULE initialization
7067 __kmp_static = kmp_sch_static_balanced;
7068// AC: do not use analytical here, because it is non-monotonic
7069//__kmp_guided = kmp_sch_guided_iterative_chunked;
7070//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7071// need to repeat assignment
7072// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7073// bit control and barrier method control parts
7074#if KMP_FAST_REDUCTION_BARRIER
7075#define kmp_reduction_barrier_gather_bb ((int)1)
7076#define kmp_reduction_barrier_release_bb ((int)1)
7077#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7078#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7079#endif // KMP_FAST_REDUCTION_BARRIER
7080 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7081 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7082 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7083 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7084 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7085#if KMP_FAST_REDUCTION_BARRIER
7086 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7087 // lin_64 ): hyper,1
7088 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7089 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7090 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7091 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7092 }
7093#endif // KMP_FAST_REDUCTION_BARRIER
7094 }
7095#if KMP_FAST_REDUCTION_BARRIER
7096#undef kmp_reduction_barrier_release_pat
7097#undef kmp_reduction_barrier_gather_pat
7098#undef kmp_reduction_barrier_release_bb
7099#undef kmp_reduction_barrier_gather_bb
7100#endif // KMP_FAST_REDUCTION_BARRIER
7101#if KMP_MIC_SUPPORTED
7102 if (__kmp_mic_type == mic2) { // KNC
7103 // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7104 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7105 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7106 1; // forkjoin release
7107 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7108 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7109 }
7110#if KMP_FAST_REDUCTION_BARRIER
7111 if (__kmp_mic_type == mic2) { // KNC
7112 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7113 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7114 }
7115#endif // KMP_FAST_REDUCTION_BARRIER
7116#endif // KMP_MIC_SUPPORTED
7117
7118// From KMP_CHECKS initialization
7119#ifdef KMP_DEBUG
7120 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7121#else
7122 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7123#endif
7124
7125 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7126 __kmp_foreign_tp = TRUE;
7127
7128 __kmp_global.g.g_dynamic = FALSE;
7129 __kmp_global.g.g_dynamic_mode = dynamic_default;
7130
7131 __kmp_init_nesting_mode();
7132
7133 __kmp_env_initialize(NULL);
7134
7135#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7136 __kmp_user_level_mwait_init();
7137#endif
7138// Print all messages in message catalog for testing purposes.
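// (e.g. run a debug build with KMP_DUMP_CATALOG=true to print the catalog)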
7139#ifdef KMP_DEBUG
7140 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7141 if (__kmp_str_match_true(val)) {
7142 kmp_str_buf_t buffer;
7143 __kmp_str_buf_init(&buffer);
7144 __kmp_i18n_dump_catalog(&buffer);
7145 __kmp_printf("%s", buffer.str);
7146 __kmp_str_buf_free(&buffer);
7147 }
7148 __kmp_env_free(&val);
7149#endif
7150
7151 __kmp_threads_capacity =
7152 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7153 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7154 __kmp_tp_capacity = __kmp_default_tp_capacity(
7155 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7156
7157 // If the library is shut down properly, both pools must be NULL. Just in
7158 // case, set them to NULL -- some memory may leak, but subsequent code will
7159 // work even if pools are not freed.
7160 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7161 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7162 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7163 __kmp_thread_pool = NULL;
7164 __kmp_thread_pool_insert_pt = NULL;
7165 __kmp_team_pool = NULL;
7166
7167 /* Allocate all of the variable sized records */
7168 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7169 * expandable */
7170 /* Since allocation is cache-aligned, just add extra padding at the end */
7171 size =
7172 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7173 CACHE_LINE;
7174 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7175 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7176 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7177
7178 /* init thread counts */
7179 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7180 0); // Asserts fail if the library is reinitializing and
7181 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7182 __kmp_all_nth = 0;
7183 __kmp_nth = 0;
7184
7185 /* setup the uber master thread and hierarchy */
7186 gtid = __kmp_register_root(TRUE);
7187 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7188 KMP_ASSERT(KMP_UBER_GTID(gtid));
7189 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7190
7191 KMP_MB(); /* Flush all pending memory write invalidates. */
7192
7193 __kmp_common_initialize();
7194
7195#if KMP_OS_UNIX
7196 /* invoke the child fork handler */
7197 __kmp_register_atfork();
7198#endif
7199
7200#if !KMP_DYNAMIC_LIB
7201 {
7202 /* Invoke the exit handler when the program finishes, only for static
7203 library. For dynamic library, we already have _fini and DllMain. */
7204 int rc = atexit(__kmp_internal_end_atexit);
7205 if (rc != 0) {
7206 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7207 __kmp_msg_null);
7208 }
7209 }
7210#endif
7211
7212#if KMP_HANDLE_SIGNALS
7213#if KMP_OS_UNIX
7214 /* NOTE: make sure that this is called before the user installs their own
7215 signal handlers so that the user handlers are called first. this way they
7216 can return false, not call our handler, avoid terminating the library, and
7217 continue execution where they left off. */
7218 __kmp_install_signals(FALSE);
7219#endif /* KMP_OS_UNIX */
7220#if KMP_OS_WINDOWS
7221 __kmp_install_signals(TRUE);
7222#endif /* KMP_OS_WINDOWS */
7223#endif
7224
7225 /* we have finished the serial initialization */
7226 __kmp_init_counter++;
7227
7228 __kmp_init_serial = TRUE;
7229
7230 if (__kmp_settings) {
7231 __kmp_env_print();
7232 }
7233
7234 if (__kmp_display_env || __kmp_display_env_verbose) {
7235 __kmp_env_print_2();
7236 }
7237
7238#if OMPT_SUPPORT
7239 ompt_post_init();
7240#endif
7241
7242 KMP_MB();
7243
7244 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7245}
7246
7247void __kmp_serial_initialize(void) {
7248 if (__kmp_init_serial) {
7249 return;
7250 }
7251 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7252 if (__kmp_init_serial) {
7253 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7254 return;
7255 }
7256 __kmp_do_serial_initialize();
7257 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7258}
7259
7260static void __kmp_do_middle_initialize(void) {
7261 int i, j;
7262 int prev_dflt_team_nth;
7263
7264 if (!__kmp_init_serial) {
7265 __kmp_do_serial_initialize();
7266 }
7267
7268 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7269
7270 if (UNLIKELY(!__kmp_need_register_serial)) {
7271 // We are in a forked child process. The registration was skipped during
7272 // serial initialization in __kmp_atfork_child handler. Do it here.
7273 __kmp_register_library_startup();
7274 }
7275
7276 // Save the previous value for the __kmp_dflt_team_nth so that
7277 // we can avoid some reinitialization if it hasn't changed.
7278 prev_dflt_team_nth = __kmp_dflt_team_nth;
7279
7280#if KMP_AFFINITY_SUPPORTED
7281 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7282 // number of cores on the machine.
7283 __kmp_affinity_initialize();
7284
7285#endif /* KMP_AFFINITY_SUPPORTED */
7286
7287 KMP_ASSERT(__kmp_xproc > 0);
7288 if (__kmp_avail_proc == 0) {
7289 __kmp_avail_proc = __kmp_xproc;
7290 }
7291
7292 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7293 // correct them now
7294 j = 0;
7295 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7296 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7297 __kmp_avail_proc;
7298 j++;
7299 }
7300
7301 if (__kmp_dflt_team_nth == 0) {
7302#ifdef KMP_DFLT_NTH_CORES
7303 // Default #threads = #cores
7304 __kmp_dflt_team_nth = __kmp_ncores;
7305 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7306 "__kmp_ncores (%d)\n",
7307 __kmp_dflt_team_nth));
7308#else
7309 // Default #threads = #available OS procs
7310 __kmp_dflt_team_nth = __kmp_avail_proc;
7311 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7312 "__kmp_avail_proc(%d)\n",
7313 __kmp_dflt_team_nth));
7314#endif /* KMP_DFLT_NTH_CORES */
7315 }
7316
7317 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7318 __kmp_dflt_team_nth = KMP_MIN_NTH;
7319 }
7320 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7321 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7322 }
7323
7324 if (__kmp_nesting_mode > 0)
7325 __kmp_set_nesting_mode_threads();
7326
7327 // There's no harm in continuing if the following check fails,
7328 // but it indicates an error in the previous logic.
7329 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7330
7331 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7332 // Run through the __kmp_threads array and set the num threads icv for each
7333 // root thread that is currently registered with the RTL (which has not
7334 // already explicitly set its nthreads-var with a call to
7335 // omp_set_num_threads()).
7336 for (i = 0; i < __kmp_threads_capacity; i++) {
7337 kmp_info_t *thread = __kmp_threads[i];
7338 if (thread == NULL)
7339 continue;
7340 if (thread->th.th_current_task->td_icvs.nproc != 0)
7341 continue;
7342
7343 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7344 }
7345 }
7346 KA_TRACE(
7347 20,
7348 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7349 __kmp_dflt_team_nth));
7350
7351#ifdef KMP_ADJUST_BLOCKTIME
7352 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7353 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7354 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7355 if (__kmp_nth > __kmp_avail_proc) {
7356 __kmp_zero_bt = TRUE;
7357 }
7358 }
7359#endif /* KMP_ADJUST_BLOCKTIME */
7360
7361 /* we have finished middle initialization */
7362 TCW_SYNC_4(__kmp_init_middle, TRUE);
7363
7364 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7365}
7366
7367void __kmp_middle_initialize(void) {
7368 if (__kmp_init_middle) {
7369 return;
7370 }
7371 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7372 if (__kmp_init_middle) {
7373 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7374 return;
7375 }
7376 __kmp_do_middle_initialize();
7377 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7378}
7379
7380void __kmp_parallel_initialize(void) {
7381 int gtid = __kmp_entry_gtid(); // this might be a new root
7382
7383 /* synchronize parallel initialization (for sibling) */
7384 if (TCR_4(__kmp_init_parallel))
7385 return;
7386 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7387 if (TCR_4(__kmp_init_parallel)) {
7388 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7389 return;
7390 }
7391
7392 /* TODO reinitialization after we have already shut down */
7393 if (TCR_4(__kmp_global.g.g_done)) {
7394 KA_TRACE(
7395 10,
7396 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7397 __kmp_infinite_loop();
7398 }
7399
7400 /* jc: The lock __kmp_initz_lock is already held, so calling
7401 __kmp_serial_initialize would cause a deadlock. So we call
7402 __kmp_do_serial_initialize directly. */
7403 if (!__kmp_init_middle) {
7404 __kmp_do_middle_initialize();
7405 }
7406 __kmp_assign_root_init_mask();
7407 __kmp_resume_if_hard_paused();
7408
7409 /* begin initialization */
7410 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7411 KMP_ASSERT(KMP_UBER_GTID(gtid));
7412
7413#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7414 // Save the FP control regs.
7415 // Worker threads will set theirs to these values at thread startup.
7416 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7417 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7418 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7419#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7420
7421#if KMP_OS_UNIX
7422#if KMP_HANDLE_SIGNALS
7423 /* must be after __kmp_serial_initialize */
7424 __kmp_install_signals(TRUE);
7425#endif
7426#endif
7427
7428 __kmp_suspend_initialize();
7429
7430#if defined(USE_LOAD_BALANCE)
7431 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7432 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7433 }
7434#else
7435 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7436 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7437 }
7438#endif
7439
7440 if (__kmp_version) {
7441 __kmp_print_version_2();
7442 }
7443
7444 /* we have finished parallel initialization */
7445 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7446
7447 KMP_MB();
7448 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7449
7450 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7451}
7452
7453void __kmp_hidden_helper_initialize() {
7454 if (TCR_4(__kmp_init_hidden_helper))
7455 return;
7456
7457 // __kmp_parallel_initialize is required before we initialize hidden helper
7458 if (!TCR_4(__kmp_init_parallel))
7459 __kmp_parallel_initialize();
7460
7461 // Double check. Note that this double check should not be placed before
7462 // __kmp_parallel_initialize as it will cause a deadlock.
7463 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7464 if (TCR_4(__kmp_init_hidden_helper)) {
7465 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7466 return;
7467 }
7468
7469 // Set the count of hidden helper tasks to be executed to zero
7470 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7471
7472 // Set the global variable indicating that we're initializing hidden helper
7473 // team/threads
7474 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7475
7476 // Platform independent initialization
7477 __kmp_do_initialize_hidden_helper_threads();
7478
7479 // Wait here for the finish of initialization of hidden helper teams
7480 __kmp_hidden_helper_threads_initz_wait();
7481
7482 // We have finished hidden helper initialization
7483 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7484
7485 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7486}
7487
7488/* ------------------------------------------------------------------------ */
7489
7490void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7491 kmp_team_t *team) {
7492 kmp_disp_t *dispatch;
7493
7494 KMP_MB();
7495
7496 /* none of the threads have encountered any constructs, yet. */
7497 this_thr->th.th_local.this_construct = 0;
7498#if KMP_CACHE_MANAGE
7499 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7500#endif /* KMP_CACHE_MANAGE */
7501 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7502 KMP_DEBUG_ASSERT(dispatch);
7503 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7504 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7505 // this_thr->th.th_info.ds.ds_tid ] );
7506
7507 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7508 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7509 if (__kmp_env_consistency_check)
7510 __kmp_push_parallel(gtid, team->t.t_ident);
7511
7512 KMP_MB(); /* Flush all pending memory write invalidates. */
7513}
7514
7515void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7516 kmp_team_t *team) {
7517 if (__kmp_env_consistency_check)
7518 __kmp_pop_parallel(gtid, team->t.t_ident);
7519
7520 __kmp_finish_implicit_task(this_thr);
7521}
7522
7523int __kmp_invoke_task_func(int gtid) {
7524 int rc;
7525 int tid = __kmp_tid_from_gtid(gtid);
7526 kmp_info_t *this_thr = __kmp_threads[gtid];
7527 kmp_team_t *team = this_thr->th.th_team;
7528
7529 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7530#if USE_ITT_BUILD
7531 if (__itt_stack_caller_create_ptr) {
7532 // inform ittnotify about entering user's code
7533 if (team->t.t_stack_id != NULL) {
7534 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7535 } else {
7536 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7537 __kmp_itt_stack_callee_enter(
7538 (__itt_caller)team->t.t_parent->t.t_stack_id);
7539 }
7540 }
7541#endif /* USE_ITT_BUILD */
7542#if INCLUDE_SSC_MARKS
7543 SSC_MARK_INVOKING();
7544#endif
7545
7546#if OMPT_SUPPORT
7547 void *dummy;
7548 void **exit_frame_p;
7549 ompt_data_t *my_task_data;
7550 ompt_data_t *my_parallel_data;
7551 int ompt_team_size;
7552
7553 if (ompt_enabled.enabled) {
7554 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7555 .ompt_task_info.frame.exit_frame.ptr);
7556 } else {
7557 exit_frame_p = &dummy;
7558 }
7559
7560 my_task_data =
7561 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7562 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7563 if (ompt_enabled.ompt_callback_implicit_task) {
7564 ompt_team_size = team->t.t_nproc;
7565 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7566 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7567 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7568 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7569 }
7570#endif
7571
7572#if KMP_STATS_ENABLED
7573 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7574 if (previous_state == stats_state_e::TEAMS_REGION) {
7575 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7576 } else {
7577 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7578 }
7579 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7580#endif
7581
7582 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7583 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7584#if OMPT_SUPPORT
7585 ,
7586 exit_frame_p
7587#endif
7588 );
7589#if OMPT_SUPPORT
7590 *exit_frame_p = NULL;
7591 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7592#endif
7593
7594#if KMP_STATS_ENABLED
7595 if (previous_state == stats_state_e::TEAMS_REGION) {
7596 KMP_SET_THREAD_STATE(previous_state);
7597 }
7598 KMP_POP_PARTITIONED_TIMER();
7599#endif
7600
7601#if USE_ITT_BUILD
7602 if (__itt_stack_caller_create_ptr) {
7603 // inform ittnotify about leaving user's code
7604 if (team->t.t_stack_id != NULL) {
7605 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7606 } else {
7607 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7608 __kmp_itt_stack_callee_leave(
7609 (__itt_caller)team->t.t_parent->t.t_stack_id);
7610 }
7611 }
7612#endif /* USE_ITT_BUILD */
7613 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7614
7615 return rc;
7616}
7617
7618void __kmp_teams_master(int gtid) {
7619 // This routine is called by all primary threads in a teams construct
7620 kmp_info_t *thr = __kmp_threads[gtid];
7621 kmp_team_t *team = thr->th.th_team;
7622 ident_t *loc = team->t.t_ident;
7623 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7624 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7625 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7626 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7627 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7628
7629 // This thread is a new CG root. Set up the proper variables.
7630 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7631 tmp->cg_root = thr; // Make thr the CG root
7632 // Init to thread limit stored when league primary threads were forked
7633 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7634 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7635 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7636 " cg_nthreads to 1\n",
7637 thr, tmp));
7638 tmp->up = thr->th.th_cg_roots;
7639 thr->th.th_cg_roots = tmp;
7640
7641// Launch the league of teams now, but do not let workers execute
7642// (they hang on the fork barrier until the next parallel region)
7643#if INCLUDE_SSC_MARKS
7644 SSC_MARK_FORKING();
7645#endif
7646 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7647 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7648 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7649#if INCLUDE_SSC_MARKS
7650 SSC_MARK_JOINING();
7651#endif
7652 // If the team size was reduced from the limit, set it to the new size
7653 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7654 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7655 // AC: last parameter "1" eliminates join barrier which won't work because
7656 // worker threads are in a fork barrier waiting for more parallel regions
7657 __kmp_join_call(loc, gtid
7658#if OMPT_SUPPORT
7659 ,
7660 fork_context_intel
7661#endif
7662 ,
7663 1);
7664}
7665
7666int __kmp_invoke_teams_master(int gtid) {
7667 kmp_info_t *this_thr = __kmp_threads[gtid];
7668 kmp_team_t *team = this_thr->th.th_team;
7669#if KMP_DEBUG
7670 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7671 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7672 (void *)__kmp_teams_master);
7673#endif
7674 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7675#if OMPT_SUPPORT
7676 int tid = __kmp_tid_from_gtid(gtid);
7677 ompt_data_t *task_data =
7678 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7679 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7680 if (ompt_enabled.ompt_callback_implicit_task) {
7681 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7682 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7683 ompt_task_initial);
7684 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7685 }
7686#endif
7687 __kmp_teams_master(gtid);
7688#if OMPT_SUPPORT
7689 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7690#endif
7691 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7692 return 1;
7693}
7694
7695/* This sets the requested number of threads for the next parallel region
7696 encountered by this team. Since this should be enclosed in the forkjoin
7697 critical section, it should avoid race conditions with asymmetrical nested
7698 parallelism. */
7699
7700void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7701 kmp_info_t *thr = __kmp_threads[gtid];
7702
7703 if (num_threads > 0)
7704 thr->th.th_set_nproc = num_threads;
7705}
7706
7707static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7708 int num_threads) {
7709 KMP_DEBUG_ASSERT(thr);
7710 // Remember the number of threads for inner parallel regions
7711 if (!TCR_4(__kmp_init_middle))
7712 __kmp_middle_initialize(); // get internal globals calculated
7713 __kmp_assign_root_init_mask();
7714 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7715 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7716
7717 if (num_threads == 0) {
7718 if (__kmp_teams_thread_limit > 0) {
7719 num_threads = __kmp_teams_thread_limit;
7720 } else {
7721 num_threads = __kmp_avail_proc / num_teams;
7722 }
7723 // adjust num_threads w/o warning as it is not a user setting
7724 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7725 // no thread_limit clause specified - do not change thread-limit-var ICV
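 // Illustrative example (made-up numbers): with 64 available procs, 4 teams
 // and no teams thread limit, num_threads starts at 64 / 4 = 16 and is then
 // clipped by nthreads-var, thread-limit-var, and __kmp_teams_max_nth below.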
7726 if (num_threads > __kmp_dflt_team_nth) {
7727 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7728 }
7729 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7730 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7731 } // prevent the team size from exceeding thread-limit-var
7732 if (num_teams * num_threads > __kmp_teams_max_nth) {
7733 num_threads = __kmp_teams_max_nth / num_teams;
7734 }
7735 if (num_threads == 0) {
7736 num_threads = 1;
7737 }
7738 } else {
7739 if (num_threads < 0) {
7740 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7741 __kmp_msg_null);
7742 num_threads = 1;
7743 }
7744 // This thread will be the primary thread of the league primary threads
7745 // Store new thread limit; old limit is saved in th_cg_roots list
7746 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7747 // num_threads = min(num_threads, nthreads-var)
7748 if (num_threads > __kmp_dflt_team_nth) {
7749 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7750 }
7751 if (num_teams * num_threads > __kmp_teams_max_nth) {
7752 int new_threads = __kmp_teams_max_nth / num_teams;
7753 if (new_threads == 0) {
7754 new_threads = 1;
7755 }
7756 if (new_threads != num_threads) {
7757 if (!__kmp_reserve_warn) { // user asked for too many threads
7758 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7759 __kmp_msg(kmp_ms_warning,
7760 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7761 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7762 }
7763 }
7764 num_threads = new_threads;
7765 }
7766 }
7767 thr->th.th_teams_size.nth = num_threads;
7768}
7769
7770/* this sets the requested number of teams for the teams region and/or
7771 the number of threads for the next parallel region encountered */
7772void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7773 int num_threads) {
7774 kmp_info_t *thr = __kmp_threads[gtid];
7775 if (num_teams < 0) {
7776 // OpenMP specification requires requested values to be positive,
7777 // but people can send us any value, so we'd better check
7778 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7779 __kmp_msg_null);
7780 num_teams = 1;
7781 }
7782 if (num_teams == 0) {
7783 if (__kmp_nteams > 0) {
7784 num_teams = __kmp_nteams;
7785 } else {
7786 num_teams = 1; // default number of teams is 1.
7787 }
7788 }
7789 if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7790 if (!__kmp_reserve_warn) {
7791 __kmp_reserve_warn = 1;
7792 __kmp_msg(kmp_ms_warning,
7793 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7794 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7795 }
7796 num_teams = __kmp_teams_max_nth;
7797 }
7798 // Set number of teams (number of threads in the outer "parallel" of the
7799 // teams)
7800 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7801
7802 __kmp_push_thread_limit(thr, num_teams, num_threads);
7803}
7804
7805/* This sets the requested number of teams for the teams region and/or
7806 the number of threads for the next parallel region encountered */
7807void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7808 int num_teams_ub, int num_threads) {
7809 kmp_info_t *thr = __kmp_threads[gtid];
7810 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7811 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7812 KMP_DEBUG_ASSERT(num_threads >= 0);
7813
7814 if (num_teams_lb > num_teams_ub) {
7815 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7816 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7817 }
7818
7819  int num_teams = 1; // default number of teams is 1.
7820
7821 if (num_teams_lb == 0 && num_teams_ub > 0)
7822 num_teams_lb = num_teams_ub;
7823
7824 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7825 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7826 if (num_teams > __kmp_teams_max_nth) {
7827 if (!__kmp_reserve_warn) {
7828 __kmp_reserve_warn = 1;
7829 __kmp_msg(kmp_ms_warning,
7830 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7831 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7832 }
7833 num_teams = __kmp_teams_max_nth;
7834 }
7835 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7836 num_teams = num_teams_ub;
7837 } else { // num_teams_lb <= num_teams <= num_teams_ub
7838 if (num_threads <= 0) {
7839 if (num_teams_ub > __kmp_teams_max_nth) {
7840 num_teams = num_teams_lb;
7841 } else {
7842 num_teams = num_teams_ub;
7843 }
7844 } else {
7845 num_teams = (num_threads > __kmp_teams_max_nth)
7846 ? num_teams
7847 : __kmp_teams_max_nth / num_threads;
7848 if (num_teams < num_teams_lb) {
7849 num_teams = num_teams_lb;
7850 } else if (num_teams > num_teams_ub) {
7851 num_teams = num_teams_ub;
7852 }
7853 }
7854 }
7855 // Set number of teams (number of threads in the outer "parallel" of the
7856 // teams)
7857 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7858
7859 __kmp_push_thread_limit(thr, num_teams, num_threads);
7860}
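/* Illustrative example (hypothetical values): for num_teams(2:10) with
   num_threads(4) and __kmp_teams_max_nth = 16, the candidate above is
   16 / 4 = 4 teams, which already lies in [2, 10], so num_teams = 4. With no
   num_threads clause the upper bound (10) is taken unless it exceeds
   __kmp_teams_max_nth, in which case the lower bound is used. */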
7861
7862// Set the proc_bind var to use in the following parallel region.
7863void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7864 kmp_info_t *thr = __kmp_threads[gtid];
7865 thr->th.th_set_proc_bind = proc_bind;
7866}
7867
7868/* Launch the worker threads into the microtask. */
7869
7870void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7871 kmp_info_t *this_thr = __kmp_threads[gtid];
7872
7873#ifdef KMP_DEBUG
7874 int f;
7875#endif /* KMP_DEBUG */
7876
7877 KMP_DEBUG_ASSERT(team);
7878 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7879 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7880 KMP_MB(); /* Flush all pending memory write invalidates. */
7881
7882 team->t.t_construct = 0; /* no single directives seen yet */
7883 team->t.t_ordered.dt.t_value =
7884 0; /* thread 0 enters the ordered section first */
7885
7886 /* Reset the identifiers on the dispatch buffer */
7887 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7888 if (team->t.t_max_nproc > 1) {
7889 int i;
7890 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7891 team->t.t_disp_buffer[i].buffer_index = i;
7892 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7893 }
7894 } else {
7895 team->t.t_disp_buffer[0].buffer_index = 0;
7896 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7897 }
7898
7899 KMP_MB(); /* Flush all pending memory write invalidates. */
7900 KMP_ASSERT(this_thr->th.th_team == team);
7901
7902#ifdef KMP_DEBUG
7903 for (f = 0; f < team->t.t_nproc; f++) {
7904 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7905 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7906 }
7907#endif /* KMP_DEBUG */
7908
7909 /* release the worker threads so they may begin working */
7910 __kmp_fork_barrier(gtid, 0);
7911}
7912
7913void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7914 kmp_info_t *this_thr = __kmp_threads[gtid];
7915
7916 KMP_DEBUG_ASSERT(team);
7917 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7918 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7919 KMP_MB(); /* Flush all pending memory write invalidates. */
7920
7921 /* Join barrier after fork */
7922
7923#ifdef KMP_DEBUG
7924 if (__kmp_threads[gtid] &&
7925 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7926 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7927 __kmp_threads[gtid]);
7928 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7929 "team->t.t_nproc=%d\n",
7930 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7931 team->t.t_nproc);
7932 __kmp_print_structure();
7933 }
7934 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7935 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7936#endif /* KMP_DEBUG */
7937
7938 __kmp_join_barrier(gtid); /* wait for everyone */
7939#if OMPT_SUPPORT
7940 if (ompt_enabled.enabled &&
7941 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7942 int ds_tid = this_thr->th.th_info.ds.ds_tid;
7943 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7944 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7945#if OMPT_OPTIONAL
7946 void *codeptr = NULL;
7947 if (KMP_MASTER_TID(ds_tid) &&
7948 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7949 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7950 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7951
7952 if (ompt_enabled.ompt_callback_sync_region_wait) {
7953 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7954 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7955 codeptr);
7956 }
7957 if (ompt_enabled.ompt_callback_sync_region) {
7958 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7959 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7960 codeptr);
7961 }
7962#endif
7963 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7964 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7965 ompt_scope_end, NULL, task_data, 0, ds_tid,
7966 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7967 }
7968 }
7969#endif
7970
7971 KMP_MB(); /* Flush all pending memory write invalidates. */
7972 KMP_ASSERT(this_thr->th.th_team == team);
7973}
7974
7975/* ------------------------------------------------------------------------ */
7976
7977#ifdef USE_LOAD_BALANCE
7978
7979// Return the number of worker threads actively spinning in the hot team if
7980// we are at the outermost level of parallelism. Otherwise, return 0.
7981static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7982 int i;
7983 int retval;
7984 kmp_team_t *hot_team;
7985
7986 if (root->r.r_active) {
7987 return 0;
7988 }
7989 hot_team = root->r.r_hot_team;
7990 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7991 return hot_team->t.t_nproc - 1; // Don't count primary thread
7992 }
7993
7994 // Skip the primary thread - it is accounted for elsewhere.
7995 retval = 0;
7996 for (i = 1; i < hot_team->t.t_nproc; i++) {
7997 if (hot_team->t.t_threads[i]->th.th_active) {
7998 retval++;
7999 }
8000 }
8001 return retval;
8002}
8003
8004// Perform an automatic adjustment to the number of
8005// threads used by the next parallel region.
8006static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8007 int retval;
8008 int pool_active;
8009 int hot_team_active;
8010 int team_curr_active;
8011 int system_active;
8012
8013 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8014 set_nproc));
8015 KMP_DEBUG_ASSERT(root);
8016 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8017 ->th.th_current_task->td_icvs.dynamic == TRUE);
8018 KMP_DEBUG_ASSERT(set_nproc > 1);
8019
8020 if (set_nproc == 1) {
8021 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8022 return 1;
8023 }
8024
8025 // Threads that are active in the thread pool, active in the hot team for this
8026 // particular root (if we are at the outer par level), and the currently
8027 // executing thread (to become the primary thread) are available to add to the
8028 // new team, but are currently contributing to the system load, and must be
8029 // accounted for.
8030 pool_active = __kmp_thread_pool_active_nth;
8031 hot_team_active = __kmp_active_hot_team_nproc(root);
8032 team_curr_active = pool_active + hot_team_active + 1;
8033
8034 // Check the system load.
8035 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8036 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8037 "hot team active = %d\n",
8038 system_active, pool_active, hot_team_active));
8039
8040 if (system_active < 0) {
8041 // There was an error reading the necessary info from /proc, so use the
8042 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8043 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8044 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8045 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8046
8047 // Make this call behave like the thread limit algorithm.
8048 retval = __kmp_avail_proc - __kmp_nth +
8049 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8050 if (retval > set_nproc) {
8051 retval = set_nproc;
8052 }
8053 if (retval < KMP_MIN_NTH) {
8054 retval = KMP_MIN_NTH;
8055 }
8056
8057 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8058 retval));
8059 return retval;
8060 }
8061
8062 // There is a slight delay in the load balance algorithm in detecting new
8063 // running procs. The real system load at this instant should be at least as
8064  // large as the number of active OMP threads available to add to the team.
8065 if (system_active < team_curr_active) {
8066 system_active = team_curr_active;
8067 }
8068 retval = __kmp_avail_proc - system_active + team_curr_active;
8069 if (retval > set_nproc) {
8070 retval = set_nproc;
8071 }
8072 if (retval < KMP_MIN_NTH) {
8073 retval = KMP_MIN_NTH;
8074 }
8075
8076 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8077 return retval;
8078} // __kmp_load_balance_nproc()
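/* Worked example (hypothetical numbers): with __kmp_avail_proc = 8,
   pool_active = 1 and hot_team_active = 2, team_curr_active = 1 + 2 + 1 = 4.
   If __kmp_get_load_balance() reports system_active = 6, then
   retval = 8 - 6 + 4 = 6; a request of set_nproc = 4 clips this to 4, and
   anything below KMP_MIN_NTH is raised to KMP_MIN_NTH. */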
8079
8080#endif /* USE_LOAD_BALANCE */
8081
8082/* ------------------------------------------------------------------------ */
8083
8084/* NOTE: this is called with the __kmp_init_lock held */
8085void __kmp_cleanup(void) {
8086 int f;
8087
8088 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8089
8090 if (TCR_4(__kmp_init_parallel)) {
8091#if KMP_HANDLE_SIGNALS
8092 __kmp_remove_signals();
8093#endif
8094 TCW_4(__kmp_init_parallel, FALSE);
8095 }
8096
8097 if (TCR_4(__kmp_init_middle)) {
8098#if KMP_AFFINITY_SUPPORTED
8099 __kmp_affinity_uninitialize();
8100#endif /* KMP_AFFINITY_SUPPORTED */
8101 __kmp_cleanup_hierarchy();
8102 TCW_4(__kmp_init_middle, FALSE);
8103 }
8104
8105 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8106
8107 if (__kmp_init_serial) {
8108 __kmp_runtime_destroy();
8109 __kmp_init_serial = FALSE;
8110 }
8111
8112 __kmp_cleanup_threadprivate_caches();
8113
8114 for (f = 0; f < __kmp_threads_capacity; f++) {
8115 if (__kmp_root[f] != NULL) {
8116 __kmp_free(__kmp_root[f]);
8117 __kmp_root[f] = NULL;
8118 }
8119 }
8120 __kmp_free(__kmp_threads);
8121  // __kmp_threads and __kmp_root were allocated as a single block, so there
8122  // is no need to free __kmp_root separately.
8123 __kmp_threads = NULL;
8124 __kmp_root = NULL;
8125 __kmp_threads_capacity = 0;
8126
8127 // Free old __kmp_threads arrays if they exist.
8128 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8129 while (ptr) {
8130 kmp_old_threads_list_t *next = ptr->next;
8131 __kmp_free(ptr->threads);
8132 __kmp_free(ptr);
8133 ptr = next;
8134 }
8135
8136#if KMP_USE_DYNAMIC_LOCK
8137 __kmp_cleanup_indirect_user_locks();
8138#else
8139 __kmp_cleanup_user_locks();
8140#endif
8141#if OMPD_SUPPORT
8142 if (ompd_state) {
8143 __kmp_free(ompd_env_block);
8144 ompd_env_block = NULL;
8145 ompd_env_block_size = 0;
8146 }
8147#endif
8148
8149#if KMP_AFFINITY_SUPPORTED
8150 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8151 __kmp_cpuinfo_file = NULL;
8152#endif /* KMP_AFFINITY_SUPPORTED */
8153
8154#if KMP_USE_ADAPTIVE_LOCKS
8155#if KMP_DEBUG_ADAPTIVE_LOCKS
8156 __kmp_print_speculative_stats();
8157#endif
8158#endif
8159 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8160 __kmp_nested_nth.nth = NULL;
8161 __kmp_nested_nth.size = 0;
8162 __kmp_nested_nth.used = 0;
8163 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8164 __kmp_nested_proc_bind.bind_types = NULL;
8165 __kmp_nested_proc_bind.size = 0;
8166 __kmp_nested_proc_bind.used = 0;
8167 if (__kmp_affinity_format) {
8168 KMP_INTERNAL_FREE(__kmp_affinity_format);
8169 __kmp_affinity_format = NULL;
8170 }
8171
8172 __kmp_i18n_catclose();
8173
8174#if KMP_USE_HIER_SCHED
8175 __kmp_hier_scheds.deallocate();
8176#endif
8177
8178#if KMP_STATS_ENABLED
8179 __kmp_stats_fini();
8180#endif
8181
8182 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8183}
8184
8185/* ------------------------------------------------------------------------ */
8186
8187int __kmp_ignore_mppbeg(void) {
8188 char *env;
8189
8190 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8191 if (__kmp_str_match_false(env))
8192 return FALSE;
8193 }
8194  // By default __kmpc_begin() is a no-op.
8195 return TRUE;
8196}
8197
8198int __kmp_ignore_mppend(void) {
8199 char *env;
8200
8201 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8202 if (__kmp_str_match_false(env))
8203 return FALSE;
8204 }
8205  // By default __kmpc_end() is a no-op.
8206 return TRUE;
8207}
8208
8209void __kmp_internal_begin(void) {
8210 int gtid;
8211 kmp_root_t *root;
8212
8213 /* this is a very important step as it will register new sibling threads
8214 and assign these new uber threads a new gtid */
8215 gtid = __kmp_entry_gtid();
8216 root = __kmp_threads[gtid]->th.th_root;
8217 KMP_ASSERT(KMP_UBER_GTID(gtid));
8218
8219 if (root->r.r_begin)
8220 return;
8221 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8222 if (root->r.r_begin) {
8223 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8224 return;
8225 }
8226
8227 root->r.r_begin = TRUE;
8228
8229 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8230}
8231
8232/* ------------------------------------------------------------------------ */
8233
8234void __kmp_user_set_library(enum library_type arg) {
8235 int gtid;
8236 kmp_root_t *root;
8237 kmp_info_t *thread;
8238
8239 /* first, make sure we are initialized so we can get our gtid */
8240
8241 gtid = __kmp_entry_gtid();
8242 thread = __kmp_threads[gtid];
8243
8244 root = thread->th.th_root;
8245
8246 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8247 library_serial));
8248 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8249 thread */
8250 KMP_WARNING(SetLibraryIncorrectCall);
8251 return;
8252 }
8253
8254 switch (arg) {
8255 case library_serial:
8256 thread->th.th_set_nproc = 0;
8257 set__nproc(thread, 1);
8258 break;
8259 case library_turnaround:
8260 thread->th.th_set_nproc = 0;
8261 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8262 : __kmp_dflt_team_nth_ub);
8263 break;
8264 case library_throughput:
8265 thread->th.th_set_nproc = 0;
8266 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8267 : __kmp_dflt_team_nth_ub);
8268 break;
8269 default:
8270 KMP_FATAL(UnknownLibraryType, arg);
8271 }
8272
8273 __kmp_aux_set_library(arg);
8274}
8275
8276void __kmp_aux_set_stacksize(size_t arg) {
8277 if (!__kmp_init_serial)
8278 __kmp_serial_initialize();
8279
8280#if KMP_OS_DARWIN
8281 if (arg & (0x1000 - 1)) {
8282 arg &= ~(0x1000 - 1);
8283 if (arg + 0x1000) /* check for overflow if we round up */
8284 arg += 0x1000;
8285 }
8286#endif
8287 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8288
8289 /* only change the default stacksize before the first parallel region */
8290 if (!TCR_4(__kmp_init_parallel)) {
8291 size_t value = arg; /* argument is in bytes */
8292
8293 if (value < __kmp_sys_min_stksize)
8294 value = __kmp_sys_min_stksize;
8295 else if (value > KMP_MAX_STKSIZE)
8296 value = KMP_MAX_STKSIZE;
8297
8298 __kmp_stksize = value;
8299
8300 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8301 }
8302
8303 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8304}
8305
8306/* set the behaviour of the runtime library */
8307/* TODO this can cause some odd behaviour with sibling parallelism... */
8308void __kmp_aux_set_library(enum library_type arg) {
8309 __kmp_library = arg;
8310
8311 switch (__kmp_library) {
8312 case library_serial: {
8313 KMP_INFORM(LibraryIsSerial);
8314 } break;
8315 case library_turnaround:
8316 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8317 __kmp_use_yield = 2; // only yield when oversubscribed
8318 break;
8319 case library_throughput:
8320 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8321 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8322 break;
8323 default:
8324 KMP_FATAL(UnknownLibraryType, arg);
8325 }
8326}
8327
8328/* Getting team information common for all team API */
8329// Returns NULL if not in teams construct
8330static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8331 kmp_info_t *thr = __kmp_entry_thread();
8332 teams_serialized = 0;
8333 if (thr->th.th_teams_microtask) {
8334 kmp_team_t *team = thr->th.th_team;
8335 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8336 int ii = team->t.t_level;
8337 teams_serialized = team->t.t_serialized;
8338 int level = tlevel + 1;
8339 KMP_DEBUG_ASSERT(ii >= tlevel);
8340 while (ii > level) {
8341 for (teams_serialized = team->t.t_serialized;
8342 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8343 }
8344 if (team->t.t_serialized && (!teams_serialized)) {
8345 team = team->t.t_parent;
8346 continue;
8347 }
8348 if (ii > level) {
8349 team = team->t.t_parent;
8350 ii--;
8351 }
8352 }
8353 return team;
8354 }
8355 return NULL;
8356}
8357
8358int __kmp_aux_get_team_num() {
8359 int serialized;
8360 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8361 if (team) {
8362 if (serialized > 1) {
8363 return 0; // teams region is serialized ( 1 team of 1 thread ).
8364 } else {
8365 return team->t.t_master_tid;
8366 }
8367 }
8368 return 0;
8369}
8370
8371int __kmp_aux_get_num_teams() {
8372 int serialized;
8373 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8374 if (team) {
8375 if (serialized > 1) {
8376 return 1;
8377 } else {
8378 return team->t.t_parent->t.t_nproc;
8379 }
8380 }
8381 return 1;
8382}
8383
8384/* ------------------------------------------------------------------------ */
8385
8386/*
8387 * Affinity Format Parser
8388 *
8389 * Field is in form of: %[[[0].]size]type
8390 * % and type are required (%% means print a literal '%')
8391 * type is either single char or long name surrounded by {},
8392 * e.g., N or {num_threads}
8393 * 0 => leading zeros
8394 * . => right justified when size is specified
8395 * by default output is left justified
8396 * size is the *minimum* field length
8397 * All other characters are printed as is
8398 *
8399 * Available field types:
8400 * L {thread_level} - omp_get_level()
8401 * n {thread_num} - omp_get_thread_num()
8402 * h {host} - name of host machine
8403 * P {process_id} - process id (integer)
8404 * T {thread_identifier} - native thread identifier (integer)
8405 * N {num_threads} - omp_get_num_threads()
8406 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8407 * a {thread_affinity} - comma separated list of integers or integer ranges
8408 * (values of affinity mask)
8409 *
8410 * Implementation-specific field types can be added
8411 * If a type is unknown, print "undefined"
8412 */
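/* Illustrative example (hypothetical output values), using the short names
   from the table below: the format string
     "OMP: host=%H pid=%P thread %0.4n of %N bound to %A"
   might expand for one thread of an 8-thread team to
     OMP: host=node01 pid=12345 thread 0002 of 8 bound to 0-3
   where %0.4n zero-pads and right-justifies the thread number to a minimum
   width of 4 and %A prints the thread's affinity mask. */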
8413
8414// Structure holding the short name, long name, and corresponding data type
8415// for snprintf. A table of these will represent the entire valid keyword
8416// field types.
8417typedef struct kmp_affinity_format_field_t {
8418 char short_name; // from spec e.g., L -> thread level
8419 const char *long_name; // from spec thread_level -> thread level
8420 char field_format; // data type for snprintf (typically 'd' or 's'
8421 // for integer or string)
8422} kmp_affinity_format_field_t;
8423
8424static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8425#if KMP_AFFINITY_SUPPORTED
8426 {'A', "thread_affinity", 's'},
8427#endif
8428 {'t', "team_num", 'd'},
8429 {'T', "num_teams", 'd'},
8430 {'L', "nesting_level", 'd'},
8431 {'n', "thread_num", 'd'},
8432 {'N', "num_threads", 'd'},
8433 {'a', "ancestor_tnum", 'd'},
8434 {'H', "host", 's'},
8435 {'P', "process_id", 'd'},
8436 {'i', "native_thread_id", 'd'}};
8437
8438// Return the number of characters it takes to hold the field
8439static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8440 const char **ptr,
8441 kmp_str_buf_t *field_buffer) {
8442 int rc, format_index, field_value;
8443 const char *width_left, *width_right;
8444 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8445 static const int FORMAT_SIZE = 20;
8446 char format[FORMAT_SIZE] = {0};
8447 char absolute_short_name = 0;
8448
8449 KMP_DEBUG_ASSERT(gtid >= 0);
8450 KMP_DEBUG_ASSERT(th);
8451 KMP_DEBUG_ASSERT(**ptr == '%');
8452 KMP_DEBUG_ASSERT(field_buffer);
8453
8454 __kmp_str_buf_clear(field_buffer);
8455
8456 // Skip the initial %
8457 (*ptr)++;
8458
8459 // Check for %% first
8460 if (**ptr == '%') {
8461 __kmp_str_buf_cat(field_buffer, "%", 1);
8462 (*ptr)++; // skip over the second %
8463 return 1;
8464 }
8465
8466 // Parse field modifiers if they are present
8467 pad_zeros = false;
8468 if (**ptr == '0') {
8469 pad_zeros = true;
8470 (*ptr)++; // skip over 0
8471 }
8472 right_justify = false;
8473 if (**ptr == '.') {
8474 right_justify = true;
8475 (*ptr)++; // skip over .
8476 }
8477 // Parse width of field: [width_left, width_right)
8478 width_left = width_right = NULL;
8479 if (**ptr >= '0' && **ptr <= '9') {
8480 width_left = *ptr;
8481 SKIP_DIGITS(*ptr);
8482 width_right = *ptr;
8483 }
8484
8485 // Create the format for KMP_SNPRINTF based on flags parsed above
8486 format_index = 0;
8487 format[format_index++] = '%';
8488 if (!right_justify)
8489 format[format_index++] = '-';
8490 if (pad_zeros)
8491 format[format_index++] = '0';
8492 if (width_left && width_right) {
8493 int i = 0;
8494 // Only allow 8 digit number widths.
8495 // This also prevents overflowing format variable
8496 while (i < 8 && width_left < width_right) {
8497 format[format_index++] = *width_left;
8498 width_left++;
8499 i++;
8500 }
8501 }
8502
8503 // Parse a name (long or short)
8504 // Canonicalize the name into absolute_short_name
8505 found_valid_name = false;
8506 parse_long_name = (**ptr == '{');
8507 if (parse_long_name)
8508 (*ptr)++; // skip initial left brace
8509 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8510 sizeof(__kmp_affinity_format_table[0]);
8511 ++i) {
8512 char short_name = __kmp_affinity_format_table[i].short_name;
8513 const char *long_name = __kmp_affinity_format_table[i].long_name;
8514 char field_format = __kmp_affinity_format_table[i].field_format;
8515 if (parse_long_name) {
8516 size_t length = KMP_STRLEN(long_name);
8517 if (strncmp(*ptr, long_name, length) == 0) {
8518 found_valid_name = true;
8519 (*ptr) += length; // skip the long name
8520 }
8521 } else if (**ptr == short_name) {
8522 found_valid_name = true;
8523 (*ptr)++; // skip the short name
8524 }
8525 if (found_valid_name) {
8526 format[format_index++] = field_format;
8527 format[format_index++] = '\0';
8528 absolute_short_name = short_name;
8529 break;
8530 }
8531 }
8532 if (parse_long_name) {
8533 if (**ptr != '}') {
8534 absolute_short_name = 0;
8535 } else {
8536 (*ptr)++; // skip over the right brace
8537 }
8538 }
8539
8540 // Attempt to fill the buffer with the requested
8541 // value using snprintf within __kmp_str_buf_print()
8542 switch (absolute_short_name) {
8543 case 't':
8544 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8545 break;
8546 case 'T':
8547 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8548 break;
8549 case 'L':
8550 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8551 break;
8552 case 'n':
8553 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8554 break;
8555 case 'H': {
8556 static const int BUFFER_SIZE = 256;
8557 char buf[BUFFER_SIZE];
8558 __kmp_expand_host_name(buf, BUFFER_SIZE);
8559 rc = __kmp_str_buf_print(field_buffer, format, buf);
8560 } break;
8561 case 'P':
8562 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8563 break;
8564 case 'i':
8565 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8566 break;
8567 case 'N':
8568 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8569 break;
8570 case 'a':
8571 field_value =
8572 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8573 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8574 break;
8575#if KMP_AFFINITY_SUPPORTED
8576 case 'A': {
8577 kmp_str_buf_t buf;
8578 __kmp_str_buf_init(&buf);
8579 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8580 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8581 __kmp_str_buf_free(&buf);
8582 } break;
8583#endif
8584 default:
8585    // According to the spec, if an implementation does not have info for a
8586    // field type, then "undefined" is printed
8587 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8588 // Skip the field
8589 if (parse_long_name) {
8590 SKIP_TOKEN(*ptr);
8591 if (**ptr == '}')
8592 (*ptr)++;
8593 } else {
8594 (*ptr)++;
8595 }
8596 }
8597
8598 KMP_ASSERT(format_index <= FORMAT_SIZE);
8599 return rc;
8600}
8601
8602/*
8603 * Return number of characters needed to hold the affinity string
8604 * (not including null byte character)
8605 * The resultant string is printed to buffer, which the caller can then
8606 * handle afterwards
8607 */
8608size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8609 kmp_str_buf_t *buffer) {
8610 const char *parse_ptr;
8611 size_t retval;
8612 const kmp_info_t *th;
8613 kmp_str_buf_t field;
8614
8615 KMP_DEBUG_ASSERT(buffer);
8616 KMP_DEBUG_ASSERT(gtid >= 0);
8617
8618 __kmp_str_buf_init(&field);
8619 __kmp_str_buf_clear(buffer);
8620
8621 th = __kmp_threads[gtid];
8622 retval = 0;
8623
8624 // If format is NULL or zero-length string, then we use
8625 // affinity-format-var ICV
8626 parse_ptr = format;
8627 if (parse_ptr == NULL || *parse_ptr == '\0') {
8628 parse_ptr = __kmp_affinity_format;
8629 }
8630 KMP_DEBUG_ASSERT(parse_ptr);
8631
8632 while (*parse_ptr != '\0') {
8633 // Parse a field
8634 if (*parse_ptr == '%') {
8635 // Put field in the buffer
8636 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8637 __kmp_str_buf_catbuf(buffer, &field);
8638 retval += rc;
8639 } else {
8640 // Put literal character in buffer
8641 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8642 retval++;
8643 parse_ptr++;
8644 }
8645 }
8646 __kmp_str_buf_free(&field);
8647 return retval;
8648}
8649
8650// Displays the affinity string to stdout
8651void __kmp_aux_display_affinity(int gtid, const char *format) {
8652 kmp_str_buf_t buf;
8653 __kmp_str_buf_init(&buf);
8654 __kmp_aux_capture_affinity(gtid, format, &buf);
8655 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8656 __kmp_str_buf_free(&buf);
8657}
8658
8659/* ------------------------------------------------------------------------ */
8660
8661void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8662 int blocktime = arg; /* argument is in milliseconds */
8663#if KMP_USE_MONITOR
8664 int bt_intervals;
8665#endif
8666 kmp_int8 bt_set;
8667
8668 __kmp_save_internal_controls(thread);
8669
8670 /* Normalize and set blocktime for the teams */
8671 if (blocktime < KMP_MIN_BLOCKTIME)
8672 blocktime = KMP_MIN_BLOCKTIME;
8673 else if (blocktime > KMP_MAX_BLOCKTIME)
8674 blocktime = KMP_MAX_BLOCKTIME;
8675
8676 set__blocktime_team(thread->th.th_team, tid, blocktime);
8677 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8678
8679#if KMP_USE_MONITOR
8680 /* Calculate and set blocktime intervals for the teams */
8681 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8682
8683 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8684 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8685#endif
8686
8687  /* Record that the blocktime has been explicitly set */
8688 bt_set = TRUE;
8689
8690 set__bt_set_team(thread->th.th_team, tid, bt_set);
8691 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8692#if KMP_USE_MONITOR
8693 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8694 "bt_intervals=%d, monitor_updates=%d\n",
8695 __kmp_gtid_from_tid(tid, thread->th.th_team),
8696 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8697 __kmp_monitor_wakeups));
8698#else
8699 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8700 __kmp_gtid_from_tid(tid, thread->th.th_team),
8701 thread->th.th_team->t.t_id, tid, blocktime));
8702#endif
8703}
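/* Hedged usage sketch (user code, not part of this file): this routine is
   normally reached when the application changes the block time, e.g. via the
   KMP_BLOCKTIME environment variable or the kmp_set_blocktime() extension:
     kmp_set_blocktime(0);    // let workers sleep right after a region ends
     kmp_set_blocktime(200);  // spin for roughly 200 ms before sleeping
   The argument is in milliseconds and is clamped above to
   [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]. */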
8704
8705void __kmp_aux_set_defaults(char const *str, size_t len) {
8706 if (!__kmp_init_serial) {
8707 __kmp_serial_initialize();
8708 }
8709 __kmp_env_initialize(str);
8710
8711 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8712 __kmp_env_print();
8713 }
8714} // __kmp_aux_set_defaults
8715
8716/* ------------------------------------------------------------------------ */
8717/* internal fast reduction routines */
8718
8719PACKED_REDUCTION_METHOD_T
8720__kmp_determine_reduction_method(
8721 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8722 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8723 kmp_critical_name *lck) {
8724
8725 // Default reduction method: critical construct ( lck != NULL, like in current
8726 // PAROPT )
8727 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8728 // can be selected by RTL
8729 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8730 // can be selected by RTL
8731  // Finally, it's up to the OpenMP RTL to decide which method to select among
8732  // those generated by PAROPT.
8733
8734 PACKED_REDUCTION_METHOD_T retval;
8735
8736 int team_size;
8737
8738 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8739 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8740
8741#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8742 (loc && \
8743 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8744#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8745
8746 retval = critical_reduce_block;
8747
8748  // another way to get the team size (with 1 dynamic dereference) is slower
8749 team_size = __kmp_get_team_num_threads(global_tid);
8750 if (team_size == 1) {
8751
8752 retval = empty_reduce_block;
8753
8754 } else {
8755
8756 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8757
8758#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8759 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8760
8761#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8762 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8763
8764 int teamsize_cutoff = 4;
8765
8766#if KMP_MIC_SUPPORTED
8767 if (__kmp_mic_type != non_mic) {
8768 teamsize_cutoff = 8;
8769 }
8770#endif
8771 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8772 if (tree_available) {
8773 if (team_size <= teamsize_cutoff) {
8774 if (atomic_available) {
8775 retval = atomic_reduce_block;
8776 }
8777 } else {
8778 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8779 }
8780 } else if (atomic_available) {
8781 retval = atomic_reduce_block;
8782 }
8783#else
8784#error "Unknown or unsupported OS"
8785#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8786 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8787
8788#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8789
8790#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8791
8792 // basic tuning
8793
8794 if (atomic_available) {
8795 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8796 retval = atomic_reduce_block;
8797 }
8798 } // otherwise: use critical section
8799
8800#elif KMP_OS_DARWIN
8801
8802 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8803 if (atomic_available && (num_vars <= 3)) {
8804 retval = atomic_reduce_block;
8805 } else if (tree_available) {
8806 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8807 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8808 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8809 }
8810 } // otherwise: use critical section
8811
8812#else
8813#error "Unknown or unsupported OS"
8814#endif
8815
8816#else
8817#error "Unknown or unsupported architecture"
8818#endif
8819 }
8820
8821 // KMP_FORCE_REDUCTION
8822
8823 // If the team is serialized (team_size == 1), ignore the forced reduction
8824 // method and stay with the unsynchronized method (empty_reduce_block)
8825 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8826 team_size != 1) {
8827
8828 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8829
8830 int atomic_available, tree_available;
8831
8832 switch ((forced_retval = __kmp_force_reduction_method)) {
8833 case critical_reduce_block:
8834 KMP_ASSERT(lck); // lck should be != 0
8835 break;
8836
8837 case atomic_reduce_block:
8838 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8839 if (!atomic_available) {
8840 KMP_WARNING(RedMethodNotSupported, "atomic");
8841 forced_retval = critical_reduce_block;
8842 }
8843 break;
8844
8845 case tree_reduce_block:
8846 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8847 if (!tree_available) {
8848 KMP_WARNING(RedMethodNotSupported, "tree");
8849 forced_retval = critical_reduce_block;
8850 } else {
8851#if KMP_FAST_REDUCTION_BARRIER
8852 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8853#endif
8854 }
8855 break;
8856
8857 default:
8858 KMP_ASSERT(0); // "unsupported method specified"
8859 }
8860
8861 retval = forced_retval;
8862 }
8863
8864 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8865
8866#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8867#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8868
8869 return (retval);
8870}
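/* Illustrative outcome on x86_64 Linux, following the logic above: a
   serialized team (team_size == 1) gets empty_reduce_block; a team larger
   than the cutoff (4, or 8 on MIC) with a compiler-generated tree reduction
   gets TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; otherwise an atomic
   reduction is used when available, with the critical-section method as the
   fallback. KMP_FORCE_REDUCTION may override the choice, reverting to the
   critical method if the forced method was not generated by the compiler. */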
8871// this function is for testing set/get/determine reduce method
8872kmp_int32 __kmp_get_reduce_method(void) {
8873 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8874}
8875
8876// Soft pause sets up threads to ignore blocktime and just go to sleep.
8877// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8878void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8879
8880// Hard pause shuts down the runtime completely. Resume happens naturally when
8881// OpenMP is used subsequently.
8882void __kmp_hard_pause() {
8883 __kmp_pause_status = kmp_hard_paused;
8884 __kmp_internal_end_thread(-1);
8885}
8886
8887// Soft resume sets __kmp_pause_status, and wakes up all threads.
8888void __kmp_resume_if_soft_paused() {
8889 if (__kmp_pause_status == kmp_soft_paused) {
8890 __kmp_pause_status = kmp_not_paused;
8891
8892 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8893 kmp_info_t *thread = __kmp_threads[gtid];
8894 if (thread) { // Wake it if sleeping
8895 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8896 thread);
8897 if (fl.is_sleeping())
8898 fl.resume(gtid);
8899 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8900 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8901 } else { // thread holds the lock and may sleep soon
8902 do { // until either the thread sleeps, or we can get the lock
8903 if (fl.is_sleeping()) {
8904 fl.resume(gtid);
8905 break;
8906 } else if (__kmp_try_suspend_mx(thread)) {
8907 __kmp_unlock_suspend_mx(thread);
8908 break;
8909 }
8910 } while (1);
8911 }
8912 }
8913 }
8914 }
8915}
8916
8917// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8918// TODO: add warning messages
8919int __kmp_pause_resource(kmp_pause_status_t level) {
8920 if (level == kmp_not_paused) { // requesting resume
8921 if (__kmp_pause_status == kmp_not_paused) {
8922 // error message about runtime not being paused, so can't resume
8923 return 1;
8924 } else {
8925 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8926 __kmp_pause_status == kmp_hard_paused);
8927 __kmp_pause_status = kmp_not_paused;
8928 return 0;
8929 }
8930 } else if (level == kmp_soft_paused) { // requesting soft pause
8931 if (__kmp_pause_status != kmp_not_paused) {
8932 // error message about already being paused
8933 return 1;
8934 } else {
8935 __kmp_soft_pause();
8936 return 0;
8937 }
8938 } else if (level == kmp_hard_paused) { // requesting hard pause
8939 if (__kmp_pause_status != kmp_not_paused) {
8940 // error message about already being paused
8941 return 1;
8942 } else {
8943 __kmp_hard_pause();
8944 return 0;
8945 }
8946 } else {
8947 // error message about invalid level
8948 return 1;
8949 }
8950}
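/* Hedged usage sketch (user code, not part of this file): this entry point
   backs the OpenMP 5.0 pause API, reached through __kmpc_pause_resource:
     #include <omp.h>
     if (omp_pause_resource_all(omp_pause_soft) == 0) {
       // runtime threads released their resources; the next parallel
       // region resumes the runtime automatically
     }
   A hard pause additionally shuts the runtime down via __kmp_hard_pause(). */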
8951
8952void __kmp_omp_display_env(int verbose) {
8953 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8954 if (__kmp_init_serial == 0)
8955 __kmp_do_serial_initialize();
8956 __kmp_display_env_impl(!verbose, verbose);
8957 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8958}
8959
8960// The team size is changing, so distributed barrier must be modified
8961void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8962 int new_nthreads) {
8963 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8964 bp_dist_bar);
8965 kmp_info_t **other_threads = team->t.t_threads;
8966
8967 // We want all the workers to stop waiting on the barrier while we adjust the
8968 // size of the team.
8969 for (int f = 1; f < old_nthreads; ++f) {
8970 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8971 // Ignore threads that are already inactive or not present in the team
8972 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8973 // teams construct causes thread_limit to get passed in, and some of
8974 // those could be inactive; just ignore them
8975 continue;
8976 }
8977 // If thread is transitioning still to in_use state, wait for it
8978 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8979 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8980 KMP_CPU_PAUSE();
8981 }
8982 // The thread should be in_use now
8983 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8984 // Transition to unused state
8985 team->t.t_threads[f]->th.th_used_in_team.store(2);
8986 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8987 }
8988 // Release all the workers
8989 team->t.b->go_release();
8990
8991 KMP_MFENCE();
8992
8993 // Workers should see transition status 2 and move to 0; but may need to be
8994 // woken up first
8995 int count = old_nthreads - 1;
8996 while (count > 0) {
8997 count = old_nthreads - 1;
8998 for (int f = 1; f < old_nthreads; ++f) {
8999 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9000 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9001 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9002 void *, other_threads[f]->th.th_sleep_loc);
9003 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9004 }
9005 } else {
9006 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9007 count--;
9008 }
9009 }
9010 }
9011 // Now update the barrier size
9012 team->t.b->update_num_threads(new_nthreads);
9013 team->t.b->go_reset();
9014}
9015
9016void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9017 // Add the threads back to the team
9018 KMP_DEBUG_ASSERT(team);
9019 // Threads were paused and pointed at th_used_in_team temporarily during a
9020 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9021 // the thread that it should transition itself back into the team. Then, if
9022 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9023 // to wake it up.
9024 for (int f = 1; f < new_nthreads; ++f) {
9025 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9026 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9027 3);
9028 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9029 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9030 (kmp_flag_32<false, false> *)NULL);
9031 }
9032 }
9033 // The threads should be transitioning to the team; when they are done, they
9034  // should have set th_used_in_team to 1. This loop forces the primary thread
9035  // to wait until all threads have moved into the team and are waiting in the barrier.
9036 int count = new_nthreads - 1;
9037 while (count > 0) {
9038 count = new_nthreads - 1;
9039 for (int f = 1; f < new_nthreads; ++f) {
9040 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9041 count--;
9042 }
9043 }
9044 }
9045}
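/* Summary of the th_used_in_team states driven by the two routines above:
     0 - thread is not part of the team
     1 - thread is in the team and waiting in the distributed barrier
     2 - set by the primary thread during a resize; the worker observes it
         and transitions itself to 0
     3 - set by the primary thread when re-adding workers; the worker
         transitions itself back to 1 once it has rejoined the team */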
9046
9047// Globals and functions for hidden helper task
9048kmp_info_t **__kmp_hidden_helper_threads;
9049kmp_info_t *__kmp_hidden_helper_main_thread;
9050std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9051#if KMP_OS_LINUX
9052kmp_int32 __kmp_hidden_helper_threads_num = 8;
9053kmp_int32 __kmp_enable_hidden_helper = TRUE;
9054#else
9055kmp_int32 __kmp_hidden_helper_threads_num = 0;
9056kmp_int32 __kmp_enable_hidden_helper = FALSE;
9057#endif
9058
9059namespace {
9060std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9061
9062void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9063  // This is an explicit synchronization on all hidden helper threads, in case
9064  // a regular thread pushes a hidden helper task to a hidden helper thread
9065  // that has not been awakened even once since being released by the main
9066  // thread after creating the team.
9067 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9068 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9069 __kmp_hidden_helper_threads_num)
9070 ;
9071
9072 // If main thread, then wait for signal
9073 if (__kmpc_master(nullptr, *gtid)) {
9074 // First, unset the initial state and release the initial thread
9075 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9076 __kmp_hidden_helper_initz_release();
9077 __kmp_hidden_helper_main_thread_wait();
9078 // Now wake up all worker threads
9079 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9080 __kmp_hidden_helper_worker_thread_signal();
9081 }
9082 }
9083}
9084} // namespace
9085
9086void __kmp_hidden_helper_threads_initz_routine() {
9087 // Create a new root for hidden helper team/threads
9088 const int gtid = __kmp_register_root(TRUE);
9089 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9090 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9091 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9092 __kmp_hidden_helper_threads_num;
9093
9094 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9095
9096 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9097
9098 // Set the initialization flag to FALSE
9099 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9100
9101 __kmp_hidden_helper_threads_deinitz_release();
9102}
9103
9104/* Nesting Mode:
9105 Set via KMP_NESTING_MODE, which takes an integer.
9106 Note: we skip duplicate topology levels, and skip levels with only
9107 one entity.
9108 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9109 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9110 in the topology, and initializes the number of threads at each of those
9111 levels to the number of entities at each level, respectively, below the
9112 entity at the parent level.
9113 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9114 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9115 the user to turn nesting on explicitly. This is an even more experimental
9116 option to this experimental feature, and may change or go away in the
9117 future.
9118*/
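/* Illustrative example (hypothetical machine): on 2 sockets x 8 cores x
   2 hardware threads, KMP_NESTING_MODE=1 would yield three nesting levels
   with 2, 8, and 2 threads respectively; on a single-socket part the
   one-entity socket level is skipped, leaving only the core and hardware
   thread levels (8 and 2). */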
9119
9120// Allocate space to store nesting levels
9121void __kmp_init_nesting_mode() {
9122 int levels = KMP_HW_LAST;
9123 __kmp_nesting_mode_nlevels = levels;
9124 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9125 for (int i = 0; i < levels; ++i)
9126 __kmp_nesting_nth_level[i] = 0;
9127 if (__kmp_nested_nth.size < levels) {
9128 __kmp_nested_nth.nth =
9129 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9130 __kmp_nested_nth.size = levels;
9131 }
9132}
9133
9134// Set # threads for top levels of nesting; must be called after the topology is set
9135void __kmp_set_nesting_mode_threads() {
9136 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9137
9138 if (__kmp_nesting_mode == 1)
9139 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9140 else if (__kmp_nesting_mode > 1)
9141 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9142
9143 if (__kmp_topology) { // use topology info
9144 int loc, hw_level;
9145 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9146 loc < __kmp_nesting_mode_nlevels;
9147 loc++, hw_level++) {
9148 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9149 if (__kmp_nesting_nth_level[loc] == 1)
9150 loc--;
9151 }
9152 // Make sure all cores are used
9153 if (__kmp_nesting_mode > 1 && loc > 1) {
9154 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9155 int num_cores = __kmp_topology->get_count(core_level);
9156 int upper_levels = 1;
9157 for (int level = 0; level < loc - 1; ++level)
9158 upper_levels *= __kmp_nesting_nth_level[level];
9159 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9160 __kmp_nesting_nth_level[loc - 1] =
9161 num_cores / __kmp_nesting_nth_level[loc - 2];
9162 }
9163 __kmp_nesting_mode_nlevels = loc;
9164 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9165 } else { // no topology info available; provide a reasonable guesstimation
9166 if (__kmp_avail_proc >= 4) {
9167 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9168 __kmp_nesting_nth_level[1] = 2;
9169 __kmp_nesting_mode_nlevels = 2;
9170 } else {
9171 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9172 __kmp_nesting_mode_nlevels = 1;
9173 }
9174 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9175 }
9176 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9177 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9178 }
9179 set__nproc(thread, __kmp_nesting_nth_level[0]);
9180 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9181 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9182 if (get__max_active_levels(thread) > 1) {
9183 // if max levels was set, set nesting mode levels to same
9184 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9185 }
9186 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9187 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9188}